In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SimpleRNN, SpatialDropout1D

from sklearn.model_selection import train_test_split
from sklearn import metrics

In [None]:
# Load the labelled review dataset from a CSV addressed by a relative path
# (so it must sit in the notebook's working directory), then preview the first rows.
data = pd.read_csv("sentiment22-final.csv")
data.head()

In [None]:
# Class balance of the loaded data: number of rows per value of 'Sentiments'.
classes = data['Sentiments']
print(classes.value_counts())

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Keep only the two columns used downstream: the review text and its sentiment label.
data = data[['User_Review', 'Sentiments']]
data.head()

In [None]:
def clean_train_data(x):
    """Normalise one raw review for tokenisation.

    Steps, in order: lower-case, drop ``[...]`` bracketed spans, drop
    punctuation (anything that is neither a word character nor whitespace),
    drop every word that contains a digit, and turn newlines into spaces.

    Parameters
    ----------
    x : str or missing value
        The raw review. ``None`` and float NaN (what pandas uses for a missing
        cell) are treated as an empty review; any other non-string value is
        converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text. Runs of spaces left behind by removed tokens are
        not collapsed.
    """
    if not isinstance(x, str):
        # A missing cell would otherwise crash on .lower(); NaN is the only
        # float that is not equal to itself.
        if x is None or (isinstance(x, float) and x != x):
            return ''
        x = str(x)

    text = x.lower()
    # Raw strings throughout: '\[' / '\w' / '\d' in a normal string literal are
    # invalid escape sequences and raise warnings on recent Python versions.
    text = re.sub(r'\[.*?\]', '', text)   # remove square-bracketed spans
    text = re.sub(r'[^\w\s]', '', text)   # remove punctuation
    text = re.sub(r'\w*\d\w*', '', text)  # remove words containing numbers
    # Replace (not delete) newlines so words on adjacent lines are not fused.
    text = re.sub(r'\n', ' ', text)
    return text

In [None]:

# Run every review through clean_train_data and overwrite the text column with the result.
data['User_Review'] = data['User_Review'].apply(clean_train_data)
data.head()

In [None]:
# Snapshot of the cleaned frame with every sentiment class still present; taken before
# the two-class section filters `data`, and reused as the start of the multiclass section.
all_cat_data = data.copy()

# Two-Class Sentiment Analysis (Positive vs. Negative)

In [None]:
# Two-class setup: discard every row labelled 'Neutral'.
is_not_neutral = data['Sentiments'] != 'Neutral'
data = data[is_not_neutral]
data.head()

In [None]:
# Rows remaining per class after the filter; the 'Neutral' count verifies the filter worked.
for sentiment_name in ('Positive', 'Negative', 'Neutral'):
    print(len(data[data['Sentiments'] == sentiment_name]))

In [None]:
# Snapshot of the two-class frame.
# NOTE(review): model1_data is not referenced again in the visible notebook.
model1_data = data.copy()

In [None]:
# Vocabulary cap for the tokenizer (and the Embedding layer's input size further down).
max_features = 2000
token = Tokenizer(num_words=max_features, split=' ')

# Fit the vocabulary on the two-class reviews, then encode those same reviews as
# integer sequences and pad them into one 2-D array (pad_sequences with its default length).
review_texts = data['User_Review'].values
token.fit_on_texts(review_texts)
X = pad_sequences(token.texts_to_sequences(review_texts))

In [None]:
# (number of reviews, padded sequence length); the second value is the model's input length.
X.shape

In [None]:
# Layer widths for the two-class network.
embed_dim = 128
lstm_out = 196

# Embedding -> spatial dropout -> single LSTM -> 2-way softmax over the one-hot classes.
model = Sequential([
    Embedding(max_features, embed_dim, input_length=X.shape[1]),
    SpatialDropout1D(0.4),
    LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2),
    Dense(2, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

model.summary()

In [None]:
# One-hot encode the sentiment labels, one column per distinct label.
# dtype is set explicitly: recent pandas versions return boolean dummy columns by
# default, and a numeric target matrix is what the softmax/cross-entropy model expects.
# NOTE(review): the prediction cell further down assumes column 0 is Negative and
# column 1 is Positive — confirm against pd.get_dummies(data['Sentiments']).columns.
Y = pd.get_dummies(data['Sentiments'], dtype='float32').values

In [None]:
# Hold out 20% of the rows for testing; random_state pins the partition.
# NOTE(review): no stratify argument is passed, so class proportions may differ between the two parts.
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.20, random_state=42)

In [None]:
# Train the two-class model for a fixed 10 epochs (no validation data is passed here).
batch_size = 32
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=10,
    verbose=2,
)

In [None]:
# Evaluate on the held-out split. With the compile() settings used for `model`,
# evaluate() yields two numbers: the loss (printed as 'score') and the accuracy.
score, acc = model.evaluate(x=X_test, y=y_test, batch_size=batch_size, verbose=2)
for metric_label, metric_value in (('score', score), ('accuracy', acc)):
    print(metric_label, metric_value)

In [None]:
# Score one hand-written review with the two-class model.
text = ['i would recommend it if you have no other options']
# Apply the same normalisation the training reviews went through.
text = [clean_train_data(t) for t in text]
text = token.texts_to_sequences(text)
# Pad to the sequence length the model was built with (X.shape[1], the same value
# passed as input_length to the Embedding layer) instead of a hard-coded 28, which
# only works if the longest training review happens to have 28 tokens.
# This relies on X still being the two-class matrix, i.e. on top-to-bottom execution.
text = pad_sequences(text, maxlen=X.shape[1], dtype='int32', value=0)
res = model.predict(text, batch_size=1, verbose=2)
res

In [None]:
# Translate the highest-probability output index into a readable label.
# Index 0 is reported as negative and index 1 as positive; presumably this mirrors the
# column order of the one-hot targets — verify against the get_dummies columns.
class_messages = ("Negative Comment", "Positive Comment")
predicted_class = int(np.argmax(res[0]))
if predicted_class < len(class_messages):
    print(class_messages[predicted_class])

# Multiclass Sentiment Analysis

### Data preprocessing

In [None]:
# Start the multiclass section from the cleaned snapshot that still contains all sentiment classes.
ms_data = all_cat_data.copy()

In [None]:
# Preview the first rows of the multiclass working frame.
ms_data.head()

In [None]:
# Build a class-balanced working set: at most `num_of_rows` reviews per sentiment.
num_of_rows = 4000

# Seeded shuffle so the same rows are drawn on every Restart & Run All. The previous
# unseeded np.random.permutation picked a different sample (and therefore produced
# different model results) on each run. 42 matches the random_state of the splits.
shuffled = ms_data.sample(frac=1, random_state=42)
nt = shuffled[shuffled['Sentiments'] == 'Neutral'][:num_of_rows]
ng = shuffled[shuffled['Sentiments'] == 'Negative'][:num_of_rows]
ps = shuffled[shuffled['Sentiments'] == 'Positive'][:num_of_rows]

# Stack the three per-class samples, then shuffle again so the classes are
# interleaved rather than laid out in three contiguous runs.
combine_data = pd.concat([nt, ng, ps], ignore_index=True)
combine_data = combine_data.sample(frac=1, random_state=42)

# Placeholder integer label column; the real per-class ids are assigned in a later cell.
combine_data['label'] = 0
combine_data.head()

In [None]:
# Class counts of the balanced multiclass sample. This previously read `data`, the
# two-class frame from the earlier section, which no longer contains 'Neutral' rows
# and says nothing about the sample being built here.
classes = combine_data['Sentiments']
print(classes.value_counts())

In [None]:
# Rows per class in the balanced sample, printed in Neutral / Negative / Positive order.
for sentiment_name in ('Neutral', 'Negative', 'Positive'):
    print(len(combine_data[combine_data['Sentiments'] == sentiment_name]))

In [None]:
# Rebind ms_data to a copy of the balanced, shuffled sample; the rest of the multiclass section works on it.
ms_data = combine_data.copy()

In [None]:
# Same per-class row counts, now taken from ms_data to confirm the copy.
for sentiment_name in ('Neutral', 'Negative', 'Positive'):
    print(len(ms_data[ms_data['Sentiments'] == sentiment_name]))

In [None]:
# Integer id per sentiment class, written into the 'label' column row by row-mask.
# Rows with any other 'Sentiments' value keep whatever 'label' they already had.
label_ids = {'Neutral': 0, 'Negative': 1, 'Positive': 2}
for sentiment_name, label_id in label_ids.items():
    ms_data.loc[ms_data['Sentiments'] == sentiment_name, 'label'] = label_id

In [None]:
# Spot-check that 'label' agrees with 'Sentiments' on the first ten rows.
ms_data.head(10)

In [None]:
from keras.utils import to_categorical

In [None]:
# One-hot encode the integer ids from 'label' into rows of width 3 (one column per class id 0/1/2).
labels = to_categorical(ms_data['label'], num_classes=3)

In [None]:
# (number of reviews, 3) is the expected shape here.
labels.shape

In [None]:
# First ten one-hot rows, for comparison with the head(10) preview of ms_data.
labels[:10]

In [None]:
# Multiclass text encoding. This rebinds max_features and X, which the two-class
# section also used, so two-class cells re-run after this point see the new values.
max_features = 3000
max_len = 130

ms_token = Tokenizer(num_words=max_features)
ms_review_texts = ms_data['User_Review'].values
ms_token.fit_on_texts(ms_review_texts)

# Encode every review as an integer sequence, then pad/truncate all of them to max_len.
ms_sequences = ms_token.texts_to_sequences(ms_review_texts)
X = pad_sequences(ms_sequences, maxlen=max_len)

In [None]:
# Report how many distinct tokens the multiclass tokenizer recorded while fitting.
# NOTE(review): per the Keras Tokenizer docs word_index is not truncated to num_words,
# so this count may exceed max_features — confirm.
word_index = ms_token.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
# Alias the one-hot label matrix as the training target.
y = labels

In [None]:
# 80/20 train/test split with a pinned random_state. This rebinds X_train, X_test,
# y_train and y_test, replacing the two-class split created earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.20, random_state=42)

In [None]:
# Embedding width for the multiclass model.
embed_dim = 128
# NOTE(review): the model cell that follows hard-codes LSTM(64, ...) and never reads
# lstm_out, so this value currently has no effect — confirm which width is intended.
lstm_out = 96

In [None]:
# Multiclass network: Embedding -> spatial dropout -> single LSTM -> 3-way softmax.
# The LSTM width is the literal 64 (the lstm_out variable is not used here), and all
# three dropout rates are 0.7.
ms_model = Sequential([
    Embedding(max_features, embed_dim, input_length=X.shape[1]),
    SpatialDropout1D(0.7),
    LSTM(64, dropout=0.7, recurrent_dropout=0.7),
    Dense(3, activation='softmax'),
])
ms_model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

ms_model.summary()

In [None]:
from keras.callbacks import EarlyStopping

In [None]:
# Train the multiclass model for up to 20 epochs, holding out 20% of the training
# rows as validation data.
batch_size = 50

# Stop once val_loss has failed to improve by at least min_delta for 7 consecutive epochs.
# restore_best_weights=True rolls the model back to the best-val_loss epoch; without it
# the model evaluated afterwards keeps the weights from up to 7 epochs past that point.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=7,
    min_delta=0.0001,
    restore_best_weights=True,
)
ms_history = ms_model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=batch_size,
    validation_split=0.2,
    callbacks=[early_stopping],
)

In [None]:
from sklearn.metrics import precision_score


In [None]:
# Held-out evaluation of the multiclass model: evaluate() yields the loss followed
# by the single metric ('acc') that ms_model was compiled with.
loss, accuracy = ms_model.evaluate(x=X_test, y=y_test)
for metric_label, metric_value in (("loss", loss), ("accuracy", accuracy)):
    print(metric_label, metric_value)
