# Sentiment Analysis

In [5]:
import keras
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
#from keras.utils.np_utils import to_categorical
import re

In [None]:
!pip install keras

In [8]:
data = pd.read_excel('new_data.xlsx')
data = data[['F','A']]
#data = data[data.sentiment != "Neutral"]
print('Number of positive reviews: {}'.format((data['A'] == 4).sum()))
print('Number of negative reviews: {}'.format((data['A'] == 0).sum()))

Number of positive reviews: 1000
Number of negative reviews: 1004


### Data cleaning

In [14]:
labels_dict = {0: 'Negative', 4: 'Positive'}
def convert_labels(label):
    return labels_dict[label]

data.A = data.A.apply(lambda x: convert_labels(x))
data.head()

Unnamed: 0,F,A
0,@MsilltempeRED ...thanks 4 the b-day wishes sw...,Negative
1,@kikialakiki me too! and got socks on,Negative
2,must stop watching the machinist as i fall asl...,Negative
3,Proper want to be on LiveLounge one day,Negative
4,FREE Fourstar jacket courtesy of Spike Jonze! ...,Negative


In [27]:
## arguments
padding_type = 'pre'
truncating_type = 'post'
padding_value = 0

data['F'] = data['F'].apply(lambda x: x.lower())
data['F'] = data['F'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))
for idx,row in data.iterrows():
    row[0] = row[0].replace('rt',' ')
max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data['F'].values)
X = tokenizer.texts_to_sequences(data['F'].values)

X = pad_sequences(X, padding=padding_type, truncating=truncating_type, value=padding_value)
token_maxlen = len(X[0])
print(X[1])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0 1830   15   33    6
   46 1831   14]


In [28]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length= X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout = 0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 31, 128)           256000    
                                                                 
 spatial_dropout1d_1 (Spati  (None, 31, 128)           0         
 alDropout1D)                                                    
                                                                 
 lstm_1 (LSTM)               (None, 196)               254800    
                                                                 
 dense_1 (Dense)             (None, 2)                 394       
                                                                 
Total params: 511194 (1.95 MB)
Trainable params: 511194 (1.95 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [29]:
Y = pd.get_dummies(data['A']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.20, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(1603, 31) (1603, 2)
(401, 31) (401, 2)


In [31]:
print(X_train[100])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0  240   30   36 1667  206  164   36   12  103   81  453   13  593
  450  220   81]


In [30]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 7, batch_size=batch_size, verbose = 1, validation_data = (X_test, Y_test))

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.src.callbacks.History at 0x7d6c98c3a110>

In [33]:
## save model
import pickle

model.save('model_sentiment_v1.h5')
config = {'tokenizer': {'tokenizer': tokenizer, 'token_maxlen': token_maxlen, 'padding_type': padding_type,
                       'padding_value': padding_value, 'truncating_type': truncating_type}}
pickle.dump(config, file = open('config.pkl', 'wb'))

In [34]:
import pickle
import numpy as np
import keras
from keras.preprocessing.sequence import pad_sequences

model = keras.models.load_model('model_sentiment_v1.h5')
config = pickle.load(open('config.pkl', 'rb'))

In [39]:
def predict_sentiment(text, model, config):
    text = [text] #if type(text) == np.str else text
    tokenizer = config['tokenizer']['tokenizer']
    text = tokenizer.texts_to_sequences(text)
    text = pad_sequences(text, maxlen     = config['tokenizer']['token_maxlen'],
                               padding    = config['tokenizer']['padding_type'],
                               truncating = config['tokenizer']['truncating_type'],
                               value      = config['tokenizer']['padding_value'])
    sentiment = model.predict(text,batch_size=1,verbose = 0)[0]
    argmax_sent = np.argmax(sentiment)
    sentiment_text  = 'Positive' if  argmax_sent == 1 else 'Negative'
    sentiment_score = sentiment[argmax_sent]
    return((sentiment_text, sentiment_score))

In [40]:
text = "I can't love someone more than you"
predict_sentiment(text, model, config)

('Positive', 0.96829)