
Sentiment Analysis is the process of computationally identifying and categorizing opinions expressed in a piece of text, especially in order to determine whether the writer's attitude towards a particular topic, product, etc. is positive, negative, or neutral.


Using LSTM to classify Assamese text into positive, negative or neutral.


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
#
# For example, here's several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix,classification_report
import re

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping



Only keeping the necessary columns.

In [None]:
# Mount Google Drive inside the Colab runtime so files stored on Drive can be
# opened through the local path below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Read the sentiment CSV from Drive and keep only the two columns used by the
# rest of the notebook: the sentence text and its sentiment label.
DATASET_CSV = '/content/drive/MyDrive/Colab Notebooks/06_assamese_sentiment_data.csv'
REQUIRED_COLUMNS = ['text', 'sentiment']
data = pd.read_csv(DATASET_CSV)[REQUIRED_COLUMNS]

In [None]:
# Tally how many rows carry each sentiment label in the loaded data.
sentiment_counts = data.sentiment.value_counts()

print("Exact counts of each sentiment in the main CSV file:", sentiment_counts, sep="\n")

Data preview

In [None]:
# Show the first five rows as a quick sanity check of the loaded columns.
data.head(n=5)

Here we define the maximum number of features as 5000 and use Tokenizer to vectorize the text and convert it into sequences, so the network can take it as input.

In [None]:
# Clean the text column: keep only characters from the Unicode range
# U+0980-U+09FF (the script block Assamese is written in) plus whitespace.
# Everything else -- Latin letters, ASCII digits, punctuation, emoji -- is removed.
#
# The pattern is a raw string so that `\s` reaches the regex engine unchanged
# instead of being an invalid Python string escape (a SyntaxWarning on recent
# Python versions). The `re` module resolves the \uXXXX escapes itself.
non_assamese_chars = re.compile(r'[^\u0980-\u09ff\s]')

# astype(str) guards against non-string cells (e.g. missing values) before
# the substitution is applied row by row.
data['text'] = data['text'].astype(str).apply(lambda text: non_assamese_chars.sub('', text))

data.head()

In [None]:
# Print the number of rows in each sentiment class, in the order
# Positive, Negative, Neutral.
#
# DataFrame.size counts cells (rows x columns), which doubles the figure for
# this two-column frame, so count the matching rows instead.
for sentiment_label in ('Positive', 'Negative', 'Neutral'):
    print((data['sentiment'] == sentiment_label).sum())

data.head()

**1. Up-sample Minority Class**

Up-sampling is the process of randomly duplicating observations from the minority class in order to reinforce its signal.
There are several heuristics for doing so, but the most common way is to simply resample with replacement.

In [None]:
# Per-class 80/20 train/test split: each sentiment class is split separately,
# so both splits keep the class proportions of the full data set.
split_fraction = 0.8
split_seed = 200

# One frame per sentiment class. The variable names assume Neutral is the
# largest class and Positive the smallest -- confirm against the value counts.
data_majority = data[data['sentiment'] == "Neutral"]
data_mid = data[data['sentiment'] == "Negative"]
data_minority = data[data['sentiment'] == "Positive"]

# Positive-to-Neutral size ratio; only referenced by the disabled
# class-weight snippet in the training cell.
bias = data_minority.shape[0] / data_majority.shape[0]

class_frames = (data_majority, data_mid, data_minority)

# Draw each class's training sample once and derive the test rows as the
# complement of that same sample, instead of re-sampling with the same seed.
train_parts = [
    frame.sample(frac=split_fraction, random_state=split_seed)
    for frame in class_frames
]
test_parts = [
    frame.drop(sampled.index)
    for frame, sampled in zip(class_frames, train_parts)
]

# Seed the shuffles as well so the row order is reproducible across runs.
train = shuffle(pd.concat(train_parts), random_state=split_seed)
test = shuffle(pd.concat(test_parts), random_state=split_seed)

In [None]:
# Report how many rows of each sentiment class ended up in the training and
# test splits.
split_report = [
    ('positive data in training:', train, 'Positive'),
    ('negative data in training:', train, 'Negative'),
    ('neutral data in training:', train, 'Neutral'),
    ('negative data in test:', test, 'Negative'),
    ('positive data in test:', test, 'Positive'),
    ('neutral data in test:', test, 'Neutral'),
]
for caption, frame, wanted_label in split_report:
    print(caption, (frame.sentiment == wanted_label).sum())


In [None]:
# Re-balance the training split only: the Negative and Positive rows are
# resampled with replacement to the Neutral row count. The test split is left
# untouched.
data_majority = train.loc[train['sentiment'].eq('Neutral')]
data_mid = train.loc[train['sentiment'].eq('Negative')]
data_minority = train.loc[train['sentiment'].eq('Positive')]

shape_report = (
    ("majority class before upsample:", data_majority),
    ("mid class before upsample:", data_mid),
    ("minority class before upsample:", data_minority),
)
for caption, frame in shape_report:
    print(caption, frame.shape)

# Shared resampling settings: draw with replacement, match the Neutral row
# count, and fix the seed so the draw is reproducible.
target_rows = data_majority.shape[0]
resample_options = dict(replace=True, n_samples=target_rows, random_state=123)

data_minority_upsampled = resample(data_minority, **resample_options)
data_mid_upsampled = resample(data_mid, **resample_options)

# Neutral rows as-is, followed by the resampled Negative and Positive rows.
balanced_parts = [data_majority, data_mid_upsampled, data_minority_upsampled]
data_upsampled = pd.concat(balanced_parts)

# Class counts after re-balancing.
print("After upsampling\n", data_upsampled.sentiment.value_counts(), sep="")

# Turn the text into fixed-length integer sequences and the labels into
# one-hot rows.

# Vocabulary cap passed to the Tokenizer; the model cell also reads it as the
# Embedding input dimension.
max_fatures = 5000
# Length every sequence is padded or truncated to.
max_len = 29

# Explicit class order for the one-hot columns. Encoding through a fixed
# categorical dtype guarantees that Y_train and Y_test always have the same
# three columns in the same order (index 0 = Negative, 1 = Neutral,
# 2 = Positive), even if one split happens to lack a class. This is the same
# alphabetical order pd.get_dummies produces when all three labels are present.
sentiment_classes = ['Negative', 'Neutral', 'Positive']
label_dtype = pd.CategoricalDtype(categories=sentiment_classes)

tokenizer = Tokenizer(num_words=max_fatures, split=' ')
# Fit the vocabulary on the training text only, so nothing from the test
# split leaks into the tokenizer.
tokenizer.fit_on_texts(data_upsampled['text'].values)

X_train = tokenizer.texts_to_sequences(data_upsampled['text'].values)
X_train = pad_sequences(X_train, maxlen=max_len)
Y_train = pd.get_dummies(data_upsampled['sentiment'].astype(label_dtype)).values
print('x_train shape:',X_train.shape)

X_test = tokenizer.texts_to_sequences(test['text'].values)
X_test = pad_sequences(X_test, maxlen=max_len)
Y_test = pd.get_dummies(test['sentiment'].astype(label_dtype)).values
print("x_test shape", X_test.shape)

In [None]:
# Build and compile the classifier: Embedding -> SpatialDropout1D ->
# Bidirectional LSTM -> 3-way softmax.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Input, LSTM, SpatialDropout1D, Bidirectional

embed_dim = 128   # size of each token embedding vector
lstm_out = 192    # units per LSTM direction

model = Sequential()

# Declare the input length with an explicit Input object, which is the
# recommended form for Sequential models, instead of passing input_shape to
# the first layer.
model.add(Input(shape=(X_train.shape[1],)))
model.add(Embedding(max_fatures, embed_dim))

model.add(SpatialDropout1D(0.4))
model.add(Bidirectional(LSTM(lstm_out, dropout=0.4, recurrent_dropout=0.0)))
# One output unit per sentiment class.
model.add(Dense(3, activation='softmax'))

# categorical_crossentropy matches the one-hot label rows in Y_train.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would add a stray "None" line).
model.summary()

Here we train the network. We should run many more than 5 epochs, but I would have to wait forever on Kaggle, so it is 5 for now.

In [None]:
# First training pass: 5 epochs over the re-balanced training data.
#
# Class weighting is currently disabled; the earlier experiment was:
#   class_weights = {0: 1.6/bias, 1: 1, 2: 2}
batch_size = 128

model.fit(
    x=X_train,
    y=Y_train,
    batch_size=batch_size,
    epochs=5,
    verbose=1,
)


In [None]:
# Evaluate on the test split: take the most probable class index for each
# prediction, recover the true class index from the one-hot rows, then print
# the confusion matrix and the per-class report.
class_probabilities = model.predict(X_test)
Y_pred = class_probabilities.argmax(axis=-1)

true_class_ids = Y_test.argmax(axis=1)
df_test = pd.DataFrame({'true': true_class_ids, 'pred': Y_pred})

print("confusion matrix", confusion_matrix(df_test['true'], df_test['pred']))
print(classification_report(df_test['true'], df_test['pred']))

So the class imbalance is reduced significantly, and recall values for some classes improved. It is not always possible to remove it completely.

You may also have noticed some inconsistency in the recall values, sometimes increasing, sometimes decreasing. This can be improved by training the model for more epochs and tuning the hyperparameters.

In [None]:
# Continue training the same model for 30 further epochs, then repeat the
# test-set evaluation.
model.fit(X_train, Y_train, batch_size=batch_size, epochs=30, verbose=1)

Y_pred = np.argmax(model.predict(X_test), axis=-1)

# Collapse each one-hot label row to its class index.
df_test = pd.DataFrame({
    'true': np.argmax(Y_test, axis=1),
    'pred': Y_pred,
})

print("confusion matrix", confusion_matrix(df_test['true'], df_test['pred']))
print(classification_report(df_test['true'], df_test['pred']))

In [None]:
# Classify a single example sentence with the trained model.

# Output index -> label. The one-hot targets were built with pd.get_dummies,
# which orders its columns alphabetically (Negative, Neutral, Positive), so
# index 1 is neutral and index 2 is positive.
index_to_label = ('negative', 'neutral', 'positive')

twt = ['মাংসখিনি বৰ সুস্বাদু হৈছিল']
# Apply the same character filter that was applied to the training text, so
# the tokenizer sees input in the form it was fitted on.
twt = [re.sub(r'[^\u0980-\u09ff\s]', '', text) for text in twt]
# Vectorize the sentence with the tokenizer fitted on the training data.
twt = tokenizer.texts_to_sequences(twt)
print(twt)
# Pad to the same sequence length the model was trained with.
twt = pad_sequences(twt, maxlen=29, dtype='int32', value=0)
print(twt)
# predict() returns one probability row per input; take the only row.
sentiment = model.predict(twt,batch_size=1,verbose = 2)[0]
print(index_to_label[int(np.argmax(sentiment))])

In [None]:
# Write the trained model to a .keras file in the current working directory
# and print a confirmation.
MODEL_FILENAME = 'assamese_sentiment_model.keras'
model.save(MODEL_FILENAME)
print(f"Model has been saved successfully as '{MODEL_FILENAME}'")