# RNN (Sentimental Analysis for whatsapp Chat)
## (Roman Urdu)

### Import all required libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import tensorflow as tf
import re
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


Using TensorFlow backend.


## Read Dataset and Show all Unique Class count

In [2]:
ROOT_PATH = 'C:/Users/Hassan Raza/Desktop/Whatsapp_chat_RNN'
dataset = pd.read_csv(ROOT_PATH + '/Whatsapp_chat.csv', usecols=[0,1])
#dataset = dataset[dataset['Mode'].isin(['Positive', 'Negative'])]
dataset["Mode"].value_counts()

Neutral     8929
Positive    6013
Negative    5287
Name: Mode, dtype: int64

In [5]:
data_pos = dataset[dataset['Mode'] == 'Positive'].iloc[:5000]
data_neg = dataset[dataset['Mode'] == 'Negative'].iloc[:5000]
data_neu = dataset[dataset['Mode'] == 'Neutral'].iloc[:5000]

dataset = pd.concat([data_pos,data_neg,data_neu])

dataset = dataset.reset_index(drop=True)

In [6]:
print(dataset['Mode'].value_counts(normalize=True))
baseline = 0.3

Neutral     0.333333
Negative    0.333333
Positive    0.333333
Name: Mode, dtype: float64


In [7]:
## Cleaning Process
data=[]
stopwords=['ai', 'ayi', 'hy', 'hai', 'main', 'ki', 'tha', 'koi', 'ko', 'sy', 'woh', 'bhi', 'aur', 'wo', 'yeh', 'rha', 'hota', 'ho', 'ga', 'ka', 'le', 'lye', 'kr', 'kar', 'lye', 'liye', 'hotay', 'waisay', 'gya', 'gaya', 'kch', 'ab', 'thy', 'thay', 'houn', 'hain', 'han', 'to', 'is', 'hi', 'jo', 'kya', 'thi', 'se', 'pe', 'phr', 'wala', 'waisay', 'us', 'na', 'ny', 'hun', 'rha', 'raha', 'ja', 'rahay', 'abi', 'uski', 'ne', 'haan', 'nai', 'sent', 'aj', 'you', 'gai', 'rhy', 'kuch', 'jata', 'aye', 'ya', 'dono', 'hoa', 'aese', 'de', 'wohi', 'jati', 'jb', 'krta', 'lg', 'rahi', 'hui', 'karna', 'krna', 'gi', 'hova', 'yehi', 'jana', 'jye', 'chal', 'mil', 'tu', 'hum', 'par', 'hay', 'kis', 'sb', 'gy', 'dain', 'krny', 'tou']
for i in range(dataset.shape[0]):
    message = (re.sub('[^a-zA-Z]',' ',str(dataset.iloc[:,0].values[i]))).lower().split()
    message = [word for word in message if not word in stopwords]
    message = ' '.join(message)
    data.append(message)

In [8]:
# Total number of word is 31466 in my corpus
## The parameter in Keras Tokenizer "number_words" return the ids of the most 5000 frequent words
## By default the tokenizer split on base space " "
tokenizer = Tokenizer(num_words=5000, split=" ")
tokenizer.fit_on_texts(data)
X = tokenizer.texts_to_sequences(data)
print("Total Tokens in Data: ",len(tokenizer.word_index))
X = pad_sequences(X) # padding our text vector so they all have the same length (maximum lenth sub_list in list array)
#print(X[0])

# maxList = max(X, key = lambda i: len(i))
# maxLength = len(maxList)
maxLength = len(X[0])

Total Tokens in Data:  27471


In [9]:
model = Sequential()
model.add(Embedding(5000, 256, input_length=X.shape[1]))
model.add(Dropout(0.3))
model.add(LSTM(256, return_sequences=True, dropout=0.3, recurrent_dropout=0.2))
model.add(LSTM(256, dropout=0.3, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [10]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 188, 256)          1280000   
_________________________________________________________________
dropout_1 (Dropout)          (None, 188, 256)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 188, 256)          525312    
_________________________________________________________________
lstm_2 (LSTM)                (None, 256)               525312    
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 771       
Total params: 2,331,395
Trainable params: 2,331,395
Non-trainable params: 0
_________________________________________________________________


In [11]:
y = pd.get_dummies(dataset['Mode']).values

[print(dataset['Mode'][i], y[i]) for i in range(12000,12001)]

# Positive [0 0 1]
# Neutral [0 1 0]
# Negative [1 0 0]

Neutral [0 1 0]


[None]

In [12]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [14]:
batch_size = 256
epochs = 8
'''
By default verbose = 1,

verbose = 1, which includes both progress bar and one line per epoch

verbose = 0, means silent

verbose = 2, one line per epoch i.e. epoch no./total no. of epochs
'''

history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=1)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [52]:
model.save('sentiment_analysis.h5')

In [53]:
predictions = model.predict(X_test)

[print(dataset['Message'][i], predictions[i], y_test[i]) for i in range(5, 10)]

Wah kya baat likhi [0.64506716 0.34856185 0.00637098] [0 1 0]
Wha Itni sari khubiya [0.60560066 0.37095395 0.0234454 ] [0 1 0]
Itni khubiya [0.13057727 0.8137635  0.05565921] [0 1 0]
Ya allah rehm farma hm sab pe or zalimo ko hidayat de ameen [0.06729884 0.56694263 0.36575857] [1 0 0]
Please Everyone AllAh S.w.T ka naam hAmesha Bary Lawzo main Likha kary Wo he Zaat sUb say Bari Hey [0.31156573 0.01841947 0.6700148 ] [0 0 1]


[None, None, None, None, None]

In [54]:
pos_count, neu_count, neg_count = 0, 0, 0
real_pos, real_neu, real_neg = 0, 0, 0
for i, prediction in enumerate(predictions):
    if np.argmax(prediction)==2:
        pos_count += 1
    elif np.argmax(prediction)==1:
        neu_count += 1
    else:
        neg_count += 1
    
    if np.argmax(y_test[i])==2:
        real_pos += 1
    elif np.argmax(y_test[i])==1:    
        real_neu += 1
    else:
        real_neg +=1

print('Positive predictions:', pos_count)
print('Neutral predictions:', neu_count)
print('Negative predictions:', neg_count)
print('Real positive:', real_pos)
print('Real neutral:', real_neu)
print('Real negative:', real_neg)

Positive predictions: 1120
Neutral predictions: 1835
Negative predictions: 1091
Real positive: 1182
Real neutral: 1832
Real negative: 1032


In [10]:
# new_model = tf.keras.models.load_model('sentiment_analysis.h5')
# predictions = new_model.predict(X_test)

In [15]:
# predicted_categories = tf.argmax(predictions, axis=1)
# predicted_categories[:10]

<tf.Tensor 'strided_slice_2:0' shape=(10,) dtype=int64>

In [16]:
# true_categories = tf.argmax(y_test, axis=1)
# true_categories[:10]

<tf.Tensor 'strided_slice_3:0' shape=(10,) dtype=int64>

In [18]:
# from sklearn.metrics import confusion_matrix
# import seaborn as sns
# from matplotlib import pyplot as plt
# sns.set()

# #print(y_test)

# mat = confusion_matrix(true_categories, predicted_categories)
# print(mat)

# sns.heatmap(mat.T,square=True,annot=True,fmt='d',cbar=False,
#            xticklabels=np.unique(y_test),yticklabels=np.unique(y_test))
# plt.xlabel("True Label")
# plt.ylabel("Predicted Label")

In [None]:
# import matplotlib.pyplot as plt
# plt.plot(history.history['accuracy'])
# plt.plot(history.history['val_accuracy'])

# plt.title('model accuracy')
# plt.ylabel('accuracy')
# plt.xlabel('epoch')
# plt.legend(['train','test'], loc='upper left')
# plt.show()

# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])

# plt.title('model loss')
# plt.ylabel('loss')
# plt.xlabel('epoch')
# plt.legend(['train','test'], loc='upper left')
# plt.show()

In [9]:
#import tensorflow as tf
new_model = load_model('sentiment_analysis.h5')

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [10]:
#/* ~~~~~~~~~~~ Model Predict User Input ~~~~~~~~~~~~ */
new_text = ['Wah je Waah, kya bat han', 'Lanat hy police walo py 😒', 'ye galt bat ha']
seq = tokenizer.texts_to_sequences(new_text)
padded = pad_sequences(seq, maxlen=maxLength)
for case in range(len(seq)):  
  pred = new_model.predict(padded[[case]])
  labels = ['Negative', 'Neutral', 'Positive']
  print(pred, labels[np.argmax(pred)])

[[0.14851262 0.63203645 0.21945095]] Neutral
[[0.99647385 0.0018047  0.00172148]] Negative
[[0.43794864 0.49018383 0.07186751]] Neutral


In [None]:
def startsWithDateAndTime(s):
    pattern = '[0-9]{2}/[0-9]{2}/[0-9]{4}' # for New Group settings
    result = re.match(pattern, s)
    if result:
        return True
    return False

In [None]:
def getDataPoint(line):   
    splitLine = line.split(' - ') 
    dateTime = splitLine[0]
    date, time = dateTime.split(', ') 
    message = ' '.join(splitLine[1:])
    splitMessage = message.split(': ') 
    author = splitMessage[0] 
    message = ' '.join(splitMessage[1:])
    return date, time, author, message

In [None]:
parsedData = [] # List to keep track of data so it can be used by a Pandas dataframe
### Uploading exported chat file
conversationPath = 'chat_with_Shakir.txt' # chat file
with open(conversationPath, encoding="utf-8") as fp:
    ### Skipping first line of the file because contains information related to something about end-to-end encryption
    fp.readline()
    
    messageBuffer = [] 
    date, time, author = None, None, None
    while True:
        line = fp.readline()
        #print(line)
        if not line: 
            break
        line = line.strip() 
        if startsWithDateAndTime(line):
            #print('..........')
            if len(messageBuffer) > 0:
                parsedData.append([date, time, author, ' '.join(messageBuffer)]) 
            messageBuffer.clear() 
            date, time, author, message = getDataPoint(line) 
            messageBuffer.append(message) 
        else:
            messageBuffer.append(line)
    
print(len(parsedData))

SHChat = pd.DataFrame(parsedData, columns=['Date', 'Time', 'Author', 'Message']) # Initialising a pandas Dataframe.
SHChat["Date"] = pd.to_datetime(SHChat["Date"])

In [None]:
# Data Cleaning process 
### Counting number of letters in each message
SHChat['Words'] = SHChat['Message'].apply(lambda s : len(s.split(' ')))
SHChat = SHChat[SHChat['Message'] != '<Media omitted>']
SHChat = SHChat[SHChat['Words'] != 1]


In [None]:
Hassan_Chat = SHChat[SHChat['Author'] == 'HR']
Shakir_Chat = SHChat[SHChat['Author'] == 'Shakir MS_27']

In [None]:
Hassan_msg = Hassan_Chat['Message'].to_list()
Shakir_msg = Shakir_Chat['Message'].to_list()
def pred_arr(msg):
  seq = tokenizer.texts_to_sequences(msg)
  padded = pad_sequences(seq, maxlen=maxLength)

  pos_count, neu_count, neg_count = 0, 0, 0
  for case in range(len(seq)):
    #print(Hassan_message[case])
    pred = model.predict(padded[[case]])
    labels = ['Negative', 'Neutral', 'Positive']
    #print(pred, labels[np.argmax(pred)])
    if np.argmax(pred)==2:
        pos_count += 1
    elif np.argmax(pred)==1:
        neu_count += 1
    else:
        neg_count += 1

  print('Positive predictions:', pos_count)
  print('Neutral predictions:', neu_count)
  print('Negative predictions:', neg_count)

  p = [neg_count, neu_count, pos_count]
  return p

data1 = np.array(pred_arr(Hassan_msg))
data2 = np.array(pred_arr(Shakir_msg))

In [None]:
myexplode = [0.1, 0.1, 0.1]

# plt.pie(p, labels = labels, explode = myexplode, shadow = True)
# plt.legend(title='Shakir',loc ="upper left",)
# plt.show() 

# plt.subplot(121)
# plt.pie(data1, labels = labels, explode = myexplode, shadow = True)
# plt.subplot(133)
# plt.pie(data2, labels = labels, explode = myexplode, shadow = True)
# plt.show()

# create a figure with two subplots
fig, (ax1, ax2) = plt.subplots(1, 2)

ax1.pie(data1, # Values
        labels = labels, # Labels for each sections
        explode = myexplode, # To slice the perticuler section
        autopct = "%0.2f%%", # Show data in persentage for with 2 decimal point
        shadow = True, # Showing shadow of pie chart
        radius = 1.2, # Radius to increase or decrease the size of pie chart 
        startangle = 270, # Start angle of first section
        )
ax2.pie(data2, # Values
        labels = labels, # Labels for each sections
        explode = myexplode, # To slice the perticuler section
        autopct = "%0.2f%%", # Show data in persentage for with 2 decimal point
        shadow = True, # Showing shadow of pie chart
        radius = 1.2, # Radius to increase or decrease the size of pie chart 
        startangle = 270, # Start angle of first section
        )
# plot each pie chart in a separate subplot
#ax1.pie(data1, labels = labels, explode = myexplode, shadow = True)
#ax2.pie(data2, labels = labels, explode = myexplode, shadow = True)
plt.show()