In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
#Reading the dataset with encoding parameter set to ‘latin1’
df = pd.read_csv('Corona_NLP.csv', encoding = 'latin-1')
df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41157 entries, 0 to 41156
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       41157 non-null  int64 
 1   ScreenName     41157 non-null  int64 
 2   Location       32567 non-null  object
 3   TweetAt        41157 non-null  object
 4   OriginalTweet  41157 non-null  object
 5   Sentiment      41157 non-null  object
dtypes: int64(2), object(4)
memory usage: 1.9+ MB


In [4]:
df.shape

(41157, 6)

In [5]:
df.columns

Index(['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet',
       'Sentiment'],
      dtype='object')

In [6]:
df.isnull().sum()
#Only Location column contains 8590 null values. 
#But this column is not useful for our sentiment analysis.Hence we will neglect these null values.

UserName            0
ScreenName          0
Location         8590
TweetAt             0
OriginalTweet       0
Sentiment           0
dtype: int64

In [7]:
#Remove handle null values (if any).
df = df.fillna(method ='pad')
df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,Vagabonds,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,Vagabonds,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [8]:
df['Sentiment'].value_counts()

Positive              11422
Negative               9917
Neutral                7713
Extremely Positive     6624
Extremely Negative     5481
Name: Sentiment, dtype: int64

In [9]:
#Function for removing @user
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i,'',input_txt)
    return input_txt

In [10]:
#Creating new column with removed @user
df['Tweet'] = np.vectorize(remove_pattern)(df['OriginalTweet'], '@[\w]*')
df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,Tweet
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,https://t.co/iFz9FAn2Pa and https://t.co/xX...
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive,advice Talk to your neighbours family to excha...
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive,Coronavirus Australia: Woolworths to give elde...
3,3802,48754,Vagabonds,16-03-2020,My food stock is not the only one which is emp...,Positive,My food stock is not the only one which is emp...
4,3803,48755,Vagabonds,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative,"Me, ready to go at supermarket during the #COV..."


In [11]:
#Removed HTTP And URLS from Tweet
df['Tweet'] = df['Tweet'].apply(lambda x: re.split('https:\/\/.*', str(x))[0])
df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,Tweet
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive,advice Talk to your neighbours family to excha...
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive,Coronavirus Australia: Woolworths to give elde...
3,3802,48754,Vagabonds,16-03-2020,My food stock is not the only one which is emp...,Positive,My food stock is not the only one which is emp...
4,3803,48755,Vagabonds,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative,"Me, ready to go at supermarket during the #COV..."


In [12]:
#Removed special characters, numbers, punctuations
df['Tweet'] = df['Tweet'].str.replace('[^a-zA-Z#]+',' ')
df.head()

  df['Tweet'] = df['Tweet'].str.replace('[^a-zA-Z#]+',' ')


Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,Tweet
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive,advice Talk to your neighbours family to excha...
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive,Coronavirus Australia Woolworths to give elder...
3,3802,48754,Vagabonds,16-03-2020,My food stock is not the only one which is emp...,Positive,My food stock is not the only one which is emp...
4,3803,48755,Vagabonds,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative,Me ready to go at supermarket during the #COVI...


In [13]:
#Convert words to lower case
df["Tweet"] = df["Tweet"].str.lower()
df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,Tweet
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive,advice talk to your neighbours family to excha...
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive,coronavirus australia woolworths to give elder...
3,3802,48754,Vagabonds,16-03-2020,My food stock is not the only one which is emp...,Positive,my food stock is not the only one which is emp...
4,3803,48755,Vagabonds,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative,me ready to go at supermarket during the #covi...


In [14]:
import nltk 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [16]:
#Removing Stopwords
df['Tweet'].apply(lambda x: [item for item in x if item not in stop])
df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,Tweet
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive,advice talk to your neighbours family to excha...
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive,coronavirus australia woolworths to give elder...
3,3802,48754,Vagabonds,16-03-2020,My food stock is not the only one which is emp...,Positive,my food stock is not the only one which is emp...
4,3803,48755,Vagabonds,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative,me ready to go at supermarket during the #covi...


In [17]:
#Creating new variable tokenized tweet 
tokenized_tweet = df['Tweet'].apply(lambda x: x.split())

In [18]:
from nltk.stem.porter import *
stemmer = PorterStemmer()

In [19]:
#Applying stemmer for tokenized_tweet
tokenized_tweet = tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x])
df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,Tweet
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive,advice talk to your neighbours family to excha...
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive,coronavirus australia woolworths to give elder...
3,3802,48754,Vagabonds,16-03-2020,My food stock is not the only one which is emp...,Positive,my food stock is not the only one which is emp...
4,3803,48755,Vagabonds,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative,me ready to go at supermarket during the #covi...


In [20]:
#Joining tokens into one sentence
for i in range(len(tokenized_tweet)):
    tokenized_tweet[i] = ' '.join(tokenized_tweet[i])

In [21]:
df['Tweet']  = tokenized_tweet
df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,Tweet
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive,advic talk to your neighbour famili to exchang...
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive,coronaviru australia woolworth to give elderli...
3,3802,48754,Vagabonds,16-03-2020,My food stock is not the only one which is emp...,Positive,my food stock is not the onli one which is emp...
4,3803,48755,Vagabonds,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative,me readi to go at supermarket dure the #covid ...


In [22]:
#Converting the 'Extremely Positive' and 'Extremely Negative' Sentiments to 'Positive' and 'Negative' sentiments respectively
df['Sentiment'] = df['Sentiment'].replace(['Extremely Negative','Extremely Positive'],['Negative','Positive'])
df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,Tweet
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive,advic talk to your neighbour famili to exchang...
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive,coronaviru australia woolworth to give elderli...
3,3802,48754,Vagabonds,16-03-2020,My food stock is not the only one which is emp...,Positive,my food stock is not the onli one which is emp...
4,3803,48755,Vagabonds,16-03-2020,"Me, ready to go at supermarket during the #COV...",Negative,me readi to go at supermarket dure the #covid ...


In [23]:
#Spitting Our Dataset into Training And Testing Dataset
from sklearn.model_selection import train_test_split

train,test = train_test_split(df,test_size = 0.2,random_state=0,stratify = df.Sentiment.values)
print("train shape : ", train.shape)
print("test shape : ", test.shape)

train shape :  (32925, 7)
test shape :  (8232, 7)


In [24]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
stop = list(stopwords.words('english'))
vectorizer = CountVectorizer(decode_error = 'replace',stop_words = stop)

In [25]:
#Using Counter Vectorizer For Multi Class Classification
X_train = vectorizer.fit_transform(train.Tweet.values)
X_test = vectorizer.transform(test.Tweet.values)

Y_train = train.Sentiment.values
Y_test = test.Sentiment.values

print("X_train.shape : ", X_train.shape)
print("X_test.shape : ", X_test.shape)
print("y_train.shape : ", Y_train.shape)
print("y_test.shape : ", Y_test.shape)

X_train.shape :  (32925, 27328)
X_test.shape :  (8232, 27328)
y_train.shape :  (32925,)
y_test.shape :  (8232,)


In [26]:
#Using Naive Bayes Classifier for MULTICLASS Classification
from sklearn.naive_bayes import MultinomialNB
naiveByes_clf = MultinomialNB()
naiveByes_clf.fit(X_train,Y_train)

MultinomialNB()

In [27]:
predict1 = naiveByes_clf.predict(X_test)
predict1

array(['Positive', 'Neutral', 'Positive', ..., 'Positive', 'Positive',
       'Positive'], dtype='<U8')

In [28]:
from sklearn.metrics import confusion_matrix,classification_report
cm = confusion_matrix(predict1,Y_test)
print(cm)
print(classification_report(predict1,Y_test))

[[2344  487  617]
 [  77  398   96]
 [ 659  658 2896]]
              precision    recall  f1-score   support

    Negative       0.76      0.68      0.72      3448
     Neutral       0.26      0.70      0.38       571
    Positive       0.80      0.69      0.74      4213

    accuracy                           0.68      8232
   macro avg       0.61      0.69      0.61      8232
weighted avg       0.75      0.68      0.71      8232



In [29]:
from sklearn.svm import SVC
svc = SVC(kernel="linear")

In [30]:
svc.fit(X_train, Y_train) 

SVC(kernel='linear')

In [31]:
predict2 = svc.predict(X_test)
predict2

array(['Positive', 'Neutral', 'Positive', ..., 'Neutral', 'Positive',
       'Positive'], dtype=object)

In [32]:
cm2 = confusion_matrix(Y_test,predict2)
print(cm2)
print(classification_report(Y_test,predict2))

[[2500  229  351]
 [ 252 1090  201]
 [ 396  258 2955]]
              precision    recall  f1-score   support

    Negative       0.79      0.81      0.80      3080
     Neutral       0.69      0.71      0.70      1543
    Positive       0.84      0.82      0.83      3609

    accuracy                           0.80      8232
   macro avg       0.78      0.78      0.78      8232
weighted avg       0.80      0.80      0.80      8232



In [33]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=2)

In [34]:
knn.fit(X_train,Y_train)

KNeighborsClassifier(n_neighbors=2)

In [35]:
predict3 = knn.predict(X_test)
predict3

array(['Neutral', 'Neutral', 'Negative', ..., 'Neutral', 'Neutral',
       'Positive'], dtype=object)

In [36]:
cm3 = confusion_matrix(Y_test,predict3)
print(cm3)
print(classification_report(Y_test,predict3))

[[1360 1629   91]
 [ 200 1305   38]
 [ 675 2389  545]]
              precision    recall  f1-score   support

    Negative       0.61      0.44      0.51      3080
     Neutral       0.25      0.85      0.38      1543
    Positive       0.81      0.15      0.25      3609

    accuracy                           0.39      8232
   macro avg       0.55      0.48      0.38      8232
weighted avg       0.63      0.39      0.37      8232

