## **1. LOADING THE REQUIRED LIBRARIES**

In [37]:
import pandas as pd
import numpy as np
import plotly.express as px
import re

In [18]:
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer() 

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chinm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chinm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\chinm\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [19]:
from gensim.models import Word2Vec

## **2. LOADING THE DATASET IN TRAINING AND TEST SETS**

In [20]:
train_data = pd.read_csv('Corona_NLP_train.csv')
test_data = pd.read_csv('Corona_NLP_test.csv')

In [21]:
train_data.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [22]:
test_data.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


In [23]:
train_data.info() # Basic Information
print("\nShape: ",train_data.shape) # Shape of data
print("\nChecking for missing values")
train_data.isnull().sum()# Check for missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41157 entries, 0 to 41156
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       41157 non-null  int64 
 1   ScreenName     41157 non-null  int64 
 2   Location       32567 non-null  object
 3   TweetAt        41157 non-null  object
 4   OriginalTweet  41157 non-null  object
 5   Sentiment      41157 non-null  object
dtypes: int64(2), object(4)
memory usage: 1.9+ MB

Shape:  (41157, 6)

Checking for missing values


UserName            0
ScreenName          0
Location         8590
TweetAt             0
OriginalTweet       0
Sentiment           0
dtype: int64

In [24]:
#Dropping unnecessary columns
train_data.drop(['Location','ScreenName','UserName','TweetAt'],axis=1,inplace=True)
test_data.drop(['Location','ScreenName','UserName','TweetAt'],axis=1,inplace=True)

In [25]:
train_data.head()

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia: Woolworths to give elde...,Positive
3,My food stock is not the only one which is emp...,Positive
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [26]:
print("Count of sentiment wise values: \n",train_data.Sentiment.value_counts())

Count of sentiment wise values: 
 Sentiment
Positive              11422
Negative               9917
Neutral                7713
Extremely Positive     6624
Extremely Negative     5481
Name: count, dtype: int64


In [27]:
fig=px.histogram(train_data,
                x='Sentiment',
                title='Sentiment Count ',
                color_discrete_sequence=['green'])
fig.update_layout(bargap=0.1)
fig.show()
fig = px.pie(train_data, names='Sentiment',title="Sentiment Distribution")
fig.show()

#### inferences:
- Maximum tweets belong to Positive sentiment followed by negative
- the positive to negative ratio is quite similar.

In [28]:
#Merging data columns Extremely Neagtive Sentiment as Negative and Extremely Positive Sentiment as Positive.
def merge(df):
    df['Sentiment'].replace(to_replace='Extremely Negative',value='Negative',inplace=True)
    df['Sentiment'].replace(to_replace='Extremely Positive',value='Positive',inplace=True)
merge(train_data)
merge(test_data)


In [29]:
train_data['Sentiment'].unique()
test_data['Sentiment'].unique()

array(['Negative', 'Positive', 'Neutral'], dtype=object)

In [30]:
fig = px.pie(train_data, names='Sentiment',title="Sentiment Distribution")
fig.show()

#### Inferences
- Positive tweets still remain maximum followed by negative and neutral respectively

## **3. DATA PREPROCESSING**

In [31]:
##Function to extract the tweet length   
def text_length(tweet):
    str_len=len(tweet.split(" "))
    return(str_len)
train_data['OriginalLength']= train_data['OriginalTweet'].apply(lambda x:text_length(x))
train_data.head()

Unnamed: 0,OriginalTweet,Sentiment,OriginalLength
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,8
1,advice Talk to your neighbours family to excha...,Positive,38
2,Coronavirus Australia: Woolworths to give elde...,Positive,14
3,My food stock is not the only one which is emp...,Positive,40
4,"Me, ready to go at supermarket during the #COV...",Negative,38


In [32]:
for i in range (5):
    print(train_data.OriginalTweet[i])
    print("========================================")

@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/iFz9FAn2Pa and https://t.co/xX6ghGFzCC and https://t.co/I2NlzdxNo8
advice Talk to your neighbours family to exchange phone numbers create contact list with phone numbers of neighbours schools employer chemist GP set up online shopping accounts if poss adequate supplies of regular meds but not over order
Coronavirus Australia: Woolworths to give elderly, disabled dedicated shopping hours amid COVID-19 outbreak https://t.co/bInCA9Vp8P
My food stock is not the only one which is empty...

PLEASE, don't panic, THERE WILL BE ENOUGH FOOD FOR EVERYONE if you do not take more than you need. 
Stay calm, stay safe.

#COVID19france #COVID_19 #COVID19 #coronavirus #confinement #Confinementotal #ConfinementGeneral https://t.co/zrlG0Z520j
Me, ready to go at supermarket during the #COVID19 outbreak.

Not because I'm paranoid, but because my food stock is litteraly empty. The #coronavirus is a serious thing, but please, don't panic. It causes shortage...

#C

In [33]:
train_data['OriginalLength'].describe()

count    41157.000000
mean        30.327818
std         11.633754
min          1.000000
25%         21.000000
50%         31.000000
75%         40.000000
max        127.000000
Name: OriginalLength, dtype: float64

In [34]:
fig = px.histogram(train_data,
                  x='OriginalLength',
                  marginal='box',
                  title="Length of original tweets")
fig.update_layout(bargap=0.1)
fig.show()

#### Inferences:
- Somewhat normal distribution skewed towards right
- outliers are present
- Range of length lies b/w 1 to 127

####  Need for preprocessing
- This tweets contain words and parts that do not contribute to the sentiment it conveys.
- e.g. : Links, Symbols ,Numbers, Carriage returns , Spaces, URLS, etc
- Hence it is necessary to clean the data for better performance

In [38]:
import string

def data_cleaning(data):
    cleaned_data = []
    fillerWord = ("so","yeah","okay","um","uh","mmm","ahan","uh","huh","ahm","oh","sooo","uh","huh","yeh","yah","hmm","bye")
    fillerword_reg= "bye[.,]|so[.,]|yeah[.,]|okay[.,]|um[.,]|uh[.,]|mmm[.,]|ahan[.,]|uh[.,]|huh[.,]|ahm[.,]|oh[.,]|sooo[.,]|uh[.,]|huh[.,]|yeh[.,]|yah[.,]|hmm[.,]"
    STOPWORDS = set(stopwords.words('english'))
    remove=["doesn't","not","nor","neither","isn't","hadn't","mightn't","needn't","wasn't"]
    for i in remove:
        STOPWORDS.discard(i)
    
    STOPWORDS.add(fillerWord)  
    for i in range(len(data)):
        tweet = re.sub("#", "", data[i])#extracting hashtags
        tweet = re.sub(r'^https?:\/\/.*[\r\n]*', '',tweet, flags=re.MULTILINE)#extracting links
        html=re.compile(r'<.*?>')#extracting html tags
        tweet =html.sub(r"", tweet)
        #extracting symbols and characters
        tweet=re.sub(r'@\w+',"",tweet)
        tweet=re.sub(r'#\w+',"",tweet) 
        tweet=re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', tweet) 
        punctuation = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""
        tweet.rstrip(string.punctuation)
        tweet=re.sub('[^A-Za-z\s]+',"", tweet)
        tweet = tweet.lower()
        tweet = tweet.split()
        #Lemmatization to normalise text
        tweet = [lemmatizer.lemmatize(word) for word in tweet if not word in STOPWORDS]
        tweet = ' '.join(tweet)
        filler=re.compile(fillerword_reg)
        tweet=filler.sub("",tweet)
        cleaned_data.append(tweet)
    return cleaned_data


invalid escape sequence '\s'


invalid escape sequence '\s'


invalid escape sequence '\s'



In [39]:
####Preprocessing and cleaning train data
texto_data=train_data['OriginalTweet'].tolist()
process_text=data_cleaning(texto_data)

cleaned_tweet =pd.DataFrame(process_text)
train_data['CleanedTweet']=cleaned_tweet
#cleaned_tweet.head()


train_data.head()

Unnamed: 0,OriginalTweet,Sentiment,OriginalLength,CleanedTweet
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,8,
1,advice Talk to your neighbours family to excha...,Positive,38,advice talk neighbour family exchange phone nu...
2,Coronavirus Australia: Woolworths to give elde...,Positive,14,coronavirus australia woolworth give elderly d...
3,My food stock is not the only one which is emp...,Positive,40,food stock not one empty please dont panic eno...
4,"Me, ready to go at supermarket during the #COV...",Negative,38,ready go supermarket covid outbreak not im par...


In [40]:
####Preprocessing and cleaning test data
test_data['OriginalLength']= test_data['OriginalTweet'].apply(lambda x:text_length(x))
texto_data=test_data['OriginalTweet'].tolist()
process_text=data_cleaning(texto_data)

cleaned_tweet =pd.DataFrame(process_text)
test_data['CleanedTweet']=cleaned_tweet
#cleaned_tweet.head()
test_data.head()

Unnamed: 0,OriginalTweet,Sentiment,OriginalLength,CleanedTweet
0,TRENDING: New Yorkers encounter empty supermar...,Negative,23,trending new yorkers encounter empty supermark...
1,When I couldn't find hand sanitizer at Fred Me...,Positive,31,couldnt find hand sanitizer fred meyer turned ...
2,Find out how you can protect yourself and love...,Positive,13,find protect loved one coronavirus
3,#Panic buying hits #NewYork City as anxious sh...,Negative,35,panic buying hit newyork city anxious shopper ...
4,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral,30,toiletpaper dunnypaper coronavirus coronavirus...


In [41]:
#Adding length of cleaned tweet to dataset
train_data['NewLength']= train_data['CleanedTweet'].apply(lambda x:text_length(x))
test_data['NewLength']= test_data['CleanedTweet'].apply(lambda x:text_length(x))
train_data.head()
#test_data.head()

Unnamed: 0,OriginalTweet,Sentiment,OriginalLength,CleanedTweet,NewLength
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,8,,1
1,advice Talk to your neighbours family to excha...,Positive,38,advice talk neighbour family exchange phone nu...,28
2,Coronavirus Australia: Woolworths to give elde...,Positive,14,coronavirus australia woolworth give elderly d...,12
3,My food stock is not the only one which is emp...,Positive,40,food stock not one empty please dont panic eno...,25
4,"Me, ready to go at supermarket during the #COV...",Negative,38,ready go supermarket covid outbreak not im par...,24


In [42]:
print(train_data.NewLength.describe())
fig = px.histogram(train_data,
                  x='NewLength',
                  marginal='box',
                  title="Length of cleaned tweets",
                  color_discrete_sequence=['green'])
fig.update_layout(bargap=0.1)
fig.show()

count    41157.000000
mean        17.667347
std          6.632804
min          1.000000
25%         13.000000
50%         18.000000
75%         23.000000
max         40.000000
Name: NewLength, dtype: float64


#### Inferences:
- Range changed significantly to 0-40
- No of outliers decreased

## **4. SEPERATING FEATURES AND RESPONSES**

In [43]:
X_train = train_data.CleanedTweet
Y_train = train_data.Sentiment

#### Label Encoding of response i.e. Categorical Variables

In [44]:
from sklearn.preprocessing import LabelEncoder

In [45]:
encoder = LabelEncoder()
Y_train = encoder.fit_transform(Y_train)
#encoder.classes_

In [46]:
Y_train = pd.DataFrame(Y_train,columns=['Sentiment'])
Y_train.head()

Unnamed: 0,Sentiment
0,1
1,2
2,2
3,2
4,0


In [47]:
test_data.head()

Unnamed: 0,OriginalTweet,Sentiment,OriginalLength,CleanedTweet,NewLength
0,TRENDING: New Yorkers encounter empty supermar...,Negative,23,trending new yorkers encounter empty supermark...,18
1,When I couldn't find hand sanitizer at Fred Me...,Positive,31,couldnt find hand sanitizer fred meyer turned ...,14
2,Find out how you can protect yourself and love...,Positive,13,find protect loved one coronavirus,5
3,#Panic buying hits #NewYork City as anxious sh...,Negative,35,panic buying hit newyork city anxious shopper ...,26
4,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral,30,toiletpaper dunnypaper coronavirus coronavirus...,23


In [48]:
X_test = test_data.CleanedTweet
Y_test = test_data.Sentiment

In [49]:
X_test.head()
Y_test.head()

0    Negative
1    Positive
2    Positive
3    Negative
4     Neutral
Name: Sentiment, dtype: object

In [50]:
Y_test = encoder.fit_transform(Y_test)

In [51]:
Y_test = pd.DataFrame(Y_test,columns=['Sentiment'])

In [52]:
Y_test.head()

Unnamed: 0,Sentiment
0,0
1,2
2,2
3,0
4,1


## **5. USING MULTINOMIAL NAIVE BAYES CLASSIFIER FOR MULTICLASS CLASSIFICATION**

##### Initialising bag of word

In [53]:
from sklearn.feature_extraction.text import CountVectorizer

In [54]:
vectorizer = CountVectorizer(max_features= 3000, min_df = 3, max_df = 0.6)
# fit the model for training data
X_train_data = vectorizer.fit_transform(X_train)
X_train_data.shape

(41157, 3000)

In [55]:
X_test_data = vectorizer.fit_transform(X_test)

In [56]:
X_train_data.shape

(41157, 3000)

In [57]:
Y_train.shape

(41157, 1)

In [58]:
from sklearn.feature_extraction.text import TfidfTransformer

In [59]:
transformer = TfidfTransformer()
X_train_data = transformer.fit_transform(X_train_data).toarray()

print(X_train_data.shape)
#X_train_data[0]

(41157, 3000)


In [60]:
Y_train.head()

Unnamed: 0,Sentiment
0,1
1,2
2,2
3,2
4,0


### Naive Bayes Multinomial Classifier

In [61]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [62]:
from sklearn.model_selection import train_test_split
X1_train, X1_test, y1_train, y1_test = train_test_split(X_train_data, Y_train, test_size=0.33,random_state = 15)

In [63]:
model1_nb = MultinomialNB()
y_train_pred = model1_nb.fit(X1_train,y1_train.Sentiment).predict(X1_train)

y_pred = model1_nb.predict(X1_test)
  
# comparing actual response values  with predicted response values 
from sklearn import metrics
print("Multinomial Naive Bayes model train accuracy(in %):", metrics.accuracy_score(y1_train, y_train_pred)*100)
print("Multinomial Naive Bayes model test accuracy(in %):", metrics.accuracy_score(y1_test, y_pred)*100)

Multinomial Naive Bayes model train accuracy(in %): 70.8286491387126
Multinomial Naive Bayes model test accuracy(in %): 66.58813135031659


#### Inferences:
- Very Poor performance on test data when train_test_split was not used
- Better performance on test data while using train_test_split
- Overall accuracy of the model shows average performance


#### HyperParameter Tuning Using Grid Search

In [64]:
from sklearn.model_selection import GridSearchCV

In [65]:
params = {'alpha': [0.01,0.1,0.5,1,10],
         }

multinomial_nb_grid = GridSearchCV(MultinomialNB(), param_grid=params, n_jobs=-1, cv=5, verbose=5)
multinomial_nb_grid.fit(X_train_data,Y_train)

print('Train Accuracy : %.3f'%multinomial_nb_grid.best_estimator_.score(X1_train, y1_train))
print('Test Accuracy : %.3f'%multinomial_nb_grid.best_estimator_.score(X1_test, y1_test))
print('Best Accuracy Through Grid Search : %.3f'%multinomial_nb_grid.best_score_)
print('Best Parameters : ',multinomial_nb_grid.best_params_)

Fitting 5 folds for each of 5 candidates, totalling 25 fits



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



Train Accuracy : 0.709
Test Accuracy : 0.710
Best Accuracy Through Grid Search : 0.670
Best Parameters :  {'alpha': 0.1}


In [68]:
y_pred

array([2, 0, 0, ..., 2, 0, 2])

In [69]:
Y_test

Unnamed: 0,Sentiment
0,0
1,2
2,2
3,0
4,1
...,...
3793,2
3794,0
3795,1
3796,0
