# Whatsapp Sentiment Analysis

## Imports

In [97]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
# Download twitter data and sentiment analysis model
import nltk
nltk.download('twitter_samples')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.corpus import twitter_samples

import re
import string
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/laurabonnet/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/laurabonnet/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/laurabonnet/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Load Data

In [98]:
df = pd.read_csv('/Users/laurabonnet/Downloads/clean_nus_sms 2.csv').drop(columns = 'Unnamed: 0').set_index('id')
df.head()

Unnamed: 0_level_0,Message,length,country,Date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10120,Bugis oso near wat...,21,SG,2003/4
10121,"Go until jurong point, crazy.. Available only ...",111,SG,2003/4
10122,I dunno until when... Lets go learn pilates...,46,SG,2003/4
10123,Den only weekdays got special price... Haiz......,140,SG,2003/4
10124,Meet after lunch la...,22,SG,2003/4


## Check Data

### Overview

In [99]:
df.describe()

Unnamed: 0,Message,length,country,Date
count,48595,48598,48598,48598
unique,48586,470,40,27
top,Oh,20,Singapore,2011/3
freq,3,861,22013,10435


In [100]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48598 entries, 10120 to 45718
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Message  48595 non-null  object
 1   length   48598 non-null  object
 2   country  48598 non-null  object
 3   Date     48598 non-null  object
dtypes: object(4)
memory usage: 1.9+ MB


### Check null values

In [101]:
df[df['Message'].isnull() == True]

Unnamed: 0_level_0,Message,length,country,Date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
13749,,4,SG,2003/4
14927,,3,SG,2003/4
43451,,4,USA,2014/9


In [102]:
df = df.dropna()

### Short Messages

In [103]:
#convert length to float
df['length'] = pd.to_numeric(df['length'], errors ='coerce')


In [104]:
#Only keep messages with more than 3 characters
df = df[df['length'] > 3]

In [105]:
df.tail()

Unnamed: 0_level_0,Message,length,country,Date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
45712,Come to me AFTER NOON,21.0,Serbia,2015/3
45714,I LOVE YOU TOO,14.0,Serbia,2015/3
45715,C-YA,4.0,Serbia,2015/3
45717,BE MY GUEST,11.0,Serbia,2015/3
45718,MANY MANY MANY PEOPLE,21.0,Serbia,2015/3


## Cleaning Text

In [106]:
#function to remove punctuation, number, stopwords and lemmatize text
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)


def clean_text(text):
    
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ') # Remove Punctuation
        
    lowercased = text.lower() # Lower Case
    lemmatizer = WordNetLemmatizer() 
    tokenized = word_tokenize(lowercased) # Tokenize
    words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
    stop_words = set(stopwords.words('English')) # Make stopword list
    without_stopwords = [word for word in words_only if not word in stop_words]# Remove Stop Words
    lemmatized = [lemmatizer.lemmatize(word) for word in without_stopwords]
    
    return " ".join(lemmatized)

In [107]:
#Cleaning Message
df['clean_message']=df['Message'].apply(remove_urls)
df['clean_message']=df['Message'].apply(clean_text)

In [108]:
df.head()

Unnamed: 0_level_0,Message,length,country,Date,clean_message
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10120,Bugis oso near wat...,21.0,SG,2003/4,bugis oso near wat
10121,"Go until jurong point, crazy.. Available only ...",111.0,SG,2003/4,go jurong point crazy available bugis n great ...
10122,I dunno until when... Lets go learn pilates...,46.0,SG,2003/4,dunno let go learn pilate
10123,Den only weekdays got special price... Haiz......,140.0,SG,2003/4,den weekday got special price haiz cant eat li...
10124,Meet after lunch la...,22.0,SG,2003/4,meet lunch la


## Sentiment Analysis

### Prep work

In [109]:
# Building a dataframe with positive messages, noted 1 in the sentiment column
positive_df = pd.DataFrame(twitter_samples.strings('positive_tweets.json'), columns = ['Message'])
positive_df['sentiment'] = 1

In [110]:
# Building a dataframe with negative messages, noted 0 in the sentiment column
negative_df = pd.DataFrame(twitter_samples.strings('negative_tweets.json'), columns = ['Message'])
negative_df['sentiment'] = 0

In [111]:
# building a dataframe with all messages, and rearranging randomly the order of the rows
tweet_df = pd.concat([positive_df,negative_df]).reset_index(drop = True).sample(frac=1)

In [112]:
#cleaning messages
tweet_df['clean_message']=tweet_df['Message'].apply(clean_text)
tweet_df.head()

Unnamed: 0,Message,sentiment,clean_message
3845,@F41rygirl @paintingandbook You saying you wan...,1,paintingandbook saying want lucy gone soon lis...
4948,all i've done today is watch law &amp; order: ...,1,done today watch law amp order svu love sick
8173,"shakes my head repeatedly. nu-uh, jace, i love...",0,shake head repeatedly nu uh jace love mostest gt
3784,@susie299 gorgeous body :) wonder how close yo...,1,gorgeous body wonder close
7360,@sarahbournex me too :( determined not to feel...,0,sarahbournex determined feel ill tonight


### Training model

In [113]:
#creating a test and train sest to train a model
X = tweet_df['clean_message']
y = tweet_df['sentiment']
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.30)

In [114]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df = 0.5)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


nb_model = MultinomialNB(alpha = 1)
results = cross_validate(nb_model, X_train_vec, y_train, cv=10)
nb_model.fit(X_train_vec,y_train)
nb_model.score(X_test_vec,y_test)

0.7463333333333333

In [115]:
results['test_score'].mean()

0.7464285714285714

In [116]:
# Checking best params for our model

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Create Pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

# Set parameters to search
parameters = {
    'nb__alpha': (0,0.1,0.01,0.5,1),
    'tfidf__max_df' : (0.01,0.05,0.1,0.5,0.7,1)
}

# Perform grid search
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, 
                           verbose=1, scoring = "accuracy", 
                           refit=True, cv=5)

grid_search.fit(X_train,y_train)
grid_search.best_params_

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    7.8s finished


{'nb__alpha': 1, 'tfidf__max_df': 0.5}

### Adding sentiment to DataFrame

In [117]:
#addidng sentiment array
df['sentiment'] = nb_model.predict(vectorizer.transform(df['Message']))

In [118]:
df['sentiment'].value_counts()

0    29398
1    18843
Name: sentiment, dtype: int64

### Sentiment by country

In [129]:
# 10 most positive countries
df = df.replace({'country':{'SG':'Singapore', 
                            'USA':'United States',
                            'india':'India',
                            'INDIA':'India',
                            'srilanka':'Sri Lanka',
                            'UK':'United Kingdom',
                            'BARBADOS':'Barbados',
                            'jamaica':'Jamaica',
                            'MY':'Malaysia',
                            'unknown':'Unknown'}})

df.groupby('country').mean().sort_values('sentiment',ascending=False).head(10)

Unnamed: 0_level_0,length,sentiment
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Morocco,11.777778,0.666667
Turkey,24.2,0.6
Trinidad and Tobago,48.1,0.6
Spain,39.6,0.6
France,30.0,0.6
New Zealand,18.1,0.6
Nepal,18.435897,0.589744
Kenya,48.85,0.55
Pakistan,42.639232,0.537723
Serbia,25.0,0.5


## Most common Topic

In [132]:
from sklearn.decomposition import LatentDirichletAllocation

lda_model = LatentDirichletAllocation(n_components=2).fit(vectorizer.transform(df['Message']))

def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-5 - 1:-1]])
        

print_topics(lda_model, vectorizer)


Topic 0:
[('haha', 1412.1310350938113), ('be', 1148.683638542199), ('lol', 1122.6874936425436), ('ok', 1032.390211702745), ('got', 722.4759650575284)]
Topic 1:
[('can', 1925.6083766359784), ('le', 671.0568875802817), ('he', 621.39273848283), ('go', 598.4296335881584), ('haha', 589.3487747071433)]


## Use POS tagging to find commonly used phrases