# Load Libraries

In [1]:
import nltk
import re
import pandas as pd
from nltk.corpus import stopwords
import string
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import sklearn 

In [2]:
# download the stopwords
nltk.download('stopwords')
stopwords = stopwords_english = stopwords.words('english')

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1108)>


#  Load Data

In [3]:
train_df = pd.read_csv('/Users/alijanatiidr/Desktop/Prog/Projects/Corona_tweets_sentiment_analysis/Corona_NLP_train.csv', encoding='latin-1')
test_df = pd.read_csv('/Users/alijanatiidr/Desktop/Prog/Projects/Corona_tweets_sentiment_analysis/Corona_NLP_test.csv', encoding='latin-1')

In [4]:
train_df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [5]:
test_df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


# Implement helpers functions

In [6]:
# Define process_tweet function that takes a tweet, tokenizes it, removes stopwords, stems words and lowercases it
def process_tweet(tweet):
    stemmer = PorterStemmer()
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    tweet = re.sub(r'https?:\/\/.*[\r]*', '', tweet)
    tweet = re.sub(r'#', '', tweet)
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)
    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and word not in string.punctuation):
            stem_word = stemmer.stem(word)
            tweets_clean.append(stem_word)
    return tweets_clean


In [7]:
tweet = train_df['OriginalTweet'][1]
print(process_tweet(tweet))

['advic', 'talk', 'neighbour', 'famili', 'exchang', 'phone', 'number', 'creat', 'contact', 'list', 'phone', 'number', 'neighbour', 'school', 'employ', 'chemist', 'gp', 'set', 'onlin', 'shop', 'account', 'poss', 'adequ', 'suppli', 'regular', 'med', 'order']


In [8]:
# Define build_freqs function that takes a list of tweets and labels and returns a dictionary with the frequency of each word
def build_freqs(tweets, ys):
    yslist = np.squeeze(ys).tolist()
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1
    return freqs

In [9]:
tweets = ['i am very happy', 'i am very sad', 'i am happy', 'i am sad', 'i am going to school', 'i am eating pasta']
ys = np.array([[2], [-2], [1], [-1], [0], [0]])
freqs = build_freqs(tweets, ys)
print('freqs = ', freqs)

freqs =  {('happi', 2): 1, ('sad', -2): 1, ('happi', 1): 1, ('sad', -1): 1, ('go', 0): 1, ('school', 0): 1, ('eat', 0): 1, ('pasta', 0): 1}


# Explore Data

In [10]:
train_df.columns.tolist(), test_df.columns.tolist()

(['UserName',
  'ScreenName',
  'Location',
  'TweetAt',
  'OriginalTweet',
  'Sentiment'],
 ['UserName',
  'ScreenName',
  'Location',
  'TweetAt',
  'OriginalTweet',
  'Sentiment'])

In [11]:
# Dropping username, screenname, location, tweetAt columns because they are not useful for our analysis
train_df = train_df.drop(['UserName', 'ScreenName', 'Location', 'TweetAt'], axis=1)
test_df = test_df.drop(['UserName', 'ScreenName', 'Location', 'TweetAt'], axis=1)

In [12]:
# Looking at NaN Values
train_df.isnull().sum(), test_df.isnull().sum()

(OriginalTweet    0
 Sentiment        0
 dtype: int64,
 OriginalTweet    0
 Sentiment        0
 dtype: int64)

In [13]:
# Computing training labels proportion
train_labels_proportion = pd.DataFrame(train_df['Sentiment'].value_counts())
train_labels_proportion['labels'] = train_labels_proportion.index
train_labels_proportion.columns = ['count', 'labels'] 
train_labels_proportion.reset_index(drop=True, inplace=True)
train_labels_proportion = train_labels_proportion[['labels', 'count']]

In [14]:
# Plotting the proportion of labels in the training set
fig = px.pie(train_labels_proportion, values='count', names='labels', title='Proportion of labels in the training set')
fig.show()


In [15]:
# Computing test labels proportion
test_labels_proportion = pd.DataFrame(test_df['Sentiment'].value_counts())
test_labels_proportion['labels'] = test_labels_proportion.index
test_labels_proportion.columns = ['count', 'labels']
test_labels_proportion.reset_index(drop=True, inplace=True)
test_labels_proportion = test_labels_proportion[['labels', 'count']]
test_labels_proportion

Unnamed: 0,labels,count
0,Negative,1041
1,Positive,947
2,Neutral,619
3,Extremely Positive,599
4,Extremely Negative,592


In [16]:
# Plotting the proportion of labels in the test set
fig = px.pie(test_labels_proportion, values='count', names='labels', title='Proportion of labels in the test set')
fig.show()

In [17]:
# store the labels as numbers in a dictionary
labels_dict = {'Extremely Negative': -2, 'Negative': -1, 'Neutral': 0, 'Positive': 1, 'Extremely Positive': 2}

In [18]:
# map the labels to the training set and test set
train_df['labels'] = train_df['Sentiment'].map(labels_dict)
test_df['labels'] = test_df['Sentiment'].map(labels_dict)

# drop the Sentiment column
train_df = train_df.drop(['Sentiment'], axis=1)
test_df = test_df.drop(['Sentiment'], axis=1)

# Modelize sentiment analysis

In [19]:
# create validation set that contains same stratified proportion of labels as the training set
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['labels'], random_state=42)

In [20]:
# create frequency dictionary
freqs = build_freqs(train_df['OriginalTweet'], train_df['labels'])
freqs

{('consum', 2): 638,
 ('voic', 2): 5,
 ('compil', 2): 5,
 ('list', 2): 79,
 ('creativ', 2): 29,
 ('idea', 2): 58,
 ('best', 2): 272,
 ('practic', 2): 74,
 ('stay', 2): 425,
 ('connect', 2): 26,
 ('pandem', 2): 401,
 ('includ', 2): 93,
 ('way', 2): 235,
 ('commun', 2): 161,
 ('love', 2): 238,
 ('one', 2): 290,
 ('activ', 2): 33,
 ('engag', 2): 19,
 ('isol', 2): 49,
 ('europ', 1): 28,
 ('amp', 1): 318,
 ('central', 1): 27,
 ('asia', 1): 13,
 ('region', 1): 25,
 ('growth', 1): 39,
 ('expect', 1): 109,
 ('fall', 1): 87,
 ('recess', 1): 16,
 ('2020', 1): 183,
 ('held', 1): 12,
 ('back', 1): 206,
 ('pandem', 1): 741,
 ('rebound', 1): 10,
 ('2021', 1): 7,
 ('polici', 1): 40,
 ('measur', 1): 128,
 ('introduc', 1): 23,
 ('global', 1): 141,
 ('commod', 1): 59,
 ('price', 1): 1828,
 ('recov', 1): 20,
 ('trade', 1): 69,
 ('strengthen', 1): 7,
 ('see', -1): 310,
 ('empti', -1): 245,
 ('shelv', -1): 326,
 ('groceri', -1): 1176,
 ('store', -1): 1453,
 ('morn', -1): 84,
 ('realli', -1): 123,
 ('stir',

## Model 1: Scikit learn logistic regression

In [21]:
# Trying scikit learn logistic regression knowing that we already have the frequency dictionary 
from sklearn.linear_model import LogisticRegression

# vectorizing tweets using the frequency dictionary
processed_X_train = [process_tweet(tweet) for tweet in train_df['OriginalTweet']]
X_vectors = np.zeros((len(processed_X_train), 6))
for i in range(len(processed_X_train)):
    for word in processed_X_train[i]:
        X_vectors[i, 0] = 1
        X_vectors[i, 1] += freqs.get((word, 2), 0)
        X_vectors[i, 2] += freqs.get((word, 1), 0)
        X_vectors[i, 3] += freqs.get((word, 0), 0)
        X_vectors[i, 4] += freqs.get((word, -1), 0)
        X_vectors[i, 5] += freqs.get((word, -2), 0)

x_train = X_vectors
y_train = train_df['labels']

In [22]:
# Training the model
clf = LogisticRegression(random_state=42, max_iter=1000)
clf.fit(x_train, y_train)

# vectorizing validation set
processed_X_val = [process_tweet(tweet) for tweet in val_df['OriginalTweet']]
X_vectors = np.zeros((len(processed_X_val), 6))
for i in range(len(processed_X_val)):
    for word in processed_X_val[i]:
        X_vectors[i, 0] = 1
        X_vectors[i, 1] += freqs.get((word, 2), 0)
        X_vectors[i, 2] += freqs.get((word, 1), 0)
        X_vectors[i, 3] += freqs.get((word, 0), 0)
        X_vectors[i, 4] += freqs.get((word, -1), 0)
        X_vectors[i, 5] += freqs.get((word, -2), 0)

x_val = X_vectors
y_val = val_df['labels']

# Predicting on validation set
y_pred = clf.predict(x_val)

# Computing accuracy
from sklearn.metrics import accuracy_score
accuracy_score(y_val, y_pred)

0.37038386783284744

## Model 2: 2-class classification: positive vs negative

In [23]:
# implementing binary classification for positive and negative tweets
train_df['labels'] = train_df['labels'].apply(lambda x: 1 if x > 0 else 0)
val_df['labels'] = val_df['labels'].apply(lambda x: 1 if x > 0 else 0)
test_df['labels'] = test_df['labels'].apply(lambda x: 1 if x > 0 else 0)

# create frequency dictionary
freqs = build_freqs(train_df['OriginalTweet'], train_df['labels'])


In [24]:
# vectorizing tweets using the frequency dictionary
processed_X_train = [process_tweet(tweet) for tweet in train_df['OriginalTweet']]
X_vectors = np.zeros((len(processed_X_train), 3))
for i in range(len(processed_X_train)):
    for word in processed_X_train[i]:
        X_vectors[i, 0] = 1
        X_vectors[i, 1] += freqs.get((word, 1), 0)
        X_vectors[i, 2] += freqs.get((word, 0), 0)

x_train = X_vectors
y_train = train_df['labels']

# Training the model
clf = LogisticRegression(random_state=42, max_iter=1000)
clf.fit(x_train, y_train)

# vectorizing validation set
processed_X_val = [process_tweet(tweet) for tweet in val_df['OriginalTweet']]
X_vectors = np.zeros((len(processed_X_val), 3))
for i in range(len(processed_X_val)):
    for word in processed_X_val[i]:
        X_vectors[i, 0] = 1
        X_vectors[i, 1] += freqs.get((word, 1), 0)
        X_vectors[i, 2] += freqs.get((word, 0), 0)

x_val = X_vectors
y_val = val_df['labels']

# Predicting on validation set
y_pred = clf.predict(x_val)

# Computing accuracy
accuracy_score(y_val, y_pred)



0.6637512147716229

## Model 3: 2-class classification gradient descent

In [25]:
# implementing gradient descent for logistic regression using (x_train, y_train)
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def log_loss(y, y_hat):
    m = y.shape[0]
    cost = -(1 / m) * (np.dot(y.T, np.log(y_hat)) + np.dot((1 - y).T, np.log(1 - y_hat)))
    return cost

def batch_gradient_descent(x, y, alpha, learning_rate, num_iter):
    m = x.shape[0]
    y = np.array(y).reshape(-1, 1)
    theta = np.zeros((x.shape[1], 1))
    for i in range(num_iter):
        z = x @ alpha
        h = sigmoid(z)
        cost = log_loss(y, h)
        gradient = (1 / m) * x.T @ np.subtract(h, y)
        alpha = alpha - learning_rate * gradient
    return alpha


In [26]:
# Training the model and getting the optimal alpha
alpha = np.zeros((x_train.shape[1], 1))
learning_rate = 0.01
num_iter = 1000
optimal_alpha = batch_gradient_descent(x_train, y_train, alpha, learning_rate, num_iter)
optimal_alpha


overflow encountered in exp


divide by zero encountered in log



array([[-4.55604837e-02],
       [ 7.19481228e+02],
       [-5.34885880e+02]])

In [27]:
# Predicting on validation set
pre_results = sigmoid(x_val @ optimal_alpha)
y_pred = np.where(pre_results > 0.5, 1, 0)

# Computing accuracy
accuracy_score(y_val, y_pred)


overflow encountered in exp



0.5579446064139941

## Model 4: Naïve Bayes 2-class classification

In [28]:
# building frequency dictionary using train_df
freqs = build_freqs(train_df['OriginalTweet'], train_df['labels'])

# computing log prior
log_prior = np.log(len(train_df[train_df['labels'] == 1]) / len(train_df[train_df['labels'] == 0]))

# computing log likelihood using laplace smoothing
def compute_log_likelihood(freqs, word, label):
    n = freqs.get((word, label), 0)
    d = sum([freqs.get((word, label), 0) for label in [0, 1]])
    return np.log((n + 1) / (d + 2))

# computing log likelihood for each word in the vocabulary
def compute_log_likelihoods(freqs):
    log_likelihoods = {}
    for word in freqs.keys():
        log_likelihoods[word] = compute_log_likelihood(freqs, word[0], word[1])
    return log_likelihoods

log_likelihoods = compute_log_likelihoods(freqs)

# implementing naive bayes classifier
def naive_bayes_predict(tweet, log_prior, log_likelihoods):
    word_l = process_tweet(tweet)
    p = 0
    p += log_prior
    for word in word_l:
        if (word, 1) in log_likelihoods:
            p += log_likelihoods[(word, 1)]
        if (word, 0) in log_likelihoods:
            p -= log_likelihoods[(word, 0)]
    return p

# predicting on validation set
y_pred = []
for tweet in val_df['OriginalTweet']:
    if naive_bayes_predict(tweet, log_prior, log_likelihoods) > 0:
        y_pred.append(1)
    else:
        y_pred.append(0)

# computing accuracy
accuracy_score(y_val, y_pred)



0.718172983479106

# Training best model on train_df and val_df and computing accuracy on test_df to get generalization benchmark

In [29]:
len(train_df), len(val_df)

(32925, 8232)

In [30]:
# implementing naivee bayes on train_df and val_df
train_df = train_df.append(val_df)
len(train_df)


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



41157

In [31]:
# building frequency dictionary using train_df
freqs = build_freqs(train_df['OriginalTweet'], train_df['labels'])

# computing log prior
log_prior = np.log(len(train_df[train_df['labels'] == 1]) / len(train_df[train_df['labels'] == 0]))

# computing log likelihood using laplace smoothing
log_likelihoods = compute_log_likelihoods(freqs)

# predicting on test set
y_pred = []
for tweet in test_df['OriginalTweet']:
    if naive_bayes_predict(tweet, log_prior, log_likelihoods) > 0:
        y_pred.append(1)
    else:
        y_pred.append(0)


In [32]:
# computing accuracy   
accuracy_score(test_df['labels'], y_pred)

0.7101105845181674

- This score can be considered as a benchmark of our model's capacity to generalize predictions.
- Next steps would be to try vector spaces model or to fine tune scikit learn's logistic regression using grid search in order to reach a better accuracy.