# What is the sentiment around COVID tweets?

In [1]:
import html
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import re
#import seaborn as sns
import string

from io import StringIO
from nltk.corpus import stopwords 
from scipy import stats
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split  
from sklearn.naive_bayes import GaussianNB, MultinomialNB, CategoricalNB
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder

#nltk.download('stopwords')

In [2]:
# Load the Data
covidDat = pd.read_csv("Corona_NLP_test.csv")[["OriginalTweet", "Sentiment"]]
covidDat

Unnamed: 0,OriginalTweet,Sentiment
0,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,When I couldn't find hand sanitizer at Fred Me...,Positive
2,Find out how you can protect yourself and love...,Extremely Positive
3,#Panic buying hits #NewYork City as anxious sh...,Negative
4,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral
...,...,...
3793,Meanwhile In A Supermarket in Israel -- People...,Positive
3794,Did you panic buy a lot of non-perishable item...,Negative
3795,Asst Prof of Economics @cconces was on @NBCPhi...,Neutral
3796,Gov need to do somethings instead of biar je r...,Extremely Negative


# Data Pre-Processing

In [4]:
# initialize stopwords
sw = set(stopwords.words("english")) 

# a function that cleans text and removes stop words
def clean(text, stopwords):
    # remove tags like <tab>
    text = re.sub(r'<[^<>]*>', ' ', text) # split text on whitespace
    text_list = text.split()
    text_words = []
    
    punctuation = set(string.punctuation)
    punctuation.add("”")
    punctuation.add("“")
    punctuation.add("‘")
    punctuation.add("’")
    punctuation.add(",”")
    
    # keep #tags and @mentions
    ## punctuation.remove("#")
    ## punctuation.remove("@")
    
    for word in text_list:
        # remove punctuation marks at the beginning of each word
        while len(word) > 0 and word[0] in punctuation:
            word = word[1:]
            
        # remove punctuation marks at the end of each word
        while len(word) > 0 and word[-1] in punctuation: 
            word = word[:-1]
            
        # a rule to eliminate most urls
        if len(word) > 0 and "/" not in word: 
            # eliminate stopwords
            if word.lower() not in stopwords:
                # append the word to the text_words list
                text_words.append(word.lower()) 
        cleaner_text = " ".join(text_words)
    return cleaner_text

In [5]:
# Clean the Data

# condense the Sentiment column
def change_sen(sentiment):
    if sentiment == "Extremely Positive":
        return 'positive'
    elif sentiment == "Extremely Negative":
        return 'negative'
    elif sentiment == "Positive":
        return 'positive'
    elif sentiment == "Negative":
        return 'negative'
    else:
        return 'netural'
    
covidDat["Sentiment"] = covidDat["Sentiment"].apply(lambda x:change_sen(x))

# encode the Sentiment column
#le = LabelEncoder()
#covidDat['Sentiment'] = covidDat['Sentiment'].apply(lambda x:change_sen(x))

print(covidDat["OriginalTweet"].iloc[0])

covidDat["OriginalTweet"] = covidDat["OriginalTweet"].apply(lambda x: clean(x, stopwords=sw))
                                                           
# select only text with more than 60 words for training
covidDat = covidDat[covidDat["OriginalTweet"].str.len() > 60]

TRENDING: New Yorkers encounter empty supermarket shelves (pictured, Wegmans in Brooklyn), sold-out online grocers (FoodKick, MaxDelivery) as #coronavirus-fearing shoppers stock up https://t.co/Gr76pcrLWh https://t.co/ivMKMsqdT1


In [6]:
covidDat["OriginalTweet"].iloc[0]

'trending new yorkers encounter empty supermarket shelves pictured wegmans brooklyn sold-out online grocers foodkick maxdelivery coronavirus-fearing shoppers stock'

In [7]:
covidDat.shape

(3581, 2)

# Data Splitting

In [8]:
# Split the Data

X_train, X_test, y_train, y_test = train_test_split(covidDat["OriginalTweet"], 
                                                    covidDat["Sentiment"], 
                                                    test_size = .3)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(2506,)
(1075,)
(2506,)
(1075,)


# Model Building and Evaluation

In [9]:
# Transform the Data

tfidf = TfidfVectorizer(ngram_range=(1,2), 
                        stop_words="english", 
                        min_df=10)
X_train_tfidf= tfidf.fit_transform(X_train)

In [10]:
# Build a Model

#construct and fit a model
mnb = MultinomialNB()
mnb = mnb.fit(X_train_tfidf, y_train)

# make a prediction on training set
print("Prediction: ", mnb.predict(X_train_tfidf))

# compute accuracy on training set
mnb.score(X_train_tfidf, y_train)

Prediction:  ['netural' 'netural' 'netural' ... 'netural' 'netural' 'netural']


1.0

In [11]:
# Evaluate the Model

X_test_tfidf = tfidf.transform(X_test)

mnb.score(X_test_tfidf, y_test)

1.0

In [12]:
# Tune the Model

#Cross Validation
scores = cross_val_score(estimator = MultinomialNB(), 
                         X = X_train_tfidf, y = y_train, cv=5) 
print("Average cross validation score: ", scores.mean())
print("Standard deviation of cross validation scores: ", scores.std())

#Grid Search Cross Validation
pipe = Pipeline([("tfidf",TfidfVectorizer(stop_words="english")), 
                 ("mnb", MultinomialNB())])
param_grid = [{"tfidf__min_df":[5, 20],
               "tfidf__ngram_range":[(1, 1), (1, 2), (1, 3)]}]

grid = GridSearchCV(estimator=pipe , param_grid =param_grid, cv=5) 
grid.fit(X_train, y_train)

print("The best parameters include: ", grid.best_params_)
print("Training Accuracy: ", grid.score(X_train, y_train))
print("Testing Accuracy: ", grid.score(X_test, y_test))

Average cross validation score:  1.0
Standard deviation of cross validation scores:  0.0
The best parameters include:  {'tfidf__min_df': 5, 'tfidf__ngram_range': (1, 1)}
Training Accuracy:  1.0
Testing Accuracy:  1.0


# Conclusion and Summary

## Mini-Project Report

### Problem to address:  
##### To explore the sentiment of tweets about COVID

### Description of the data:
##### 3798 tweets which originally contained 6 columns but was reduced to just two columns--the original tweet and the sentiment.  By eliminating tweets with less than 60 words, the number of tweets was reduced to 3581.  This set of data was downloaded from Kaggle and the information included the fact that the originator hand-coded the dependent variable.

### The choice of algorithm: 
##### The Multinomial Naive Bayes algorithm was chosen because it is used for text classification when data is represented as feature vectors

### The performance of the algorithm: 
##### The algorithm had a 71% accuracy rate on the training set.  It had a 61% accuracy on the test data.

### Overfitting? 
##### Because the model is not that good, there is probabily underfitting.  The overall prediction is pretty low.  However if the accuracy is high enough, then there is some overfitting because the accuracy of model on the training set is better than the accuracy on the test data.

### Choice of hyperparameters tunned: 
##### "tfidf__min_df":[5, 20] and "tfidf__ngram_range":[(1, 1), (1, 2), (1, 3) were chosen and the best parameters were: 'tfidf__min_df': 5 and 'tfidf__ngram_range': (1, 1) .

### Recommendation/Conclusion: 
##### Because of the text based nature of this data the Multinomial Naive Bayes is a good choice for the model.  Better accuracy might be achieved with different parameters.  Also, it should be noted that the author of this data included that he hand scored the prediction variable.  They could be the reason for the lower accuracy--human error!