<a href="https://colab.research.google.com/github/Jennylin331431/JSC270_NLP_Project/blob/main/Part_I_Sentiment_Analysis_with_a_Twitter_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initial data import and cleaning

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
import random
import math
import io

  import pandas.util.testing as tm


In [None]:
# Import data
# Upload order matters: a later cell reads 'covid-tweets-test.csv' from `test`
# and 'covid-tweets-train.csv' from `train`, so the first upload must be the
# test file and the second the training file.
from google.colab import files
test = files.upload()
train= files.upload()


In [None]:
# Wrap the uploaded file contents in in-memory buffers and parse them as CSV
# (comma is already read_csv's default separator).
test_data = pd.read_csv(io.BytesIO(test['covid-tweets-test.csv']))
train_data = pd.read_csv(io.BytesIO(train['covid-tweets-train.csv']))

In [None]:
# Inspect the label distribution of both splits; the valid Sentiment classes
# are 0, 1 and 2, so anything else showing up here is a malformed row.
print(train_data.value_counts(subset="Sentiment"))
print(test_data.value_counts(subset="Sentiment"))

In [None]:
# Drop every training row whose Sentiment is not one of the valid classes 0, 1, 2.
# Coercing to numeric turns malformed labels (such as ' PA"' or ' England"') into
# NaN, so they fail the isin() test without having to be listed by hand. Rows with
# a missing Sentiment are removed here as well.
sentiment_numeric = pd.to_numeric(train_data['Sentiment'], errors='coerce')
invalid_rows = train_data.index[~sentiment_numeric.isin([0, 1, 2])]
train_data.drop(invalid_rows, inplace=True)

In [None]:
# Drop N/A values
# dropna() with no arguments removes every row that has a missing value in any
# column. inplace=True mutates the existing frames, so the dropped rows can only
# be recovered by re-running the cell that reads the CSV files.
train_data.dropna(inplace=True)
test_data.dropna(inplace=True)

In [None]:
# Cast the Sentiment labels to 64-bit floats in both splits
train_data['Sentiment'] = train_data['Sentiment'].astype(np.float64)
test_data['Sentiment'] = test_data['Sentiment'].astype(np.float64)

In [None]:
# Per-column count of missing values in the training data
train_data.isna().sum()

In [None]:
# Preview the first five training rows
train_data.head(5)

# Part (A)

(1 pt) Consider the training data. What is the balance between the three classes? In other words, what proportion of the observations (in the training set) belong to each class?

In [None]:
# Number of training tweets in each Sentiment class
train_data.value_counts(subset='Sentiment')

In [None]:
# get balance between three classes
# The counts are taken from train_data itself rather than copied by hand, so the
# proportions stay correct if the data or the cleaning steps change.
class_counts = train_data['Sentiment'].value_counts()
total = class_counts.sum()
for sentiment_class in (0, 1, 2):
  # .get(..., 0) reports a proportion of 0 for a class with no observations
  print(f"Sentiment {sentiment_class} proportion: ", class_counts.get(sentiment_class, 0) / total)

# Part (B)

(1 pt) Tokenize the tweets. In other words, for each observation, convert the tweet from a single string of running text into a list of individual tokens (possibly with punctuation), splitting on whitespace. The result should be that each observation (tweet) is a list of individual tokens.


In [None]:
# nltk is still imported here because later cells call nltk.download(...)
import nltk

# Create a new column in our DF that contains token lists instead of raw text.
# Tokens are produced by splitting on runs of whitespace, as the task asks.
# nltk.word_tokenize is not used: it separates ':' from the text that follows,
# so a URL such as https://t.co/x is broken into several tokens and the
# URL-removal step (which matches `http\S+` inside a single token) would only
# blank the scheme and leave the rest of the URL behind as a token.
train_data['tokens'] = train_data['OriginalTweet'].str.split()

test_data['tokens'] = test_data['OriginalTweet'].str.split()

print(train_data['tokens'].head(5))
print(test_data['tokens'].head(5))

# Part (C)

(1 pt) Using a regular expression, remove any URL tokens from each of the observations.

In [None]:
import re
##### Remove URL tokens #####

def remove_URL_tokens(data, column):
  """Blank out URL text inside every token of ``data[column]``, in place.

  Each row of the column is iterated as a list of string tokens. Any match of
  ``http`` followed by one or more non-whitespace characters is replaced with
  ``''``; a token that was entirely a URL therefore becomes an empty string
  rather than being removed from its list.
  """
  url_pattern = re.compile(r'http\S+')
  data[column] = [
      [url_pattern.sub('', token) for token in token_list]
      for token_list in data[column]
  ]


In [None]:
# Strip URL text from the token lists of both splits (test first, then train)
remove_URL_tokens(data=test_data, column='tokens')
remove_URL_tokens(data=train_data, column='tokens')

# Spot-check the first five rows of each split
print(train_data['tokens'].head())
print(test_data['tokens'].head())

# Part (D)

Remove all punctuation (,.?!;:’") and special characters (@, #, +, &, =, $, etc.). Also, convert all tokens to lowercase. Can you think of a scenario when you might want to keep some forms of punctuation?

In [None]:
##### Convert tokens into lowercase ####

def convert_lowercase(data, column):
  """Lowercase every token in ``data[column]`` and write the lists back to that column."""
  def _lower_all(token_list):
    return [token.lower() for token in token_list]

  data[column] = [_lower_all(token_list) for token_list in data[column]]

In [None]:
# Lowercase the token lists of the training and testing data
convert_lowercase(data=train_data, column='tokens')
convert_lowercase(data=test_data, column='tokens')

# Preview five rows from each split
print(train_data['tokens'].head())
print(test_data['tokens'].head())

In [None]:
##### Remove punctuation and special characters #####

def remove_special_char(data, column):
  """Strip punctuation and special characters from every token in ``data[column]``.

  The column is updated in place; each row is iterated as a list of string
  tokens. Every character that is neither a word character nor whitespace is
  deleted, and so is the underscore (which the regex class ``\\w`` would
  otherwise keep). A token made up only of such characters becomes ``''``
  and is left in its list.
  """
  # Raw string: '\w' and '\s' in a normal string literal are invalid escape
  # sequences and trigger a warning on current Python versions.
  special_chars = re.compile(r'[^\w\s]|_')
  data[column] = [
      [special_chars.sub('', token) for token in token_list]
      for token_list in data[column]
  ]

In [None]:
# Strip punctuation and special characters from both splits
remove_special_char(data=train_data, column='tokens')
remove_special_char(data=test_data, column='tokens')

# Preview five rows from each split
print(train_data['tokens'].iloc[:5])
print(test_data['tokens'].iloc[:5])

In [None]:
#### Remove empty tokens ####
def remove_empty_tokens(data, column):
  """Drop tokens equal to the empty string from every list in ``data[column]``, in place."""
  data[column] = [
      [token for token in token_list if token != '']
      for token_list in data[column]
  ]

In [None]:
# Drop the empty-string tokens left behind by the earlier cleaning steps
remove_empty_tokens(data=train_data, column='tokens')
remove_empty_tokens(data=test_data, column='tokens')

# Preview five rows from each split
print(train_data['tokens'].head())
print(test_data['tokens'].head())

# Part (E)

Now stem your tokens. This will have the effect of converting similar word forms into identical tokens (e.g. run, runs, running → run). Please specify which stemmer you use.

In [None]:
# Keep the cleaned, unstemmed tokens in a separate column for later (Part K)
train_data['tokens_k'] = train_data['tokens'].copy()
test_data['tokens_k'] = test_data['tokens'].copy()

In [None]:
#### Stemming tokens ####
# Import only the class that is used instead of `import *`, so the notebook
# namespace is not filled with names whose origin is unclear.
from nltk.stem.porter import PorterStemmer

# Porter stemmer instance used by stem_tokens()
stemmer = PorterStemmer()

def stem_tokens(data, column):
  """Replace every token in ``data[column]`` with ``stemmer.stem(token)``, in place.

  ``stemmer`` is the module-level stemmer object defined in this notebook.
  """
  data[column] = [
      [stemmer.stem(token) for token in token_list]
      for token_list in data[column]
  ]

In [None]:
# Apply stemming to the 'tokens' column of the training and testing data
stem_tokens(train_data, column='tokens')
stem_tokens(test_data, column='tokens')

# Show the first three stemmed rows of each split
print('After stemming:\n', train_data['tokens'].head(n=3))
print('After stemming:\n', test_data['tokens'].head(n=3))

# Part (F)

Lastly, remove stopwords. Using the english stopwords list from nltk, remove these common words from your observations. This list is very long (I think almost 200 words), so remove only the first 100 stopwords in the list.



In [None]:
# Import stopwords
from nltk.corpus import stopwords
# Fetch the NLTK stopword corpus
nltk.download('stopwords')
# Keep only the first 100 entries of NLTK's English stopword list, as the task specifies
sw = stopwords.words('english')[0:100]

print(sw)

In [None]:
##### Remove Stopwords #####
def remove_stopwords(data, column, stop_words=None):
  """Remove stopword tokens from every list in ``data[column]``, in place.

  Args:
    data: DataFrame whose ``column`` holds one list of string tokens per row.
    column: name of the token column to filter and overwrite.
    stop_words: iterable of tokens to remove. When omitted, the notebook-level
      list ``sw`` is used, which matches the previous behaviour.
  """
  if stop_words is None:
    stop_words = sw
  # A set gives constant-time membership tests; the list was scanned once per token.
  stop_set = set(stop_words)
  data[column] = [
      [token for token in token_list if token not in stop_set]
      for token_list in data[column]
  ]

In [None]:
# Filter the stopwords out of the stemmed tokens in both splits
remove_stopwords(train_data, column='tokens')
remove_stopwords(test_data, column='tokens')

# Inspect the last five rows of each split
print(train_data['tokens'].tail())
print(test_data['tokens'].tail())

# Part (G)

Now convert your lists of words into vectors of word counts. You may find Scikit-learn’s CountVectorizer useful here. What is the length of your vocabulary?


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Features are the token lists and labels are the Sentiment values, each as a numpy array
X_train = train_data['tokens'].to_numpy()
y_train = train_data['Sentiment'].to_numpy()
X_test = test_data['tokens'].to_numpy()
y_test = test_data['Sentiment'].to_numpy()

def override_fcn(doc):
  """Return ``doc`` unchanged.

  Passed to CountVectorizer as both tokenizer and preprocessor because each
  document is already a list of tokens.
  """
  return doc

# Bag-of-words vectorizer over pre-tokenized input: the tokenizer and the
# preprocessor are both the identity function, and max_features limits the
# vocabulary to 1000 entries.
count_vec = CountVectorizer(
    analyzer='word',
    preprocessor=override_fcn,
    tokenizer=override_fcn,
    token_pattern=None,
    max_features=1000)

# Learn the vocabulary on the training tokens (the result is a SciPy sparse matrix)
counts_train = count_vec.fit_transform(X_train)
print(counts_train.toarray())
# Encode the test tokens with the vocabulary learned from the training data
counts_test = count_vec.transform(X_test)
print(counts_test.toarray())

# Size of the learned vocabulary
print("Length of vocabulary: ", len(count_vec.vocabulary_))
# The token -> feature index mapping
print(count_vec.vocabulary_)

# Feature index assigned to the token 'great'
print('\nGreat is located at row: ',count_vec.vocabulary_['great'])

In [None]:
# Dense copy of the training counts; its shape is (number of tweets, number of features)
feature_matrix = counts_train.toarray()
feature_matrix.shape

# Part (H)

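(4 pts) Recall the definition of the Naive Bayes model. If each document (tweet) is a collection of words $(w_1, \dots, w_N)$ belonging to class $C_k$ ($k = 0, 1, 2$), then the Naive Bayes approach models the probability of each tweet belonging to class $k$:

$$P(C_k \mid w_1, \dots, w_N) \propto P(C_k)\, P(w_1, \dots, w_N \mid C_k) = P(C_k) \prod_{i=1}^{N} P(w_i \mid C_k)$$
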
The last equality follows from our “naive” assumption that words are conditionally independent given class. The probabilities are estimated using the frequencies of words within each class (bag of words), and we assign the class label according to which of the 3 posterior class probabilities (P(Ck|w1,··· ,wN)) is the highest.

Fit a Naive Bayes model to your data. Report the training and test error of the model. Use accuracy as the error metric. Also, report the 5 most probable words in each class, along with their counts. You might find Scikit-learn’s MultinomialNB() transformer useful. Use Laplace smoothing to prevent probabilities of zero.

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, roc_curve
import matplotlib.pyplot as plt

# Multinomial Naive Bayes on the count vectors (the default alpha = 1 is Laplace smoothing)
nb = MultinomialNB()
nb.fit(counts_train.toarray(), y_train)

# Predicted labels for the training and test splits
y_preds_train = nb.predict(counts_train.toarray())
y_preds_test = nb.predict(counts_test.toarray())

# Accuracy on each split
print('Test accuracy with simple Naive Bayes:',accuracy_score(y_test,y_preds_test))
print('Training accuracy with simple Naive Bayes:',accuracy_score(y_train,y_preds_train))

In [None]:
# Naive Bayes with a uniform class prior (fit_prior=False)
nb_1 = MultinomialNB(fit_prior=False)
# Fit nb_1 itself (default alpha = 1 has Laplace smoothing), so the
# uniform-prior model is the one that gets trained and used for prediction.
nb_1.fit(counts_train.toarray(), y_train)

# Predict on test data
y_preds_test = nb_1.predict(counts_test.toarray())

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve

# Sentiment has three classes, so one ROC curve is only defined one-vs-rest.
# This cell plots neutral (class 1) against the other two classes, scored by the
# model's predicted probability for class 1. Hard label predictions are not a
# ranking score, and roc_auc_score raises on a multiclass y_true unless a
# multi_class strategy is given, so the problem is made binary explicitly.
positive_class = 1
positive_column = list(nb.classes_).index(positive_class)
positive_scores = nb.predict_proba(counts_test)[:, positive_column]
is_positive = (y_test == positive_class).astype(int)

fpr, tpr, thresholds = roc_curve(is_positive, positive_scores)

neutral_auroc = roc_auc_score(is_positive, positive_scores)
asthma_auroc = neutral_auroc  # previous variable name, kept so existing references still work

print(thresholds)
plt.plot(fpr,tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Neutral-vs-rest Receiver Operator Characteristic (ROC), AUC: ' + str(round(neutral_auroc,5)))
plt.show()

In [None]:
# Function to get most probable word for a class
def most_probable_word_for_class(vectorizer, classifier, classlabel, n):
    """Print the ``n`` words with the highest count for one class.

    Args:
        vectorizer: fitted vectorizer exposing ``get_feature_names_out()`` or,
            on older scikit-learn releases, ``get_feature_names()``.
        classifier: fitted classifier exposing ``classes_`` and
            ``feature_count_`` (one row of per-feature counts per class).
        classlabel: a value contained in ``classifier.classes_``.
        n: number of words to print; nothing is printed when ``n <= 0``.

    Raises:
        ValueError: if ``classlabel`` is not one of ``classifier.classes_``.

    Words are printed in ascending order of count, so the most frequent word
    comes last.
    """
    if n <= 0:
        # A slice of [-0:] would select every feature instead of none.
        return
    # Translate the label into its row of feature_count_; indexing with the
    # label itself is only correct when labels happen to equal their positions.
    labelid = list(classifier.classes_).index(classlabel)
    # get_feature_names() was removed from scikit-learn; prefer the replacement.
    get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    feature_names = get_names()
    topn = sorted(zip(classifier.feature_count_[labelid], feature_names))[-n:]
    for count, feat in topn:
        print("word: ", feat, "count: ", count)

In [None]:
# Top five words per class; labels 0, 1 and 2 are reported as negative, neutral and positive
print("Five most probable words for negative sentiment")
most_probable_word_for_class(count_vec, nb, classlabel=0, n=5)

print("Five most probable words for neutral sentiment")
most_probable_word_for_class(count_vec, nb, classlabel=1, n=5)

print("Five most probable words for positive sentiment")
most_probable_word_for_class(count_vec, nb, classlabel=2, n=5)

# Part (I)

(2 pts) Would it be appropriate to fit an ROC curve in this scenario? If yes, explain why. If no, explain why not.

# Part (J)

(2 pts) Redo parts G-H using TF-IDF vectors instead of count vectors. You might find Scikit-learn’s TfidfVectorizer() transformer useful. Report the training and test accuracy. How does this compare to the accuracy using count vectors?

## (J.G)

Note the count vectorizer has already been run

TfidfVectorizer is equivalent to CountVectorizer followed by TfidfTransformer.

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
#### TF-IDF Vectorize ####

# TF-IDF re-weighting of the count matrices (idf smoothing is on by default)
tfidf = TfidfTransformer()

# Learn the idf weights on the training counts, then apply them to both splits
tfs_train = tfidf.fit(counts_train).transform(counts_train)
tfs_test = tfidf.transform(counts_test)

# Dense TF-IDF matrices used as model inputs
X_train = tfs_train.toarray()
X_test = tfs_test.toarray()


## (J.H)

In [None]:
# Multinomial Naive Bayes trained on the TF-IDF features
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Predicted labels for the training and test splits
y_preds_train = nb.predict(X_train)
y_preds_test = nb.predict(X_test)

# Accuracy on each split
print('Test accuracy with simple Naive Bayes:',accuracy_score(y_test,y_preds_test))
print('Training accuracy with simple Naive Bayes:',accuracy_score(y_train,y_preds_train))

# Part (K)

(3 pts) Recall lemmatization converts each word to its base form, which is a bit stronger than simply taking the stem. Redo parts E-H using TF-IDF vectors instead of count vectors. This time use lemmatization instead of stemming. Report train and test accuracy. How does the accuracy with lemmatization compare to the accuracy with stemming?

## (K.E)

In [None]:
#### Lemmatize ####
from nltk.stem import WordNetLemmatizer
# Fetch the WordNet corpus before creating the lemmatizer
nltk.download('wordnet')

# Lemmatizer instance used by lemmatize_tokens()
lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(data, column):
  """Replace every token in ``data[column]`` with ``lemmatizer.lemmatize(token)``, in place.

  ``lemmatizer`` is the module-level lemmatizer object defined in this notebook.
  """
  data[column] = [
      [lemmatizer.lemmatize(token) for token in token_list]
      for token_list in data[column]
  ]

In [None]:
# Lemmatize the unstemmed tokens that were saved in 'tokens_k' during Part (E)
lemmatize_tokens(train_data, column='tokens_k')
lemmatize_tokens(test_data, column='tokens_k')

# Preview five rows from each split
print(train_data['tokens_k'].head())
print(test_data['tokens_k'].head())

## (K.F)

In [None]:
# Filter the stopwords out of the lemmatized tokens in both splits
remove_stopwords(train_data, column='tokens_k')
remove_stopwords(test_data, column='tokens_k')

# Inspect the last five rows of each split
print(train_data['tokens_k'].tail())
print(test_data['tokens_k'].tail())

## (K.G)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Features are the lemmatized token lists and labels are the Sentiment values, each as a numpy array
X_train = train_data['tokens_k'].to_numpy()
y_train = train_data['Sentiment'].to_numpy()
X_test = test_data['tokens_k'].to_numpy()
y_test = test_data['Sentiment'].to_numpy()

def override_fcn(doc):
  """Return ``doc`` unchanged.

  Identity tokenizer/preprocessor for CountVectorizer; the same helper is
  already defined in the Part (G) cell and is repeated here unchanged.
  """
  return doc

# Count Vectorizer over the pre-tokenized (lemmatized) input; tokenizer and
# preprocessor are the identity function and the vocabulary is capped at 1000.
count_vec = CountVectorizer(
    analyzer='word',
    tokenizer= override_fcn,
    preprocessor= override_fcn,
    token_pattern= None,
    max_features = 1000)

# Output is a Scipy Sparse Array
counts_train = count_vec.fit_transform(X_train)
print(counts_train.toarray())
# Use the same words for the testing dataset
counts_test = count_vec.transform(X_test)
print(counts_test.toarray())

# Print the names of each of the features (1000 total).
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(count_vec, 'get_feature_names_out'):
  feature_names = count_vec.get_feature_names_out()
else:
  feature_names = count_vec.get_feature_names()
# Convert to plain str so the output is the same list of words on either API
print([str(name) for name in feature_names])
# Print this mapping as dictionary
print(count_vec.vocabulary_)

## Feature index assigned to the token 'great'
print('\nGreat is located at row: ',count_vec.vocabulary_['great'])

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
#### TF-IDF Vectorize ####

# TF-IDF re-weighting of the lemmatized count matrices (idf smoothing is on by default)
tfidf = TfidfTransformer()

# Learn the idf weights on the training counts, then apply them to both splits
tfs_train = tfidf.fit(counts_train).transform(counts_train)
tfs_test = tfidf.transform(counts_test)

# Dense TF-IDF matrices used as model inputs
X_train = tfs_train.toarray()
X_test = tfs_test.toarray()


## (K.H)

In [None]:
# Multinomial Naive Bayes trained on the TF-IDF features of the lemmatized tokens
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Predicted labels for the training and test splits
y_preds_train = nb.predict(X_train)
y_preds_test = nb.predict(X_test)

# Accuracy on each split
print('Test accuracy with simple Naive Bayes:',accuracy_score(y_test,y_preds_test))
print('Training accuracy with simple Naive Bayes:',accuracy_score(y_train,y_preds_train))