<a href="https://colab.research.google.com/github/JanisJ2/jsc270-a4/blob/main/JSC270_Assignment_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Github link: https://github.com/JanisJ2/jsc270-a4 \
Group member:
- Christoffer Tan (1008740445)
- Janis Joplin (10097515051)


In [1]:
# Import necessary modules
from google.colab import files
import io
import sys
import re

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, roc_curve

from scipy.cluster.hierarchy import dendrogram, linkage

import seaborn as sns

import nltk
nltk.download('punkt')  # Tokenizer
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
# Read the dataset from GitHub
!wget "https://raw.githubusercontent.com/JanisJ2/jsc270-a4/main/covid-tweets-train.csv"
!wget "https://raw.githubusercontent.com/JanisJ2/jsc270-a4/main/covid-tweets-test.csv"

--2024-03-30 19:37:46--  https://raw.githubusercontent.com/JanisJ2/jsc270-a4/main/covid-tweets-train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8991364 (8.6M) [text/plain]
Saving to: ‘covid-tweets-train.csv’


2024-03-30 19:37:46 (52.6 MB/s) - ‘covid-tweets-train.csv’ saved [8991364/8991364]

--2024-03-30 19:37:46--  https://raw.githubusercontent.com/JanisJ2/jsc270-a4/main/covid-tweets-test.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 866735 (846K) [text/plain]
Saving to: ‘covid-tweets-test.csv’


2024-03-30 19:37:

In [3]:
# Read the train data from CSV file
train_data = pd.read_csv('covid-tweets-train.csv')
train_data.columns = ['Label', 'Message', 'Sentiment']
train_data.head()

Unnamed: 0,Label,Message,Sentiment
0,0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,1
1,1,advice Talk to your neighbours family to excha...,2
2,2,Coronavirus Australia: Woolworths to give elde...,2
3,3,My food stock is not the only one which is emp...,2
4,4,"Me, ready to go at supermarket during the #COV...",0


In [4]:
def filter_missing_values(data):
    data.dropna(inplace=True)

In [5]:
# Clean missing values from train_data
filter_missing_values(train_data)
# Take only rows with sentiments either 0, 1, or 2
train_data = train_data[train_data['Sentiment'].isin(['0', '1', '2'])]

# Part I

## a)

In [6]:
# Given the a data set, it will return the proportion of the observations belong the a sentiment type
def proportion_of_sentiment(data, sentiment_type):
    return (data['Sentiment'] == sentiment_type).sum() / len(data['Sentiment'])

In [7]:
for i in range(3):
    print(f"The proportion of the train observations belonging to sentiment type {i}: {proportion_of_sentiment(train_data, str(i))}")

The proportion of the train observations belonging to sentiment type 0: 0.37415858666861074
The proportion of the train observations belonging to sentiment type 1: 0.18740735340574954
The proportion of the train observations belonging to sentiment type 2: 0.4384340599256397


## b)

In [8]:
def tokenize(data):
    data['Tokens'] = data['Message'].apply(nltk.word_tokenize)

In [9]:
tokenize(train_data)
train_data['Tokens'].head()

0    [@, MeNyrbie, @, Phil_Gahan, @, Chrisitv, http...
1    [advice, Talk, to, your, neighbours, family, t...
2    [Coronavirus, Australia, :, Woolworths, to, gi...
3    [My, food, stock, is, not, the, only, one, whi...
4    [Me, ,, ready, to, go, at, supermarket, during...
Name: Tokens, dtype: object

## c)

In [10]:
def remove_url(data):
    tokens_no_url = []

    # for the tokens of each row, remove all occurences of url
    for row in data['Tokens']:
        tokens_no_url.append([re.sub('^http', '', t) for t in row])

    # Replace our tokens with the url-removed version
    data['Tokens'] = tokens_no_url

In [11]:
remove_url(train_data)
train_data['Tokens'].head()

0    [@, MeNyrbie, @, Phil_Gahan, @, Chrisitv, s, :...
1    [advice, Talk, to, your, neighbours, family, t...
2    [Coronavirus, Australia, :, Woolworths, to, gi...
3    [My, food, stock, is, not, the, only, one, whi...
4    [Me, ,, ready, to, go, at, supermarket, during...
Name: Tokens, dtype: object

## d)

In [12]:
def remove_punctuation(data):
    tokens_no_punct = []
    # For the tokens of each row, remove all occurrences of punctuations (i.e. non-alphanumeric and non-whitespace)
    for row in data['Tokens']:
        tokens_no_punct.append([re.sub('[^\w\s]', '', t) for t in row])
    # Replace our tokens with the punctuation-removed version
    data['Tokens'] = tokens_no_punct

def convert_to_lowercase(data):
    lowercase_tokens = []
    # For the tokens of each row, convert all strings to lowercase
    for row in data['Tokens']:
        lowercase_tokens.append([t.lower() for t in row])
    # Replace our tokens with the lowercase version
    data['Tokens'] = lowercase_tokens

In [13]:
remove_punctuation(train_data)
convert_to_lowercase(train_data)
train_data['Tokens'].head()

0    [, menyrbie, , phil_gahan, , chrisitv, s, , tc...
1    [advice, talk, to, your, neighbours, family, t...
2    [coronavirus, australia, , woolworths, to, giv...
3    [my, food, stock, is, not, the, only, one, whi...
4    [me, , ready, to, go, at, supermarket, during,...
Name: Tokens, dtype: object

## e)

In [14]:
def stemming_tokens(data):
    ### Stemming our dataset using PorterStemmer
    stemmer = PorterStemmer()

    stemmed_tokens = []
    for row in data['Tokens']:
      stemmed_tokens.append([stemmer.stem(t) for t in row])

    data['stemmed_tokens'] = stemmed_tokens

In [15]:
stemming_tokens(train_data)
train_data['stemmed_tokens'].head()

0    [, menyrbi, , phil_gahan, , chrisitv, s, , tco...
1    [advic, talk, to, your, neighbour, famili, to,...
2    [coronaviru, australia, , woolworth, to, give,...
3    [my, food, stock, is, not, the, onli, one, whi...
4    [me, , readi, to, go, at, supermarket, dure, t...
Name: stemmed_tokens, dtype: object

## f)

In [16]:
def remove_stopwords(data, col):
    sw = stopwords.words('english')[:100]
    tokens_no_sw = []
    # For the tokens of each row, remove all occurrences of stopwords
    for row in data[col]:
        tokens_no_sw.append([w for w in row if w not in sw])
    # Replace our tokens with the stopwords-removed version
    data[col] = tokens_no_sw

In [17]:
remove_stopwords(train_data, 'stemmed_tokens')
train_data['Tokens'].head()

0    [, menyrbie, , phil_gahan, , chrisitv, s, , tc...
1    [advice, talk, to, your, neighbours, family, t...
2    [coronavirus, australia, , woolworths, to, giv...
3    [my, food, stock, is, not, the, only, one, whi...
4    [me, , ready, to, go, at, supermarket, during,...
Name: Tokens, dtype: object

In [18]:
# Remove empty strings
def remove_empty_strings(data, col):
    data[col] = [[x for x in lst if x != ''] for lst in data[col]]

In [19]:
remove_empty_strings(train_data, 'stemmed_tokens')

## g)

In [20]:
def override_fcn(doc):
  # We expect a list of tokens as input
  return doc

# Count Vectorizer for training data
count_vec_train = CountVectorizer(
    analyzer='word',
    tokenizer= override_fcn,
    preprocessor= override_fcn,
    token_pattern= None,
    # max_features= 2000
    )

In [21]:
def split_tokens_and_label(data, col):
    return data[col].to_numpy(), data['Sentiment'].to_numpy()

def convert_to_vector_fit(X):
    # Remember this output is a Scipy Sparse Array
    counts = count_vec_train.fit_transform(X)
    # print(counts.toarray())

    # # Print this mapping as dictionary
    # print(count_vec_train.vocabulary_)

    # Print the length of the vocabulary
    print(f'The length of the vocabulary is {len(count_vec_train.vocabulary_)}')
    return counts

def convert_to_vector_transform(X):
    # Remember this output is a Scipy Sparse Array
    counts = count_vec_train.transform(X)
    # print(counts.toarray())

    # # Print this mapping as dictionary
    # print(count_vec_train.vocabulary_)

    # Print the length of the vocabulary
    print(f'The length of the vocabulary is {len(count_vec_train.vocabulary_)}')
    return counts

In [22]:
X_train, y_train = split_tokens_and_label(train_data, 'stemmed_tokens')
counts = convert_to_vector_fit(X_train)
X_train = counts

The length of the vocabulary is 74221


## h) `train_data`


In [23]:
def fit_naive_bayes_model(X_data, y_data):
    nb = MultinomialNB()
    # Fit model to the data
    nb.fit(X_data, y_data)
    return nb

In [24]:
nb_train = fit_naive_bayes_model(X_train, y_train)
y_train_preds = nb_train.predict(X_train)
print(f'Test accuracy with simple Naive Bayes on training data:', accuracy_score(y_train, y_train_preds))
# predict(X_train, y_train, 'training')

Test accuracy with simple Naive Bayes on training data: 0.8200529756263517


In [25]:
# Report the 5 most probable words in each class, along with their counts.
num_top_words = 5
# Get the probabilities of each word given each class
word_probs = nb_train.feature_log_prob_  # Log probabilities of features given a class

# Reverse the mapping from feature indices to words
feature_names = count_vec_train.get_feature_names_out()

# Get the top 5 most probable words for each class
sentiments = ['Negative', 'Neutral', 'Positive']
num_top_words = 5
for i, class_probs in enumerate(word_probs):
    print(f"Class {i} ({sentiments[i]}):")
    top_word_indices = class_probs.argsort()[-num_top_words:][::-1]  # Indices of top words
    for idx in top_word_indices:
        word = feature_names[idx]
        count = count_vec_train.vocabulary_[word]  # Convert log probability back to count
        print(f"   {word}: {count}")

Class 0 (Negative):
   s: 38119
   coronaviru: 11219
   covid19: 11686
   price: 35084
   food: 17726
Class 1 (Neutral):
   s: 38119
   coronaviru: 11219
   covid19: 11686
   store: 42147
   supermarket: 42621
Class 2 (Positive):
   s: 38119
   coronaviru: 11219
   covid19: 11686
   store: 42147
   thi: 67860


## h) `test_data`

In [26]:
# Read the test data from CSV file
test_data = pd.read_csv('covid-tweets-test.csv')
test_data.columns = ['Label', 'Message', 'Sentiment']
test_data["Sentiment"] = test_data["Sentiment"].astype(str)
# Test our model by doing the same analysis to test_data
filter_missing_values(test_data)
tokenize(test_data)
remove_url(test_data)
remove_punctuation(test_data)
convert_to_lowercase(test_data)
stemming_tokens(test_data)
remove_stopwords(test_data, 'stemmed_tokens')
remove_empty_strings(test_data, 'stemmed_tokens')
X_test, y_test = split_tokens_and_label(test_data, 'stemmed_tokens')
X_test = convert_to_vector_transform(X_test)
# predict(X_test, y_test, 'test')
# nb_test = fit_naive_bayes_model(X_test, y_test)
y_test_preds = nb_train.predict(X_test)
print(f'Test accuracy with simple Naive Bayes on test data:', accuracy_score(y_test, y_test_preds))

The length of the vocabulary is 74221
Test accuracy with simple Naive Bayes on test data: 0.6695629278567667


# j)

In [27]:
def tfidf_transformer(X):
    tfidf = TfidfTransformer()

    tfs = tfidf.fit_transform(X);

    return tfs.toarray()

In [28]:
### Build the model based on train data ###
X_train = tfidf_transformer(counts)

nb_train = fit_naive_bayes_model(X_train, y_train)
y_train_preds = nb_train.predict(X_train)
print(f'Test accuracy with simple Naive Bayes on training data:', accuracy_score(y_train, y_train_preds))

Test accuracy with simple Naive Bayes on training data: 0.7245510437170422


In [29]:
### Train data ###
y_test_preds = nb_train.predict(X_test)
print(f'Test accuracy with simple Naive Bayes on test data:', accuracy_score(y_test, y_test_preds))

Test accuracy with simple Naive Bayes on test data: 0.6332280147446024


# k)

In [30]:
def lemmatize_tokens(data):
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = []
    for row in data['Tokens']:
        lemmatized_tokens.append([lemmatizer.lemmatize(t) for t in row])

    data['lemmatized_tokens'] = lemmatized_tokens

In [31]:
### Train data ###
lemmatize_tokens(train_data)
remove_stopwords(train_data, 'lemmatized_tokens')
remove_empty_strings(train_data, 'lemmatized_tokens')
X_train, y_train = split_tokens_and_label(train_data, 'lemmatized_tokens')
counts = convert_to_vector_fit(X_train)
X_train = counts

nb_train = fit_naive_bayes_model(X_train, y_train)
y_train_preds = nb_train.predict(X_train)
print(f'Test accuracy with simple Naive Bayes on training data:', accuracy_score(y_train, y_train_preds))

The length of the vocabulary is 80978
Test accuracy with simple Naive Bayes on training data: 0.8345848217540278


In [32]:
### Test data ###
lemmatize_tokens(test_data)
remove_stopwords(test_data, 'lemmatized_tokens')
remove_empty_strings(test_data, 'lemmatized_tokens')
X_test, y_test = split_tokens_and_label(test_data, 'lemmatized_tokens')
X_test = convert_to_vector_transform(X_test)

y_test_preds = nb_train.predict(X_test)
print(f'Test accuracy with simple Naive Bayes on test data:', accuracy_score(y_test, y_test_preds))

The length of the vocabulary is 80978
Test accuracy with simple Naive Bayes on test data: 0.6727224855186941


In [33]:
# Report the 5 most probable words in each class, along with their counts.
num_top_words = 5
# Get the probabilities of each word given each class
word_probs = nb_train.feature_log_prob_  # Log probabilities of features given a class

# Reverse the mapping from feature indices to words
feature_names = count_vec_train.get_feature_names_out()

# Get the top 5 most probable words for each class
sentiments = ['Negative', 'Neutral', 'Positive']
num_top_words = 5
for i, class_probs in enumerate(word_probs):
    print(f"Class {i} ({sentiments[i]}):")
    top_word_indices = class_probs.argsort()[-num_top_words:][::-1]  # Indices of top words
    for idx in top_word_indices:
        word = feature_names[idx]
        count = count_vec_train.vocabulary_[word]  # Convert log probability back to count
        print(f"   {word}: {count}")

Class 0 (Negative):
   s: 43440
   coronavirus: 12545
   covid19: 13065
   price: 39670
   food: 20290
Class 1 (Neutral):
   s: 43440
   coronavirus: 12545
   covid19: 13065
   store: 48023
   supermarket: 48602
Class 2 (Positive):
   s: 43440
   coronavirus: 12545
   covid19: 13065
   store: 48023
   supermarket: 48602


# Part II

## Preparing Data

In [34]:
!wget "https://raw.githubusercontent.com/JanisJ2/jsc270-a4/main/tweets.csv"

--2024-03-30 19:39:41--  https://raw.githubusercontent.com/JanisJ2/jsc270-a4/main/tweets.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1912231 (1.8M) [text/plain]
Saving to: ‘tweets.csv’


2024-03-30 19:39:41 (30.9 MB/s) - ‘tweets.csv’ saved [1912231/1912231]



In [35]:
# Read the data form CSV file
df = pd.read_csv('tweets.csv', index_col = 0)

In [36]:
df.head()

Unnamed: 0,tweet_text,tweet_favourite_count,tweet_created_at,tweet_retweet_count,user_statuses_count,user_screen_name,user_followers_count
0,The priority for the city should be on providi...,0,2023-03-27 21:01:36+00:00,0,10194,EmergencyAgent,1145
1,"Like it or not, this kind of clip is the sort ...",0,2023-03-27 21:01:26+00:00,0,2843,EDenhoff,4630
2,On a scale of moderately conservative (1) to r...,0,2023-03-27 21:01:16+00:00,0,13608,dzoolander85,11234
3,Keep up with the latest politics with The Dail...,0,2023-03-27 21:01:06+00:00,0,3610,VassKapelosShow,4901
4,So much resemblance to NAZI Germany leading up...,0,2023-03-27 21:01:05+00:00,0,108226,marshiehilgs,515


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   tweet_text             10000 non-null  object
 1   tweet_favourite_count  10000 non-null  int64 
 2   tweet_created_at       10000 non-null  object
 3   tweet_retweet_count    10000 non-null  int64 
 4   user_statuses_count    10000 non-null  int64 
 5   user_screen_name       10000 non-null  object
 6   user_followers_count   10000 non-null  int64 
dtypes: int64(4), object(3)
memory usage: 883.0+ KB


## Data Cleaning

In [38]:
# Check if there are any negative values in numeric columns
has_negative = (df['tweet_favourite_count'] < 0).any() and (df['tweet_retweet_count'] < 0).any() and \
 (df['user_statuses_count'] < 0).any() and(df['user_followers_count'] < 0).any()

if (has_negative): print("There's at least one negative value in the dataset.")
else: print("There's no negative value in the dataset.")

There's no negative value in the dataset.


## Text Pre-Processing 1

1. Tokenize, punctutation, lowercase
2. Remove stop words
3. Bigrams

## Exploratory Data Analysis (EDA)

1. Word cloud
2. Print the 10 most frequent words

## Text Pre-Processing 2

1. Lemmatize tokens
6. Create dictionary, corpus, and term document frequency list

## Language Dirichlet Allocation (LDA)

1. Tuning hyperparameters (search for the best number of topics): use validation data set
2. LDA setup (using training set)
3. Visualizing topics (pyLDAvis)

### Performance Evaluation

1. Eyeball
2. Weird technique

## Playground

In [39]:
import spacy

# Load the English language model
nlp = spacy.load("en_core_web_sm")

def compute_relevance(word, sentence):
    # Process the sentence
    doc = nlp(sentence)
    return doc.similarity(nlp(word))
    # Compute relevance score for each token in the sentence
    # relevance_scores = []
    # for token in doc:
    #     relevance_score = token.similarity(nlp(word))
    #     relevance_scores.append(relevance_score)

    # # Calculate the average relevance score
    # average_relevance_score = sum(relevance_scores) / len(relevance_scores)

    # return average_relevance_score

sentence = "Okay. Hello, this is X insurance calling regarding a liability decision as well as to verify if there were any injuries in your insurance vehicle. And to confirm if the policy is active on the date of loss for the claim number XXXXXXXX. The claim number is XXXXXXXX. The adjuster could be reached at XXX XXX XXXX. Thank you. And have a nice day"
word = "Other insurance companies"
relevance_score = compute_relevance(word, sentence)
print(relevance_score)

0.2907108899342669


  return doc.similarity(nlp(word))


# Back-up

In [40]:
# bearer_token="AAAAAAAAAAAAAAAAAAAAABjFsgEAAAAAlpTaLAR1wJ2sut2HNk8oY2r9u28%3DRWpfy37hyg15PEYEQJAQwhjS9S3RasSkc1WV35KH1FhBPfAgjF"


# import tweepy as tw

# client = tw.Client(bearer_token=bearer_token)


# # ONLY RUN THIS CELL ONCE IN LAB SO AVOID GOING OVER THE RATE LIMIT FOR THIS ACCOUNT!

# #Collect tweets (here, I get only 20)
# search_words = '#toronto'

# response = client.search_recent_tweets(search_words, max_results=100)
# tweets = response.data


# print(f'Number of tweets: {len(tweets)}')

# # The result is an iterable
# for tweet in tweets:
#   print(tweet.text)

# # Could also use a list comprehension


# tweet_list = [tweet.text for tweet in tweets]
# tweet_text = pd.DataFrame(tweet_list, columns = ['tweet'])

# print(tweet_text.head(5))


# # Extract handles
# handle_regex = '@[A-Za-z|0-9]+'

# tweet_text['handles'] = tweet_text['tweet'].str.findall(handle_regex)
# print(tweet_text.head(10))


# # Define the file path to save the DataFrame
# file_path = 'twitter_data.txt'

# # Write the DataFrame to a text file
# tweet_text.to_csv(file_path, sep='\t', index=False)

# print(f"DataFrame has been successfully written to '{file_path}'.")