In [19]:
import pandas as pd

import spacy
from scispacy.abbreviation import AbbreviationDetector

from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

import html
import emoji
from googletrans import Translator

from pathlib import Path
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from database import database_manager as dbm

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/raimuu/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


**Extract**

In [20]:
file_path = "../../data/raw/COVIDSenti.zip"
df = pd.read_csv(file_path, compression='zip')
new_df = df.head()
new_df.to_csv("testing")
display(df)

Unnamed: 0,tweet,label
0,Coronavirus | Human Coronavirus Types | CDC ht...,neu
1,"@shehryar_taseer That‚Äôs üíØ true , \nCorona...",neu
2,"TLDR: Not SARS, possibly new coronavirus. Diff...",neg
3,Disease outbreak news from the WHO: Middle Eas...,neu
4,China - Media: WSJ says sources tell them myst...,neu
...,...,...
89995,@C_Racing48 The flu has a 2% death rate.. the ...,neu
89996,@realDonaldTrump We already know that but you‚...,neg
89997,First coronavirus case reported in St. Joseph ...,neu
89998,"If you ate ants when you were a child, you‚Äôr...",neu


**Transform**

In [21]:
# remove unwanted columns
columns_to_rename = {"tweet": "text", "label": "labelled_sentiment"}
df = df.rename(columns=columns_to_rename)
display(df)


Unnamed: 0,text,labelled_sentiment
0,Coronavirus | Human Coronavirus Types | CDC ht...,neu
1,"@shehryar_taseer That‚Äôs üíØ true , \nCorona...",neu
2,"TLDR: Not SARS, possibly new coronavirus. Diff...",neg
3,Disease outbreak news from the WHO: Middle Eas...,neu
4,China - Media: WSJ says sources tell them myst...,neu
...,...,...
89995,@C_Racing48 The flu has a 2% death rate.. the ...,neu
89996,@realDonaldTrump We already know that but you‚...,neg
89997,First coronavirus case reported in St. Joseph ...,neu
89998,"If you ate ants when you were a child, you‚Äôr...",neu


In [22]:

def unescape_html(text): 
    return html.unescape(text)

df_clean = df.dropna().copy() # drop all rows containing null

df_clean['text'] = (
    df_clean['text']
    .str.replace(r'http\S+|www.\S+', '', regex=True) # remove links
    .str.replace(r'[\n\t]', ' ', regex=True) # remove \t\n
    .apply(unescape_html) # escape html formatting
    .apply(emoji.demojize) # replace emojis with their symbolic name
    .str.replace(r'[^\w\s.,!?;:\-()\'"/&]', '', regex=True) # remove non-alphanumeric characters (symbols not in this set)
    .str.replace(":", " ")
    .str.replace("_", " ")
)

# convert columns to appropriate data types
df_clean = df_clean.reset_index(drop=True)
display(df)


Unnamed: 0,text,labelled_sentiment
0,Coronavirus | Human Coronavirus Types | CDC ht...,neu
1,"@shehryar_taseer That‚Äôs üíØ true , \nCorona...",neu
2,"TLDR: Not SARS, possibly new coronavirus. Diff...",neg
3,Disease outbreak news from the WHO: Middle Eas...,neu
4,China - Media: WSJ says sources tell them myst...,neu
...,...,...
89995,@C_Racing48 The flu has a 2% death rate.. the ...,neu
89996,@realDonaldTrump We already know that but you‚...,neg
89997,First coronavirus case reported in St. Joseph ...,neu
89998,"If you ate ants when you were a child, you‚Äôr...",neu


In [23]:
# convert numeric_labelled_sentiment to labeleld_sentiment
def normalize_sent(sent):
    if sent == "pos":
        return "positive"
    elif sent == "neg":
        return "negative"
    else:
        return "neutral"

df_clean['labelled_sentiment'] = df_clean['labelled_sentiment'].apply(normalize_sent)
display(df_clean)

Unnamed: 0,text,labelled_sentiment
0,Coronavirus Human Coronavirus Types CDC,neutral
1,"shehryar taseer ThatÄôs üíØ true , Corona vir...",neutral
2,"TLDR Not SARS, possibly new coronavirus. Diff...",negative
3,Disease outbreak news from the WHO Middle Eas...,neutral
4,China - Media WSJ says sources tell them myst...,neutral
...,...,...
89995,C Racing48 The flu has a 2 death rate.. the co...,neutral
89996,realDonaldTrump We already know that but youÄô...,negative
89997,First coronavirus case reported in St. Joseph ...,neutral
89998,"If you ate ants when you were a child, youÄôre...",neutral


In [24]:
# accent_regex = r'[^\x00-\x7F]'
# df_clean = df_clean[~df_clean['text'].str.contains(accent_regex, regex=True)]
# df_clean = df_clean.reset_index()
# display(df_clean)

**Load**

In [None]:
%%script True
newTable = "manual_processed_labelled_twitter_data"
# dbm.create_table(table_name=newTable, dataframe=df_clean, replace=True)

query = f"""
    SELECT * FROM "{newTable}"
"""
df = dbm.query_db(query)
display(df)

The table 'manual_processed_labelled_twitter_data' already exists. Replacing entries.


Unnamed: 0,id,text,labelled_sentiment
0,0,Coronavirus Human Coronavirus Types CDC,neutral
1,1,"shehryar taseer ThatÄôs üíØ true , Corona vir...",neutral
2,2,"TLDR Not SARS, possibly new coronavirus. Diff...",negative
3,3,Disease outbreak news from the WHO Middle Eas...,neutral
4,4,China - Media WSJ says sources tell them myst...,neutral
...,...,...,...
89995,89995,C Racing48 The flu has a 2 death rate.. the co...,neutral
89996,89996,realDonaldTrump We already know that but youÄô...,negative
89997,89997,First coronavirus case reported in St. Joseph ...,neutral
89998,89998,"If you ate ants when you were a child, youÄôre...",neutral


In [None]:
%%script True

from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score

newTable = "manual_labelled_sentiment_scores"


'''
VADER
'''

def classify_sentiment(compound):
    if compound >= 0.05:
        return 'positive'
    elif compound <= -0.05:
        return 'negative'
    else:
        return 'neutral'

def accuracy_metrics(df):
    # Accuracy calculation
    accuracy = accuracy_score(df['labelled_sentiment'], df['sentiment'])
    
    # Precision, recall, and f1 score for each sentiment class
    precision = precision_score(df['labelled_sentiment'], df['sentiment'], average=None, labels=['positive', 'negative', 'neutral'])
    recall = recall_score(df['labelled_sentiment'], df['sentiment'], average=None, labels=['positive', 'negative', 'neutral'])
    f1 = f1_score(df['labelled_sentiment'], df['sentiment'], average=None, labels=['positive', 'negative', 'neutral'])
    
    return accuracy, precision, recall, f1

query = f"""
    SELECT * FROM "{newTable}"
"""
df1 = dbm.query_db(query)
newTable = "manual_processed_labelled_twitter_data"

query = f"""
    SELECT * FROM "{newTable}"
"""
df2 = dbm.query_db(query)

df = df1.merge(df2, on='id', how='inner')

df['sentiment'] = df['compound'].apply(classify_sentiment)

accent_regex = r'[^\x00-\x7F]'
df = df[~df['text'].str.contains(accent_regex, regex=True)]
                                       
display(df)
df.head(100).to_csv("testing.csv")


accuracy, precision, recall, f1 = accuracy_metrics(df)

    # st.write("### Accuracy Metrics")
    # st.write("The accuracy metrics below show the performance of the Vader sentiment analysis.")
    # st.write(f"**Accuracy Positive:** {accuracy[0]:.2f}")
    # st.write(f"**Accuracy Negative:** {accuracy[1]:.2f}")
    # st.write(f"**Accuracy Neutral:** {accuracy[2]:.2f}")

print(f"**Precision Positive:** {precision[0]}")
print(f"**Precision Negative:** {precision[1]}")
print(f"**Precision Neutral:** {precision[2]}")
print("")
print(f"**Recall Positive:** {recall[0]}")
print(f"**Recall Negative:** {recall[1]}")
print(f"**Recall Neutral:** {recall[2]}")
print("")
print(f"**F1 Score Positive:** {f1[0]}")
print(f"**F1 Score Negative:** {f1[1]}")
print(f"**F1 Score Neutral:** {f1[2]}")


Unnamed: 0,id,neg,neu,pos,compound,tweet,text,labelled_sentiment,sentiment,text_blob_sentiment
0,0,0.000,1.000,0.000,0.0000,Coronavirus Human Coronavirus Types CDC,Coronavirus Human Coronavirus Types CDC,neutral,neutral,0.000000
6,6,0.000,0.777,0.223,0.3182,Virologists weigh in on novel coronavirus in C...,Virologists weigh in on novel coronavirus in C...,neutral,neutral,0.000000
9,9,0.262,0.738,0.000,-0.4939,Chinese report says mysterious illnesses may b...,Chinese report says mysterious illnesses may b...,neutral,neutral,0.045455
10,10,0.107,0.893,0.000,-0.0516,China identifies new strain of coronavirus as ...,China identifies new strain of coronavirus as ...,neutral,positive,0.136364
11,11,0.115,0.676,0.209,0.3382,I always feel weird hoping for another coronav...,I always feel weird hoping for another coronav...,negative,negative,-0.625000
...,...,...,...,...,...,...,...,...,...,...
89985,89985,0.115,0.815,0.071,-0.2944,actually im kinda ok with not seeing them bc c...,actually im kinda ok with not seeing them bc c...,neutral,neutral,0.033333
89986,89986,0.000,1.000,0.000,0.0000,This corona virus is now officially cramping m...,This corona virus is now officially cramping m...,neutral,neutral,0.000000
89988,89988,0.000,1.000,0.000,0.0000,"""Coronavirus WHO classifies COVID-19 as a pan...","""Coronavirus WHO classifies COVID-19 as a pan...",neutral,neutral,0.000000
89997,89997,0.000,1.000,0.000,0.0000,First coronavirus case reported in St. Joseph ...,First coronavirus case reported in St. Joseph ...,neutral,positive,0.116667


**Precision Positive:** 0.21447838448640374
**Precision Negative:** 0.9961201322029027
**Precision Neutral:** 0.9777547256800369

**Recall Positive:** 0.9859249329758714
**Recall Negative:** 0.9251301214466836
**Recall Neutral:** 0.7024870952604411

**F1 Score Positive:** 0.3523142326806778
**F1 Score Negative:** 0.9593135898145585
**F1 Score Neutral:** 0.817572885711991


In [12]:
# %%script True

"""
Fills manual_labelled_sentiment_scores tables
"""

from textblob import TextBlob
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from sklearn.utils import resample

import pandas as pd

from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

from spacy import load

from pathlib import Path
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from database import database_manager as dbm

nlp = load("en_core_web_sm")
analyzer = SentimentIntensityAnalyzer()

def vader_sentiment(text):
    score = analyzer.polarity_scores(text)
    return score['compound']

def textblob_sentiment(text):
    blob = TextBlob(text)
    return blob.sentiment.polarity

def classify_sentiment(compound):
    if compound < 0:
        return 'negative'
    elif compound == 0:
        return 'neutral'
    else:
        return 'positive'
    
def accuracy_metrics(df):
    accuracy = accuracy_score(df['labelled_sentiment'], df['sentiment'])
    
    precision = precision_score(df['labelled_sentiment'], df['sentiment'], average=None, labels=['positive', 'negative', 'neutral'])
    recall = recall_score(df['labelled_sentiment'], df['sentiment'], average=None, labels=['positive', 'negative', 'neutral'])
    f1 = f1_score(df['labelled_sentiment'], df['sentiment'], average=None, labels=['positive', 'negative', 'neutral'])
    
    return accuracy, precision, recall, f1

def balance_classes(df, target_column):
    # separate the dataset by class
    positive = df[df[target_column] == 'positive']
    negative = df[df[target_column] == 'negative']
    neutral = df[df[target_column] == 'neutral']
    
    # find the majority class size
    max_class_size = max(len(positive), len(negative), len(neutral))
    
    # oversample or undersample each class to match the majority size
    positive_balanced = resample(positive, replace=True, n_samples=max_class_size, random_state=42)
    negative_balanced = resample(negative, replace=True, n_samples=max_class_size, random_state=42)
    neutral_balanced = resample(neutral, replace=True, n_samples=max_class_size, random_state=42)
    
    df_balanced = pd.concat([positive_balanced, negative_balanced, neutral_balanced])
    return df_balanced

read_table = "manual_processed_labelled_twitter_data"

query = f"""
    SELECT * FROM "{read_table}"
"""
df = dbm.query_db(query)
display(df)



# balance the dataset before evaluation
df = balance_classes(df, 'labelled_sentiment')

# shuffle the balanced dataset to avoid ordering issues
df = df.sample(frac=1, random_state=42)
df.reset_index()
display(df)


sentiment_counts = df['labelled_sentiment'].value_counts()
positive_count = sentiment_counts.get('positive', 0)
negative_count = sentiment_counts.get('negative', 0)
neutral_count = sentiment_counts.get('neutral', 0)

print(f"Positive: {positive_count}")
print(f"Negative: {negative_count}")
print(f"Neutral: {neutral_count}")



# accent_regex = r'[^\x00-\x7F]'
# df = df[~df['text'].str.contains(accent_regex, regex=True)]
# df = df.reset_index()

df['compound'] = df['text'].apply(textblob_sentiment)
df['sentiment'] = df['compound'].apply(classify_sentiment)

display(df)
# df['tweet'] = df['text']

accuracy, precision, recall, f1 = accuracy_metrics(df)

print(f"**Overall Accuracy:** {accuracy}")

print(f"**Precision Positive:** {precision[0]}")
print(f"**Precision Negative:** {precision[1]}")
print(f"**Precision Neutral:** {precision[2]}")
print("")
print(f"**Recall Positive:** {recall[0]}")
print(f"**Recall Negative:** {recall[1]}")
print(f"**Recall Neutral:** {recall[2]}")
print("")
print(f"**F1 Score Positive:** {f1[0]}")
print(f"**F1 Score Negative:** {f1[1]}")
print(f"**F1 Score Neutral:** {f1[2]}")

df = df.drop(["id"], axis=1)
display(df)

# newTable = "manual_labelled_sentiment_scores" # Before oversampling dataset
newTable = "textblob_labelled_sentiment_scores"

# dbm.create_table(table_name=newTable, dataframe=df, replace=True)



Unnamed: 0,id,text,labelled_sentiment
0,0,Coronavirus Human Coronavirus Types CDC,neutral
1,1,"shehryar taseer ThatÄôs üíØ true , Corona vir...",neutral
2,2,"TLDR Not SARS, possibly new coronavirus. Diff...",negative
3,3,Disease outbreak news from the WHO Middle Eas...,neutral
4,4,China - Media WSJ says sources tell them myst...,neutral
...,...,...,...
89995,89995,C Racing48 The flu has a 2 death rate.. the co...,neutral
89996,89996,realDonaldTrump We already know that but youÄô...,negative
89997,89997,First coronavirus case reported in St. Joseph ...,neutral
89998,89998,"If you ate ants when you were a child, youÄôre...",neutral


Unnamed: 0,id,text,labelled_sentiment
76644,76632,My latest column ÄúTake precautions to minimi...,positive
70481,70465,Italy to lock down Milan region in bid to cont...,negative
62675,62659,Coronavirus Updates As Cases Climb and Stocks...,neutral
76052,76042,JoeyDrew10 cara doxyluv jeremynewberger Influe...,negative
9583,9573,SE Asia Stocks-Extend losses as coronavirus fe...,positive
...,...,...,...
83031,83024,mitchellvii The Signs and Symptoms of the Coro...,negative
1897,1896,Bro the coronavirus is killing people at an al...,negative
32827,32796,Scrimmzox Bro it's the funniest fucking thing ...,negative
5129,5126,The latest The Umberto Cammarata Daily! coron...,neutral


Positive: 67385
Negative: 67385
Neutral: 67385


Unnamed: 0,id,text,labelled_sentiment,compound,sentiment
76644,76632,My latest column ÄúTake precautions to minimi...,positive,0.0000,neutral
70481,70465,Italy to lock down Milan region in bid to cont...,negative,0.0000,neutral
62675,62659,Coronavirus Updates As Cases Climb and Stocks...,neutral,0.0258,positive
76052,76042,JoeyDrew10 cara doxyluv jeremynewberger Influe...,negative,0.0000,neutral
9583,9573,SE Asia Stocks-Extend losses as coronavirus fe...,positive,-0.6705,negative
...,...,...,...,...,...
83031,83024,mitchellvii The Signs and Symptoms of the Coro...,negative,0.0000,neutral
1897,1896,Bro the coronavirus is killing people at an al...,negative,-0.7096,negative
32827,32796,Scrimmzox Bro it's the funniest fucking thing ...,negative,-0.4854,negative
5129,5126,The latest The Umberto Cammarata Daily! coron...,neutral,0.0000,neutral


**Overall Accuracy:** 0.5470257970369271
**Precision Positive:** 0.5857432280333158
**Precision Negative:** 0.5370230379338976
**Precision Neutral:** 0.5146916665146735

**Recall Positive:** 0.5792238628774949
**Recall Negative:** 0.6430808043333086
**Recall Neutral:** 0.4187727238999777

**F1 Score Positive:** 0.582465303686017
**F1 Score Negative:** 0.5852861329839679
**F1 Score Neutral:** 0.4618040781592642


Unnamed: 0,text,labelled_sentiment,compound,sentiment
76644,My latest column ÄúTake precautions to minimi...,positive,0.0000,neutral
70481,Italy to lock down Milan region in bid to cont...,negative,0.0000,neutral
62675,Coronavirus Updates As Cases Climb and Stocks...,neutral,0.0258,positive
76052,JoeyDrew10 cara doxyluv jeremynewberger Influe...,negative,0.0000,neutral
9583,SE Asia Stocks-Extend losses as coronavirus fe...,positive,-0.6705,negative
...,...,...,...,...
83031,mitchellvii The Signs and Symptoms of the Coro...,negative,0.0000,neutral
1897,Bro the coronavirus is killing people at an al...,negative,-0.7096,negative
32827,Scrimmzox Bro it's the funniest fucking thing ...,negative,-0.4854,negative
5129,The latest The Umberto Cammarata Daily! coron...,neutral,0.0000,neutral


In [7]:
read_table = "text_blob_sentiment_scores_df"

query = f"""
    SELECT * FROM "{read_table}"
"""
df = dbm.query_db(query)
display(df)

Unnamed: 0,id,index,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet,compound,tweet
0,0,0,Buziness Bytes,Noida,"Buziness Bytes is an independent news, informa...",2020-02-25,1177,884,1334,False,2021-06-30,Vaccination not done in UP today! If you are ...,"['COVID19', 'COVIDVaccination', 'covidvaccinef...",Twitter for Android,False,0.000000,Vaccination not done in UP today! If you are ...
1,1,1,Last Will,My Boiling Bedroom,these strikes are wrong at a time when negotia...,2019-01-14,6,33,256,False,2021-06-30,Just had my first dose of the CovidVaccine and...,"['CovidVaccine', 'RTX', 'antivaxxers']",Twitter for iPhone,False,-0.020833,Just had my first dose of the CovidVaccine and...
2,2,2,Pratidin Time,"Guwahati, India","is a 24-hour Assamese news channel, and is a m...",2011-12-16,125319,224,5,False,2021-06-30,INDIA No Covid Vaccine Affects Fertility Of A...,"['INDIA', 'COVID19India', 'COVIDVaccine', 'Ind...",Twitter Web App,False,0.000000,INDIA No Covid Vaccine Affects Fertility Of A...
3,3,3,⚘Cllr Anna Grainger PC Dundrum LEA 😷2 metres😷,Dundrum,Councillor for #Dundrum LEA▪Peace Commissioner...,2011-01-12,1106,1195,6903,False,2021-06-30,Delighted to become a full member of the Pfize...,"['PfizerGang', 'Pfizered', 'ShotOfHope', 'Vacc...",Twitter for Android,False,0.568750,Delighted to become a full member of the Pfize...
4,4,4,Dr.Anudeep Varma,Jadcherla,Doctor,2010-08-03,319,248,1253,False,2021-06-30,Covid Vaccination in Telangana New Guidelines ...,"['Telangana', 'Covishield', 'Covaxin', 'CovidV...",Twitter for Android,False,0.136364,Covid Vaccination in Telangana New Guidelines ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188701,188701,188701,Luchia,Canada,"Sarcasm is my middle name ( nisam lijevo, nisa...",2020-06-12,72,167,3299,False,2021-06-30,"Ok, after the longest night lets sum it up sh...","['Moderna', 'Pfizer', 'CovidVaccine', 'itsJust...",Twitter for iPhone,False,0.005556,"Ok, after the longest night lets sum it up sh..."
188702,188702,188702,ANI Multimedia,New Delhi,"ANI News - Multi-media news agency, content fo...",2009-08-28,1458,0,4,False,2021-06-30,Tripura leads in COVID-19 vaccination in 45 pl...,"['India', 'Tripura', 'COVID19', 'CovidVaccine'...",Twitter Web App,False,0.000000,Tripura leads in COVID-19 vaccination in 45 pl...
188703,188703,188703,Cllr Hitesh Tailor,"Acton, London. UK","#Ealing Labour Cllr (East Acton), GMB Union. P...",2009-03-18,2477,3785,1721,False,2021-06-30,If you've not yet had your CovidVaccine please...,"['CovidVaccine', 'Acton', 'GrabAJab']",Twitter for Android,False,0.000000,If you've not yet had your CovidVaccine please...
188704,188704,188704,Jared Roch,"Manitoba, Canada","Father, Husband, Electrician, Hockey Fan #GoJe...",2011-03-18,72,230,3936,False,2021-06-30,This Moderna shot is fuckin with me! Can't sto...,['CovidVaccine'],Twitter for Android,False,-0.800000,This Moderna shot is fuckin with me! Can't sto...
