In [1]:
# importing the dependencies
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import pickle
import sklearn
from sklearn import linear_model

In [2]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 
import pandas as pd

sentiment_intensity_analyzer = SentimentIntensityAnalyzer() 
def get_vader_sentiment(sentence, threshold=0.1):
    """Classify ``sentence`` using VADER's compound polarity score.

    Args:
        sentence: Text to score. Anything whose exact type is not ``str``
            is not scored.
        threshold: Cut-off applied to the compound score.

    Returns:
        "null" for non-string input, "positive" when compound >= threshold,
        "negative" when compound <= -threshold, otherwise "neutral".
    """
    # Guard: non-string cells are reported as "null" instead of being scored.
    if type(sentence) is not str:
        return "null"

    compound = sentiment_intensity_analyzer.polarity_scores(sentence)['compound']

    # The positive check runs first, so it wins if both comparisons hold.
    if compound >= threshold:
        return "positive"
    if compound <= -threshold:
        return "negative"
    return "neutral"

In [3]:
from afinn import Afinn
afn = Afinn(emoticons=True)

def get_afn_sentiment(sentence, threshold=1.0):
    """Classify ``sentence`` using the AFINN score from the module-level ``afn``.

    Args:
        sentence: Text to score. Anything whose exact type is not ``str``
            is not scored.
        threshold: Cut-off applied to the AFINN score.

    Returns:
        "null" for non-string input, "positive" when score >= threshold,
        "negative" when score <= -threshold, otherwise "neutral".
    """
    # Guard: non-string cells are reported as "null" instead of being scored.
    if type(sentence) is not str:
        return "null"

    afinn_score = afn.score(sentence)

    # The positive check runs first, so it wins if both comparisons hold.
    if afinn_score >= threshold:
        return "positive"
    if afinn_score <= -threshold:
        return "negative"
    return "neutral"
    

In [4]:
def convert_label_to_int(label):
    """Map a sentiment label to a signed vote.

    Args:
        label: Sentiment label, normally "positive", "negative", "neutral"
            or "null".

    Returns:
        1 for "positive", -1 for "negative", and 0 for every other value
        (including non-string values such as NaN or None).
    """
    # Non-strings can never match a label; returning early also avoids
    # calling `==` on objects whose comparison is not a plain bool.
    if not isinstance(label, str):
        return 0
    # Compare by value, not identity: `is` only matches strings that happen
    # to be the same interned object, so labels read from a CSV or built at
    # runtime would silently fall through to 0.
    if label == "positive":
        return 1
    elif label == "negative":
        return -1
    else:
        return 0

# Combines vader, afinn and text blob
def get_combined_label(text_blob_label, vader_label, afn_label):
    """Blend three sentiment labels into one by weighted vote.

    Each label is turned into a vote with ``convert_label_to_int``. The
    votes are weighted 0.2 (text blob), 0.4 (vader) and 0.4 (afinn).

    Returns:
        "positive" when the weighted vote is above 0.3, "negative" when it
        is below -0.3, otherwise "neutral".
    """
    blob_vote, vader_vote, afn_vote = (
        convert_label_to_int(label)
        for label in (text_blob_label, vader_label, afn_label)
    )
    # 20 percent weightage to text blob, 40% to vader, 40% to afn
    weighted_vote = 0.2*blob_vote + 0.4*vader_vote + 0.4*afn_vote

    if weighted_vote > 0.3:
        return "positive"
    if weighted_vote < -0.3:
        return "negative"
    return "neutral"

In [5]:
# adds all columns to file
def add_all_columns(input_file, output_file):
    """Add AFINN, VADER and ensemble sentiment columns to a CSV file.

    Reads ``input_file``, which must have a ``text_clean`` column and a
    ``sentiment`` column (the latter is passed to the ensemble as the text
    blob label), and writes the augmented frame to ``output_file`` without
    the index.

    Args:
        input_file: Path of the CSV to read.
        output_file: Path of the CSV to write.
    """
    data = pd.read_csv(input_file)
    data['afn_sentiment'] = data['text_clean'].apply(lambda text: get_afn_sentiment(text, 0.1))
    data['vader_sentiment'] = data['text_clean'].apply(lambda text: get_vader_sentiment(text, 0.1))
    # The ensemble uses each scorer's default threshold, which matches the
    # ground-truth evaluation of the ensemble elsewhere in this notebook.
    # Arguments follow get_combined_label's order (text blob, vader, afinn);
    # the previous call passed afinn and vader swapped, which only went
    # unnoticed because both currently carry the same weight.
    data['ensemble_sentiment'] = data.apply(
        lambda row: get_combined_label(
            row.sentiment,
            get_vader_sentiment(row.text_clean),
            get_afn_sentiment(row.text_clean),
        ),
        axis=1,
    )
    data.to_csv(output_file, index=False)

In [6]:
# add_all_columns('2020-01.csv', '2020-01-final.csv')
# add_all_columns('2020-02.csv', '2020-02-final.csv')
# add_all_columns('2020-03.csv', '2020-03-final.csv')
# add_all_columns('2020-04.csv', '2020-04-final.csv')
# add_all_columns('2020-05.csv', '2020-05-final.csv')

In [7]:
# Groundtruth accuracy calculations
input_str = '''covid kingcounty, washington state reports second coronavirus death, announcing first positive case covid new york city already identified close contacts individual may exposed take appropriate measures prevent spread covid, people get coronavirus whole world wants wear surgical mask million people aids still nobody wants wear condom, theres drive thru covid test north austin domain brockton drive austin tx fill online questionaire first getting tested, footage shows police new orleans attempting clear bourbon street louisianas governor signed proclamation banning gatherings people amid coronavirus outbreak, time bring home manufacturing drug ingredients coming china fda says shortages begun, repswalwell protip never listen biff democrat anything ever meet deborah l birx american physician diplomat coronavirus response coordinator white house birx served ambassadoratlarge united states global aids coordinator since, trump waited months corona virus became pandemic amp us deaths occurred declare public health emergency media would crash stock market hysteria obama it good, onairwill shirt taking advantage global pandemic profit family business immoral, cant wait white girls start lookin like white girls cause epidemic cycle kardashians steal black women white women want copy them call white people amp wonder wrong cause minds copied kardashians, really unseemly bernie praising communist china middle global pandemic started, chinas lockdown measures minimize coronavirus infections created one unexpected benefit dramatic improvement nations air quality, trump throws tanturm calls outlets report coronavirus fake news via politicususa, downtown nashville brought knees covid joke emergency physicians critical condition fighting lives exposed sure get drank on janna stupidity cost people lives covid, gallup poll numbers handling situation outstanding best thank you, one candidate obvious dementia one candidate heart disease debating coronavirus news network 
good times, kzhowell lab corp country cdc trying profit using central lab people going go jail, help us schools communities serve maintain continuity learning event prolonged school closures due coronavirus discovery education created threepronged response read, know president big job find increasingly weird much debate dedicated things pandemic could possibly one convulsive things happen country since last world war, gallup poll numbers handling situation outstanding best thank you, sleepy joe biden also said guns killed million americans last year wants win georgia super tuesday not up got speaking location wrong again, best set info ive found corona virus worth watching full mins, question citizens get together amp file massive class action lawsuits fox news lying amp gaslighting americans something abt this surely god legal pandemic serious need truth, erinmperrine realdonaldtrump administration took unprecedented early steps response coronavirus believe dems false apocalyptic spin try politicize virus sick politics brandonbeckham christian voice, stateowned newspaper iran says masoumeh ebtekar vice president islamic republic spokeswoman hostagetakers new coronavirus state media also reported iranian cleric hadi khosroshahi died coronavirus qom, new cdc recommends people cancel postpone inperson events consist people united states weeks, facts immigrants different countries affected china virus apprehended deemed inadmissible border includes china nearly people infected border security health security rt close border, adamserwer wonder cdc whistleblowers couple issues completely headscratching hope attributed staff budget cuts rather actual malice, arent blacks dying too yelled abyss, joe biden said cant deal pandemic changing healthcare system sir may beg differ, first positive coronavirus case confirmed new york state, orang yang pergi travel musim covid ni sumpah selfish gila, infectiousdz im imagining people getting planes wearing ns contaminating hands 
pushing order drink eat, cutenshsjsjfhdkkdk, oakland schools closing due coronavirus steph ayesha curry looking help donate million meals kids rely education system eat via eatlearnplay, needless say stand reporting white house google, punish trump november callin coronavirus hoax people dyin, sleepy joe biden also said guns killed million americans last year wants win georgia super tuesday not up got speaking location wrong again, nd coronavirususa death washington dont feel safer get info twitter news sources trump admin completely incompetent handle, yes serious want part insanity obsessive paranoid panic stresses actual virus calm smart actions covid, corona came becoming gym babe, people psychopaths unfit role public life, virginia dept health willing volunteer support covid response needed please register become virginia medical reserve corps volunteer email vamrcvdhvirginiagov, cdcgov issued new guidance recommending next weeks organizers cancel postpone inperson events consisting people more read cdcs latest covid guidance, step medical bill testing positive corona im, american scientists isnt coronavirus affecting africa africa, need comprehensive response coronavirus outbreak plan mounts decisive public health response curb spread disease provide treatment need decisive economic response delivers real relief, corona virus ni vergas, president trump announced today extend european travel ban include uk ireland part continuing efforts combat coronavirus nextrevfnc stevehiltonx, coronavirus got stressed out monterey aquarium closed public live streaming instead see birds sharks otters jellyfish penguins turtles plenty, dog hong kong tests positive coronavirus confirms, clarify isolation tested positive virus quarantine contact someone tested positive virus amp waiting see develop symptoms social distancing something everyone reduce risk transmission, wrong again trump misidentifies first coronavirus victim woman white house press conference via 
occupydemocrats, staying home due social distancing job shutting means money might issue reply venmo handle youre someone blessed ok currently consider helping folks reply here friendly texan thing do, watch this shows right thing stay home fullest extent possible us help slow spread virus protecting elderly vulnerable other, great idea businesses gov agencies serve public brainstorm find new ways serve vulnerable proactively implement social distancing practices customers, good advice nyc everywhere stay home stay home wash hands, trump tells us relax were great virus expert says worst yet come, coronavirus spreading concern growing economy taken hit tonight im addressing nation donald trump yet failed lead ive led crises before president ill trust science let experts jobs, speakerpelosi hes late late anemic hopefully make loss time speaker pelosi knocked pres trumps inadequate response spread coronavirus wtpteam onevoice, , realdonaldtrump lie airports chaos tests need them resign thrown hell office today trumpistheworstpresidentever trumpplague covid, coronavirus houston methodist hospital media wont say bc rodeo lose money joke, coronavirus donations btstwt fans reached almost million us according korea disaster relief association since suga whose hometown affected daegu donated million last month donations pouring in association said, birdieglad pannlewis graceslick gdijkhuyzen haldonahue veteransi daogtriple dltrunnell robinvolpi jhwilsh flints asoldiersvoice shawbear mahogany henleycarol mzdivah kingivan badshoehabit rk jamsride, today learned cdc mistakenly released patient texas center infectious disease later returned positive covid reading fact cdc allowed public exposed patient positive covid reading unacceptable full, protect ya neck coronavirus making thousand prints distributing across new york city feel free city share rt world wutang, new yorks first confirmed coronavirus case woman recently traveled iran gov cuomo said isolation manhattan home, 
san antonio mayor cdc released patient later tested positive covid, bernie sanders responded justification cubas human rights abuses justifying chinas human rights abuses yes really hes bad this demdebate, aint niggas dying didnt right, know actors much smarter us bs really care anyones opinion paying for, breaking second person died coronavirus covid seattle area five others critical condition, vincentcrypt lady gaga too john legend chrissy teigen people epstein flight log means corona connected pedofile wion, equinox refused close inventible happened member infected covid came gym almost certainly spreading virus, hoes mad, heres whats gonna happen coronavirus us testing cases starting now find hundreds cases suddenly testing backlog media use political stick amplifying panic happens, beauxhandsome wild part came china too, corona virus made new york stop worrying move back home months, we well control we pretty much shut down the numbers going get progressively better were going substantially down up one day like miracle disappear none true, jrehling world health organization estimates spanish flu death rate among infected people thats true corona could potentially tragedybut medical practices improved last years lets keep ol fingers crossed, breaking st clinical trial begin evaluating possible coronavirus vaccine could take year develop successful official says, nancy pelosi lot nerve wants lecture trump coronaviruswhile ignoring cesspool third world disease drugaddled homeless tent cities public spaces littered used needles human feces district pelosi utter fraud, us running food nations biggest retailers say food supply chain remains intact ramping meet unprecedented stockpiling brought coronavirus pandemic, , great idea businesses gov agencies serve public brainstorm find new ways serve vulnerable proactively implement social distancing practices customers, never know slapshot covid, statement chair petitioning covid virus, mississippi psc temporarily 
suspends disconnection utility services response covid coronavirus danemaxwellms brandonpresley brentgbailey, coronavirus ya es una pandemia opsoms qu es una pandemia, ah yes next game looking great residentevil coronapocalypse coronavirusupdates, who cdcgov bnodesk elonmusk mar mainland china exhubei coronavirus vs international week shift, coronavirus deaths china italy iran spain france korea us uk japan netherlands switzerland germany philippines iraq australia indonesia san marino belgium greece hong kong, disastrous communications anthrax attacks cdc wrote page manual communicate public health crises trump breaking every rule playbook via carolynyjohnson thewanreport, coronavirus updatemeeting mayor excused restroom forgot turn microphone dead, breaking second person died coronavirus covid seattle area five others critical condition, florida, coronavirus closing schools china month ago teacher thick passes best practices online learning, norway developed socialist country called students home usa citing poorly developed infrastructure say gop handling coronavirus shameful coronapocolypse demcast'''
# One ground-truth text per comma-separated fragment. Fragments are joined
# with ", " in input_str, so most items keep a leading space, and the empty
# ", ," fragments become whitespace-only items that still occupy a slot.
inputs = input_str.split(',')
expected_op = ['neutral','negative','positive','negative','positive','negative','negative','negative','negative','negative','negative','negative','positive','neutral','negative','positive','positive','negative','neutral','negative','positive','negative','positive','negative','positive','negative','negative','negative','neutral','negative','neutral','negative','neutral','neutral','neutral','negative','neutral','negative','negative','negative','negative','positive','negative','positive','neutral','positive','positive','neutral','neutral','negative','neutral','negative','negative','negative','positive','positive','positive','positive','neutral','neutral','neutral','neutral','negative','negative','neutral','neutral','negative','neutral','negative','negative','negative','negative','neutral','negative','neutral','negative','negative','negative','negative','neutral','positive','positive','positive','negative','positive','neutral','positive','neutral','negative','negative','neutral','neutral','positive','negative','negative','neutral','negative','neutral','positive','negative']

In [8]:
def get_vader_predictions_for_ground_truth():
    """Return the VADER label for every entry of ``inputs``, in order.

    Uses a threshold of 0.05 rather than ``get_vader_sentiment``'s default.
    """
    return [get_vader_sentiment(text, 0.05) for text in inputs]

def get_afn_predictions_for_ground_truth():
    """Return the AFINN label (default threshold) for every entry of ``inputs``, in order."""
    return [get_afn_sentiment(text) for text in inputs]

        
def get_combined_predictions_for_groundtruth():
    """Return the ensemble label for every entry of ``inputs``, in order.

    The text blob vote for position ``i`` comes from
    ``text_blob_predictions[i]``; the VADER and AFINN votes are computed
    from the text with each scorer's default threshold.
    """
    return [
        get_combined_label(
            text_blob_predictions[position],
            get_vader_sentiment(text),
            get_afn_sentiment(text),
        )
        for position, text in enumerate(inputs)
    ]

def get_accuracy(x):
    """Print the percentage of entries in ``x`` that equal ``expected_op`` at the same index.

    Args:
        x: Indexable sequence of predicted labels.
    """
    matches = len([
        position
        for position in range(len(x))
        if x[position] == expected_op[position]
    ])
    # Divide before scaling so the printed float matches earlier runs exactly.
    print(matches/len(x)*100)

text_blob_predictions = ['neutral','neutral','positive','positive','positive','positive','neutral','neutral','positive','neutral','negative','positive','negative','negative','negative','positive','positive','negative','negative','negative','positive','positive','positive','positive','negative','positive','positive','positive','positive','neutral','neutral','positive','negative','neutral','neutral','negative','negative','neutral','positive','negative','positive','positive','neutral','positive','positive','positive','neutral','positive','neutral','neutral','positive','positive','positive','negative','positive','negative','positive','positive','negative','negative','negative','neutral','neutral','neutral','neutral','neutral','positive','positive','positive','positive','negative','positive','positive','neutral','neutral','positive','negative','neutral','positive','positive','positive','positive','positive','positive','positive','neutral','positive','neutral','neutral','neutral','neutral','positive','neutral','neutral','negative','negative','neutral','neutral','positive','positive']
# Label every ground-truth text with each scorer. Unlike these three lists,
# text_blob_predictions is a hard-coded list, and the ensemble call reads it,
# so it must already be defined when the last line runs.
afn_predictions = get_afn_predictions_for_ground_truth()
vader_predictions = get_vader_predictions_for_ground_truth()
ensemble_predictions = get_combined_predictions_for_groundtruth()

In [9]:
get_accuracy(afn_predictions)

57.99999999999999


In [10]:
len(text_blob_predictions)

100

In [11]:
get_accuracy(text_blob_predictions)

41.0


In [12]:
get_accuracy(vader_predictions)

55.00000000000001


In [13]:
get_accuracy(ensemble_predictions)

54.0


In [14]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import numpy as np

In [15]:
def get_metrics(expected, predictions, model):
    """Print accuracy plus per-class precision, recall and F1 for one model.

    Args:
        expected: Ground-truth labels.
        predictions: Predicted labels, aligned with ``expected``.
        model: Name shown in the report header.

    Precision, recall and F1 use ``average=None``, so each is printed as an
    array with one entry per class (scikit-learn documents the order as the
    sorted labels).
    """
    print("Metrics for ", model)
    # Compute every metric before printing any of them.
    report = {
        "accuracy: ": accuracy_score(expected, predictions),
        "precision: ": precision_score(expected, predictions, average=None),
        "recall: ": recall_score(expected, predictions, average=None),
        "f1_score: ": f1_score(expected, predictions, average=None),
    }
    for caption, value in report.items():
        print(caption, np.round(value,2))
    
def get_average_metrics(expected, predictions, model, average="micro"):
    """Print accuracy plus averaged precision, recall and F1 for one model.

    Args:
        expected: Ground-truth labels.
        predictions: Predicted labels, aligned with ``expected``.
        model: Name shown in the report header.
        average: Averaging mode passed to scikit-learn's precision, recall
            and F1 scorers (e.g. "micro", "macro", "weighted").
    """
    # metrics averaged over classes
    print("Metrics for ", model)
    accuracy = accuracy_score(expected, predictions)
    # `average` must be a string: the previous bare names Micro/micro were
    # undefined and raised NameError, and precision was left per-class.
    precision = precision_score(expected, predictions, average=average)
    recall = recall_score(expected, predictions, average=average)
    f1 = f1_score(expected, predictions, average=average)
    print("accuracy: ", np.round(accuracy,2))
    print("precision: ", np.round(precision,2))
    print("recall: ", np.round(recall,2))
    print("f1_score: ", np.round(f1,2))

In [16]:
get_metrics(expected_op, afn_predictions, "afinn")

Metrics for  afinn
accuracy:  0.58
precision:  [0.72 0.54 0.46]
recall:  [0.58 0.48 0.7 ]
f1_score:  [0.64 0.51 0.55]


In [17]:
get_metrics(expected_op, vader_predictions, "vader")

Metrics for  vader
accuracy:  0.55
precision:  [0.69 0.5  0.46]
recall:  [0.5  0.45 0.78]
f1_score:  [0.58 0.47 0.58]


In [18]:
get_metrics(expected_op, text_blob_predictions, "text_blob")

Metrics for  text_blob
accuracy:  0.41
precision:  [0.48 0.42 0.38]
recall:  [0.21 0.45 0.78]
f1_score:  [0.29 0.43 0.51]


In [19]:
get_metrics(expected_op, ensemble_predictions, "ensemble")

Metrics for  ensemble
accuracy:  0.54
precision:  [0.67 0.46 0.47]
recall:  [0.5  0.45 0.74]
f1_score:  [0.57 0.46 0.58]


## Weighted, Macro, and Micro F1 scores

In [20]:
# Load the SentiStrength labels and normalise them to lower-case strings.
df_senti = pd.read_csv('src/Groundtruth-temp.csv')
sentistrength_predictions = [str(label).lower() for label in df_senti['Sentiment']]

In [21]:
# Predictions of every scorer, keyed by the name used in the results table.
# (The spelling `predition_dic` is kept because a later cell refers to it.)
predition_dic = {
    'afinn': afn_predictions,
    'vader': vader_predictions,
    'text_blob': text_blob_predictions,
    'sentistrength': sentistrength_predictions,
    'ensemble': ensemble_predictions,
}
# Model names in the dict's insertion order.
ml_models = list(predition_dic.keys())
measures = ['Accuracy', 'Weighted-F1', 'Macro-F1', 'Micro-F1']
# Per-model score stores; despite the `_lst` suffix these are dicts, and they
# start empty here.
accuracy_lst = dict()
f1_weighted_lst = dict()
f1_micro_lst = dict()
f1_macro_lst = dict()


In [22]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
y_true = expected_op
for name, y_pred in predition_dic.items():
    accuracy_lst[name] = float(str(round(accuracy_score(y_true, y_pred), 6)))
    f1_weighted_lst[name] = float(str(round(f1_score(y_true, y_pred, average='macro'), 6)))
    f1_micro_lst[name] = float(str(round(f1_score(y_true, y_pred, average='micro'), 6)))
    f1_macro_lst[name] = float(str(round(f1_score(y_true, y_pred, average='weighted'), 6)))


In [23]:
measures_comparison = {
    'ML Models': ml_models,
    'Accuracy': list(accuracy_lst.values()),
    'Weighted-F1': list(f1_weighted_lst.values()),
    'Macro-F1': list(f1_micro_lst.values()), 
    'Micro-F1': list(f1_macro_lst.values())
}
measures_comparison

{'ML Models': ['afinn', 'vader', 'text_blob', 'sentistrength', 'ensemble'],
 'Accuracy': [0.58, 0.55, 0.41, 0.46, 0.54],
 'Weighted-F1': [0.568164, 0.543895, 0.410077, 0.278537, 0.534613],
 'Macro-F1': [0.58, 0.55, 0.41, 0.46, 0.54],
 'Micro-F1': [0.583498, 0.54823, 0.381417, 0.451062, 0.539109]}

In [24]:
# Turn the comparison dict into a DataFrame (one column per dict key).
df_results = pd.DataFrame(data=measures_comparison)
# df_results
# Display the table styled with a 'Blues' background gradient.
df_results.style.background_gradient(cmap='Blues')

Unnamed: 0,ML Models,Accuracy,Weighted-F1,Macro-F1,Micro-F1
0,afinn,0.58,0.568164,0.58,0.583498
1,vader,0.55,0.543895,0.55,0.54823
2,text_blob,0.41,0.410077,0.41,0.381417
3,sentistrength,0.46,0.278537,0.46,0.451062
4,ensemble,0.54,0.534613,0.54,0.539109
