# Import all Necessary Modules

In [1]:
import numpy as np 
import pandas as pd
import random
import os

In [2]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [3]:
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.pipeline.textcat import Config, single_label_cnn_config, single_label_bow_config, single_label_default_config
from spacy.training.example import Example
from spacy.util import minibatch, compounding

# Train Data

In [4]:
dfTrain = pd.read_csv('train.csv')
dfTrain.columns=['number', 'body', 'TB_Subjectivity', 'TB_Polarity']
dfTrain = dfTrain.drop(columns=["number", "TB_Subjectivity"])
dfTrain = dfTrain.dropna().reset_index(drop=True)
numberOfRows = dfTrain.shape[0]
dfTrain

Unnamed: 0,body,TB_Polarity
0,"I'm yet to see that one, but I've also heard p...",0.0
1,Scary Stories to tell in the Dark. Was better ...,2.0
2,"I havent seen either, but _Joker_, _Doctor Sle...",2.0
3,The shire theme (concerning hobbits) always re...,2.0
4,Birds of Prey (and the Fantabulous Emancipatio...,1.0
...,...,...
12121,Even in his Netflix shit like Sweet Girl and t...,2.0
12122,"""I want to save my family and if i have to sav...",1.0
12123,You're telling me C-movie tier The Ice Road is...,1.0
12124,The only film I saw on the list that I really ...,1.0


In [5]:
for i in range(numberOfRows): 
    if dfTrain['TB_Polarity'][i] == 0.0:
        dfTrain['TB_Polarity'][i] = "negative"
    elif dfTrain['TB_Polarity'][i] == 1.0:
        dfTrain['TB_Polarity'][i] = "positive"
    elif dfTrain['TB_Polarity'][i] == 2.0:
        dfTrain['TB_Polarity'][i] = "neutral"

dfTrain

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfTrain['TB_Polarity'][i] = "negative"


Unnamed: 0,body,TB_Polarity
0,"I'm yet to see that one, but I've also heard p...",negative
1,Scary Stories to tell in the Dark. Was better ...,neutral
2,"I havent seen either, but _Joker_, _Doctor Sle...",neutral
3,The shire theme (concerning hobbits) always re...,neutral
4,Birds of Prey (and the Fantabulous Emancipatio...,positive
...,...,...
12121,Even in his Netflix shit like Sweet Girl and t...,neutral
12122,"""I want to save my family and if i have to sav...",positive
12123,You're telling me C-movie tier The Ice Road is...,positive
12124,The only film I saw on the list that I really ...,positive


In [6]:
for i in range(numberOfRows): 
    doc = nlp(dfTrain['body'][i])
    filtered_tokens = [token for token in doc if not token.is_stop and not token.is_punct]
    newStr = ' '.join(map(str, filtered_tokens))
    dfTrain['body'][i] = newStr

dfTrain

Unnamed: 0,body,TB_Polarity
0,heard people cite Midnight Switchgrass Death w...,negative
1,Scary Stories tell Dark better expected creatu...,neutral
2,nt seen Joker Doctor Sleep Midsommar Angry Bir...,neutral
3,shire theme concerning hobbits reminds home es...,neutral
4,Birds Prey Fantabulous Emancipation Harley Qui...,positive
...,...,...
12121,Netflix shit like Sweet Girl trailer Slumberla...,neutral
12122,want save family save world damn sure Chris Pr...,positive
12123,telling C movie tier Ice Road 10,positive
12124,film saw list issue Rhythm Section believe gre...,positive


# Training Model

In [7]:
train_texts = dfTrain['body'].values
train_labels = [{'cats': {'positive': label == 'positive',
                          'negative': label == 'negative',
                          'neutral': label == 'neutral'}}
                for label in dfTrain['TB_Polarity']]

In [8]:
train_data = list(zip(train_texts, train_labels))
len(train_data)

12126

In [9]:
config = Config().from_str(single_label_bow_config)
text_cat = nlp.add_pipe('textcat', config=config, last=True)
text_cat.add_label("positive")
text_cat.add_label("negative")
text_cat.add_label("neutral")

1

In [10]:
losses = {}
for epoch in range(25):
    random.shuffle(train_data)
    # Create the batch generator with batch size = 15
    batches = minibatch(train_data, size=15)
    # Iterate through minibatches
    for batch in batches:
        texts, annotations= zip(*batch)
            
        example = []
        # Update the model with iterating each text
        for i in range(len(texts)):
            doc = nlp.make_doc(texts[i])
            example.append(Example.from_dict(doc, annotations[i]))
            
        # Update the model
        nlp.update(example, drop=0.5, losses=losses)

    print(losses)

{'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 0.0, 'textcat': 217.95800463855267}
{'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 0.0, 'textcat': 412.3630296215415}
{'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 0.0, 'textcat': 598.0580041259527}
{'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 0.0, 'textcat': 779.3458467535675}
{'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 0.0, 'textcat': 957.7067228369415}
{'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 0.0, 'textcat': 1133.965732768178}
{'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 0.0, 'textcat': 1309.0211308375}
{'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 0.0, 'textcat': 1482.9057497121394}
{'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 0.0, 'textcat': 1655.8858940526843}
{'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 0.0, 'textcat': 1828.228973497171}
{'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 0.0, 'textcat': 1999.9079395907465}
{'tok2vec': 0.0, 't

Saving Model to Disk

In [11]:
to_dir = os.path.dirname('/Users/nikitabachhas/Desktop/AY 2022:2023 SEM 2/CZ4034- Information Retrieval/Group Project/Classification/sentiment_model')
nlp.to_disk(to_dir)
text_nlp = spacy.load(to_dir)

# Test Data

In [12]:
dfTest = pd.read_csv('test.csv')
dfTest.columns=['number', 'body', 'subjectivity', 'polarity', 'TB_Subjectivity', 'TB_Polarity']
dfTest = dfTest.drop(columns=['number', 'subjectivity', 'polarity', 'TB_Subjectivity'])
dfTest = dfTest.dropna().reset_index(drop=True)
numberOfRowsTestDF = dfTest.shape[0]
dfTest

Unnamed: 0,body,TB_Polarity
0,Was there stir around this? I didnt know it ex...,0.0
1,Whatever you do dont watch dangerous lies that...,0.0
2,Seriously what happened there. Ragnarok was su...,0.0
3,Very confusing movie. But I have a feeling the...,0.0
4,He was terrible in Greyhound &amp; Finch too. ...,0.0
...,...,...
1513,I find his music kind of lame thats all lol. ...,2.0
1514,I mean it was announced as a first for Velma w...,2.0
1515,There's a Netflix movie called nightbooks whic...,1.0
1516,Emma Seligman is a wunderkind filmmaker who ma...,2.0


In [13]:
for i in range(numberOfRowsTestDF): 
    if dfTest['TB_Polarity'][i] == 0.0:
        dfTest['TB_Polarity'][i] = "negative"
    elif dfTest['TB_Polarity'][i] == 1.0:
        dfTest['TB_Polarity'][i] = "positive"
    elif dfTest['TB_Polarity'][i] == 2.0:
        dfTest['TB_Polarity'][i] = "neutral"

dfTest

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfTest['TB_Polarity'][i] = "negative"


Unnamed: 0,body,TB_Polarity
0,Was there stir around this? I didnt know it ex...,negative
1,Whatever you do dont watch dangerous lies that...,negative
2,Seriously what happened there. Ragnarok was su...,negative
3,Very confusing movie. But I have a feeling the...,negative
4,He was terrible in Greyhound &amp; Finch too. ...,negative
...,...,...
1513,I find his music kind of lame thats all lol. ...,neutral
1514,I mean it was announced as a first for Velma w...,neutral
1515,There's a Netflix movie called nightbooks whic...,positive
1516,Emma Seligman is a wunderkind filmmaker who ma...,neutral


# Running Model on Test Data

In [14]:
test_texts = list(dfTest['body'].values)

In [15]:
modelResult = []

for i in range(numberOfRowsTestDF): 
    test_texts = dfTest['body'][i]
    doc = text_nlp(test_texts)
    positiveScore = doc.cats['positive']
    negativeScore = -abs(doc.cats['negative'])
    averageScore = positiveScore + negativeScore
    if averageScore < 0.2 and averageScore > -0.2:
        modelResult.append('neutral')
    elif averageScore < 0:
        modelResult.append('negative')
    elif averageScore > 0:
        modelResult.append('positive')
    

In [16]:
dfTest['modelResult'] = modelResult
dfTest


Unnamed: 0,body,TB_Polarity,modelResult
0,Was there stir around this? I didnt know it ex...,negative,positive
1,Whatever you do dont watch dangerous lies that...,negative,negative
2,Seriously what happened there. Ragnarok was su...,negative,neutral
3,Very confusing movie. But I have a feeling the...,negative,negative
4,He was terrible in Greyhound &amp; Finch too. ...,negative,negative
...,...,...,...
1513,I find his music kind of lame thats all lol. ...,neutral,positive
1514,I mean it was announced as a first for Velma w...,neutral,positive
1515,There's a Netflix movie called nightbooks whic...,positive,positive
1516,Emma Seligman is a wunderkind filmmaker who ma...,neutral,negative


# Evaluation Results

In [17]:
truePositives = 0
trueNegatives = 0
falsePositives = 0
falseNegatives = 0
trueNeutrals = 0
falseNeutrals = 0

for i in range(numberOfRowsTestDF):
    if dfTest['TB_Polarity'][i] == 'positive' and dfTest['modelResult'][i] == 'positive':
        truePositives += 1
    elif dfTest['TB_Polarity'][i] == 'positive' and dfTest['modelResult'][i] == 'negative':
        falsePositives += 1
    elif dfTest['TB_Polarity'][i] == 'positive' and dfTest['modelResult'][i] == 'neutral':
        falsePositives += 1

    elif dfTest['TB_Polarity'][i] == 'negative' and dfTest['modelResult'][i] == 'negative':
        trueNegatives += 1
    elif dfTest['TB_Polarity'][i] == 'negative' and dfTest['modelResult'][i] == 'positive':
        falseNegatives += 1
    elif dfTest['TB_Polarity'][i] == 'negative' and dfTest['modelResult'][i] == 'neutral':
        falseNegatives += 1
    
    elif dfTest['TB_Polarity'][i] == 'neutral' and dfTest['modelResult'][i] == 'neutral':
        trueNeutrals += 1
    elif dfTest['TB_Polarity'][i] == 'neutral' and dfTest['modelResult'][i] == 'negative':
        falseNeutrals += 1
    elif dfTest['TB_Polarity'][i] == 'neutral' and dfTest['modelResult'][i] == 'positive':
        falseNeutrals += 1

print(truePositives, trueNegatives, falsePositives, falseNegatives, trueNeutrals, falseNeutrals)

633 363 77 130 25 290


In [18]:
#Precision (true positives / predicted positives) = TP / TP + FP
Precision = truePositives = truePositives/(truePositives + falsePositives)
print('Precision: ', Precision)

#Misclassification (all incorrect / all) = FP + FN / TP + TN + FP + FN
Misclassification = (falseNegatives + falsePositives + falseNeutrals)/(trueNegatives + truePositives + trueNeutrals + falseNeutrals + falseNegatives + falsePositives)
print('Misclassification: ', Misclassification)

#Sensitivity: Sensitivity aka Recall (true positives / all actual positives) = TP / TP + FN
Sensitivity = truePositives/(truePositives + falseNegatives + falseNeutrals)
print('Sensitivity: ', Sensitivity)

#Specificity: (true negatives / all actual negatives) =TN / TN + FP
Specificity = trueNegatives/ (trueNegatives + falsePositives)
print('Specificity: ', Specificity)

#Accuracy (all correct / all) = TP + TN / TP + TN + FP + FN
accuracyScore = (truePositives + trueNegatives + trueNeutrals)/ (truePositives + trueNegatives + trueNeutrals + falsePositives + falseNegatives + falseNeutrals)
print('Accuracy Score: ', accuracyScore)

#F1 Score: 2*((precision*recall)/(precision+recall))
f1Score = 2*((Precision*Sensitivity)/(Precision+Sensitivity))
print('F1 Score: ', f1Score)

Precision:  0.8915492957746479
Misclassification:  0.56101675244005
Sensitivity:  0.0021182399534187993
Specificity:  0.825
Accuracy Score:  0.43898324755994994
F1 Score:  0.004226438274304524
