## Imports

In [85]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pickle
import pandas as pd
from bigram import BigramLM
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from utils import emotion_scores
import warnings
warnings.filterwarnings('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Model Setup

In [76]:
corpus_file_path = '../data/corpus.txt'

with open(corpus_file_path, 'r') as file:
    lines = file.readlines()

modified_lines = ['$ ' + line.strip() for line in lines]
joined_string = '\n'.join(modified_lines)
corpus = joined_string

In [77]:
model = BigramLM(corpus)
model.set_tokens()
model.calculate_bigrams()
model.set_laplace_matrix()
model.set_kn_matrix(d=0.75)

Model initialized 🟢
Tokens set 🟢


Populating bigram matrix...: 100%|██████████| 48931/48931 [00:00<00:00, 450259.63it/s]


All matrices calculated 🟢
Calculating Kneser-Ney matrix...
Kneser-Ney matrix calculated 🟢


## Generating Emotion for all non-zero Bigrams (DO NOT RUN)

In [80]:
tokens = model.get_tokens()
count_matrix = model.get_count_matrix()

In [29]:
emotions = {}
for i, token in tqdm(enumerate(tokens)):
    for j, token2 in enumerate(tokens):
        if count_matrix[i][j] > 0 and token!='$' and token2!="$":
            emotions[(token, token2)] = emotion_scores(str(token + " " + token2))
        if count_matrix[i][j] > 0 and token=='$' and token2!="$":
            emotions[(token, token2)]= emotion_scores(str(token2))
        if count_matrix[i][j] > 0 and token!='$' and token2=="$":
            emotions[(token, token2)]= emotion_scores(str(token))

with open('emotions.pkl', 'wb') as f:
    pickle.dump(emotions, f)

5430it [12:24,  7.29it/s]


## Generating 50 Sentences each of every emotion


In [78]:
emotions = ["sadness", "joy", "love", "anger", "fear", "surprise"]
for i in emotions:
    model.generate_sentences(model.get_kn_matrix(), emotion=i, word_limit=20, no_of_sentences=50, alpha = 0.25, beta = 1)

Generating sadness matrix: 5430it [00:18, 289.76it/s]
Generating sentences: 100%|██████████| 50/50 [00:02<00:00, 22.66it/s]


Sentences generated and stored 🟢


Generating joy matrix: 5430it [00:21, 251.69it/s]
Generating sentences: 100%|██████████| 50/50 [00:02<00:00, 21.46it/s]


Sentences generated and stored 🟢


Generating love matrix: 5430it [00:19, 275.43it/s]
Generating sentences: 100%|██████████| 50/50 [00:04<00:00, 10.93it/s]


Sentences generated and stored 🟢


Generating anger matrix: 5430it [00:23, 229.67it/s]
Generating sentences: 100%|██████████| 50/50 [00:02<00:00, 22.31it/s]


Sentences generated and stored 🟢


Generating fear matrix: 5430it [00:22, 245.67it/s]
Generating sentences: 100%|██████████| 50/50 [00:03<00:00, 16.02it/s]


Sentences generated and stored 🟢


Generating surprise matrix: 5430it [00:22, 236.22it/s]
Generating sentences: 100%|██████████| 50/50 [00:02<00:00, 22.20it/s]

Sentences generated and stored 🟢





## Training SVC Classifier

In [79]:
corpus_file = "..\data\corpus.txt"
labels_file = "..\data\labels.txt"

with open(corpus_file) as f:
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(f)

with open(labels_file) as l:
    y = l.read().splitlines()

In [80]:
parameters = {'kernel': ('poly', 'rbf'), 'C': [2, 3], 'gamma': ['auto', 'scale'], "degree": [2, 3]}
clf = GridSearchCV(SVC(), parameters, n_jobs=4, refit=True)
clf.fit(X, y)
best_model=clf.best_estimator_
print("best parameters:", clf.best_params_)
print("best score:", clf.best_score_)

best parameters: {'C': 2, 'degree': 2, 'gamma': 'scale', 'kernel': 'rbf'}
best score: 0.70875


In [81]:
print(best_model.score(X, y))

0.9991666666666666


In [83]:
emotions = ["sadness", "joy", "love", "anger", "fear", "surprise"]
dataset = {}
for emotion in emotions:
    path = ".\emotion_text\gen_" + emotion + ".txt"
    with open(path, 'r') as file:
        dataset[emotion] = file.read().splitlines()

In [84]:
def emotion_accuracy(emotion, dataset, best_model):
    X = vectorizer.transform(dataset[emotion])
    y = [emotion]*len(dataset[emotion])
    y_pred = best_model.predict(X)
    tp = sum([1 for i in range(len(y)) if y[i] == y_pred[i] and y[i] == emotion])
    tn = sum([1 for i in range(len(y)) if y[i] == y_pred[i] and y[i] != emotion])
    fp = sum([1 for i in range(len(y)) if y[i] != y_pred[i] and y[i] != emotion])
    fn = sum([1 for i in range(len(y)) if y[i] != y_pred[i] and y[i] == emotion])

    return best_model.score(X, y), tp, tn, fp, fn

Tp = 0
Tn = 0
Fp = 0
Fn = 0
accuracies = []

for emotion in emotions:
    emotion_acc, tp, tn, fp, fn = emotion_accuracy(emotion, dataset, best_model)
    print(f"{emotion}: {emotion_acc}")
    accuracies.append(emotion_acc)
    Tp += tp; Tn += tn; Fp += fp; Fn += fn

print("Average accuracy: ", np.mean(accuracies))

precision = Tp/(Tp+Fp)
recall = Tp/(Tp+Fn)
f1 = 2*precision*recall/(precision+recall)
print(f"Precision: {precision}, Recall: {recall}, F1: {f1}")

sadness: 0.92
joy: 0.66
love: 0.94
anger: 0.44
fear: 0.74
surprise: 0.98
Average accuracy:  0.7799999999999999
Precision: 1.0, Recall: 0.78, F1: 0.8764044943820225


# Top 5 Bigrams

In [19]:
matrix1=model.get_normal_matrix()
matrix1=matrix1[1:,1:]
matrix2=model.get_laplace_matrix()
matrix2=matrix2[1:,1:]
matrix3=model.get_kn_matrix()
matrix3=matrix3[1:,1:]

In [20]:
count=model.get_count_matrix()
count=count[1:,1:]
list=[]
for i in range(5429):
    for j in range(5429):
        if matrix1[i,j]==1:
            list.append((model.get_tokens()[i+1],model.get_tokens()[j+1],count[i,j],matrix1[i,j],matrix2[i,j],matrix3[i,j]))

In [21]:
sorted_list = sorted(list, key=lambda x: x[2],reverse=True)
print(sorted_list)

[('href', 'http', 25.0, 1.0, 0.004766269477543538, 0.9700023363576185), ('didn', 't', 18.0, 1.0, 0.0034875183553597653, 0.9583657827447011), ('sort', 'of', 17.0, 1.0, 0.003304571323664402, 0.9565093900961343), ('doesn', 't', 9.0, 1.0, 0.0018385732671446957, 0.9167315654894019), ('supposed', 'to', 9.0, 1.0, 0.0018385732671446957, 0.9183832405280168), ('amount', 'of', 8.0, 1.0, 0.0016550202280250091, 0.9075824539542853), ('wasn', 't', 8.0, 1.0, 0.0016550202280250091, 0.9063230111755772), ('woke', 'up', 7.0, 1.0, 0.0014713996689350744, 0.8931909082312105), ('won', 't', 6.0, 1.0, 0.0012877115526122149, 0.875097348234103), ('ability', 'to', 5.0, 1.0, 0.0011039558417663294, 0.8530898329504303), ('reminds', 'me', 5.0, 1.0, 0.0011039558417663294, 0.8509287021533429), ('aware', 'of', 4.0, 1.0, 0.0009201324990798675, 0.8151649079085705), ('continued', 'to', 4.0, 1.0, 0.0009201324990798675, 0.8163622911880378), ('ended', 'up', 4.0, 1.0, 0.0009201324990798675, 0.8130840894046182), ('haven', 't', 4

In [22]:
df = pd.DataFrame(sorted_list,columns=["token","token+1","count","Probability","Smoothening-Laplace","Smoothening-Kneser-ney"])
df.head(5)

Unnamed: 0,token,token+1,count,Probability,Smoothening-Laplace,Smoothening-Kneser-ney
0,href,http,25.0,1.0,0.004766,0.970002
1,didn,t,18.0,1.0,0.003488,0.958366
2,sort,of,17.0,1.0,0.003305,0.956509
3,doesn,t,9.0,1.0,0.001839,0.916732
4,supposed,to,9.0,1.0,0.001839,0.918383


In [16]:
matrix=model.get_laplace_matrix()
matrix=matrix[1:,1:]

In [23]:
flattened_matrix = [(i, j, value) for i, row in enumerate(matrix2) for j, value in enumerate(row)]
top_5_values_with_positions = sorted(flattened_matrix, key=lambda x: x[2], reverse=True)[:5]

for position in top_5_values_with_positions:
    print(f"Value: {position[2]}, Position: ({position[0]}, {position[1]})")

Value: 0.11042412409155006, Position: (2322, 1736)
Value: 0.035092684307343996, Position: (1736, 2751)
Value: 0.03189066059225513, Position: (2322, 152)
Value: 0.026501766784452298, Position: (4767, 2322)
Value: 0.023100392270812144, Position: (176, 2322)


In [25]:
laplace = []
for position in top_5_values_with_positions:
    laplace.append((model.get_tokens()[position[0]+1], model.get_tokens()[position[1]+1], position[2]))

In [26]:
df = pd.DataFrame(laplace,columns=["token", "token+1", "Laplace-Probability"])
df.head(5)

Unnamed: 0,token,token+1,Laplace-Probability
0,i,feel,0.110424
1,feel,like,0.035093
2,i,am,0.031891
3,that,i,0.026502
4,and,i,0.0231


In [27]:
flattened_matrix = [(i, j, value) for i, row in enumerate(matrix3) for j, value in enumerate(row)]
top_5_values_with_positions = sorted(flattened_matrix, key=lambda x: x[2], reverse=True)[:5]

for position in top_5_values_with_positions:
    print(f"Value: {position[2]}, Position: ({position[0]}, {position[1]})")

kn = []
for position in top_5_values_with_positions:
    kn.append((model.get_tokens()[position[0]+1], model.get_tokens()[position[1]+1], position[2]))

Value: 0.970358782691682, Position: (1339, 4682)
Value: 0.9700023363576185, Position: (2285, 2288)
Value: 0.9583657827447011, Position: (1242, 4682)
Value: 0.9565093900961343, Position: (4411, 3270)
Value: 0.9183832405280168, Position: (4640, 4842)


Unnamed: 0,token,token+1,Kneser-Ney-Probability
0,i,feel,0.110424
1,feel,like,0.035093
2,i,am,0.031891
3,that,i,0.026502
4,and,i,0.0231


In [28]:
df = pd.DataFrame(kn,columns=["token","token+1","Kneser-ney-Probability"])
df.head(5)

Unnamed: 0,token,token+1,Kneser-ney-Probability
0,don,t,0.970359
1,href,http,0.970002
2,didn,t,0.958366
3,sort,of,0.956509
4,supposed,to,0.918383
