## Imports

In [4]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pickle
from bigram import BigramLM
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from utils import emotion_scores
import warnings
warnings.filterwarnings('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Model Setup

In [58]:
# Read the whole training corpus into memory as a single string.
corpus_file_path = '../data/corpus.txt'
# A context manager closes the file handle as soon as the text has been read
# (the previous bare open(...).read() left it open until garbage collection).
with open(corpus_file_path, 'r') as corpus_fh:
    corpus = corpus_fh.read()

In [6]:
# Build the bigram language model from the corpus text and run its setup steps in order.
model = BigramLM(corpus)
model.set_token()
model.calculate_bigrams()
# NOTE(review): laplace_matrix and kn_matrix are not read again in the cells shown in this
# notebook (sentence generation calls model.get_kn_matrix() directly) — confirm they are needed.
laplace_matrix = model.get_laplace_matrix()
# d=0.75 is presumably the Kneser-Ney discount — TODO confirm against BigramLM.
# In the recorded run this step took ~38 s for 5429 iterations.
kn_matrix = model.set_kn_matrix(d=0.75)

Model Initialized 🟢
Tokens Set 🟢


Populating Bigram Matrix...: 100%|██████████| 46531/46531 [00:00<00:00, 664416.91it/s]


All Matrices Calculated 🟢


Calculating Kneser-Ney Matrix...: 100%|██████████| 5429/5429 [00:38<00:00, 139.90it/s]


## Generating Emotion Scores for All Non-Zero Bigrams

In [7]:
# Fetch the token list and the bigram count matrix from the model;
# count_matrix is indexed by positions in `tokens` in the scoring cell below.
tokens, count_matrix = model.get_token(), model.get_count_matrix()

In [None]:
# Score every bigram that occurs at least once in the corpus and persist the result.
# count_matrix[i][j] > 0 marks an observed (tokens[i], tokens[j]) pair.
# Named bigram_emotions because `emotions` is rebound to the list of emotion labels
# in later cells; one name for two kinds of object invites hidden-state bugs.
bigram_emotions = {}
# enumerate() has no length, so pass total= explicitly; otherwise tqdm can only show
# a raw iteration counter with no percentage or ETA.
for i, first_token in tqdm(enumerate(tokens), total=len(tokens), desc="Scoring bigrams"):
    for j, second_token in enumerate(tokens):
        if count_matrix[i][j] > 0:
            bigram_emotions[(first_token, second_token)] = emotion_scores(f"{first_token} {second_token}")

# The pickled dict is keyed by (first_token, second_token) tuples.
with open('emotions.pkl', 'wb') as f:
    pickle.dump(bigram_emotions, f)

## Generating 50 Sentences for Each Emotion


In [87]:
# Generate 50 sentences per emotion (word_limit=15) from the Kneser-Ney matrix
# rather than the raw count matrix. The recorded output reports that the sentences
# are generated and stored by the model.
# NOTE(review): the meaning of alpha/beta is defined inside BigramLM — not visible here.
emotions = ["sadness", "joy", "love", "anger", "fear", "surprise"]
for target_emotion in emotions:
    model.generate_sentences(
        model.get_kn_matrix(),
        emotion=target_emotion,
        word_limit=15,
        no_of_sentences=50,
        alpha=1,
        beta=0.5,
    )

Generating sadness Matix: 5429it [00:13, 406.93it/s]
Generating Sentence: 100%|██████████| 50/50 [00:01<00:00, 45.72it/s]


Sentences Generated + Stored 🟢


Generating joy Matix: 5429it [00:13, 398.17it/s]
Generating Sentence: 100%|██████████| 50/50 [00:00<00:00, 50.32it/s]


Sentences Generated + Stored 🟢


Generating love Matix: 5429it [00:13, 400.12it/s]
Generating Sentence: 100%|██████████| 50/50 [00:01<00:00, 33.74it/s]


Sentences Generated + Stored 🟢


Generating anger Matix: 5429it [00:13, 405.54it/s]
Generating Sentence: 100%|██████████| 50/50 [00:01<00:00, 36.49it/s]


Sentences Generated + Stored 🟢


Generating fear Matix: 5429it [00:13, 401.88it/s]
Generating Sentence: 100%|██████████| 50/50 [00:01<00:00, 44.13it/s]


Sentences Generated + Stored 🟢


Generating surprise Matix: 5429it [00:13, 412.59it/s]
Generating Sentence: 100%|██████████| 50/50 [00:00<00:00, 52.08it/s]

Sentences Generated + Stored 🟢





## Training SVC Classifier

In [88]:
# Forward slashes on purpose: the previous backslash-separated literals relied on
# invalid escape sequences (backslash-d, backslash-l) being passed through unchanged,
# and only resolved on Windows. Forward slashes work on every platform and match the
# corpus path used in the model-setup cell.
corpus_file = "../data/corpus.txt"
labels_file = "../data/labels.txt"

# Fit TF-IDF on the corpus; iterating the open file yields one document per line.
with open(corpus_file) as corpus_fh:
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(corpus_fh)
    fts = vectorizer.get_feature_names_out()

# One label per line; presumably line k labels corpus line k — verify against the data files.
with open(labels_file) as labels_fh:
    y = labels_fh.read().splitlines()

# Fail early, with a readable message, if the two files are out of step.
assert X.shape[0] == len(y), f"{X.shape[0]} documents but {len(y)} labels"

In [145]:
# Search space kept for the grid search, which is currently disabled.
parameters = {
    'kernel': ('linear', 'rbf'),
    'C': [1, 3, 5],
    'gamma': [0.1, 0.05, 0.01],
    "degree": [2, 3, 5],
    'tol': [0.001, 0.0001],
}
# Disabled alternative: clf = GridSearchCV(svc, parameters, n_jobs=4)
# Train a single RBF-kernel SVC on the TF-IDF features and labels instead.
clf = SVC(C=1.0, kernel='rbf', degree=3, gamma='scale')
clf.fit(X, y)

In [146]:
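# Only meaningful when `clf` is a fitted GridSearchCV (the grid search is currently commented out):
# print(clf.best_params_)
# print(clf.best_score_)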

In [147]:
# Accuracy on the same X, y the classifier was fitted on: a training-set figure,
# so it is optimistic and says nothing about generalisation.
# score() performs the prediction itself, so the former standalone clf.predict(X)
# call, whose result was discarded, has been dropped.
print(clf.score(X, y))

0.9966666666666667


In [148]:
# Load the generated sentences for each emotion into dataset[emotion], one list entry per line.
emotions = ["sadness", "joy", "love", "anger", "fear", "surprise"]
dataset = {}
for emotion in emotions:
    # Forward slashes on purpose: the previous backslash-separated literal relied on
    # invalid escape sequences (backslash-e, backslash-g) and only resolved on Windows.
    path = f"./emotion_text/gen_{emotion}.txt"
    with open(path, 'r') as file:
        dataset[emotion] = file.read().splitlines()

In [149]:
def emotion_accuracy(emotion, dataset, clf):
    """Score ``clf`` on the sentences stored under ``dataset[emotion]``.

    Every sentence is given ``emotion`` as its expected label, so the value
    returned by ``clf.score`` reflects how often the classifier assigns that
    label to these sentences.

    Args:
        emotion: Key into ``dataset``; also used as the expected label.
        dataset: Mapping from emotion name to a list of sentences.
        clf: Fitted estimator exposing ``score(X, y)``.

    Returns:
        Whatever ``clf.score`` returns for the transformed sentences.

    Note:
        Relies on the notebook-level ``vectorizer`` to transform the sentences.
    """
    sentences = dataset[emotion]
    features = vectorizer.transform(sentences)
    expected_labels = [emotion] * len(sentences)
    return clf.score(features, expected_labels)

# Score each emotion exactly once and reuse the values for both the per-emotion
# lines and the average (previously every emotion was vectorised and scored twice).
accuracies = {emotion: emotion_accuracy(emotion, dataset, clf) for emotion in emotions}
for emotion, accuracy in accuracies.items():
    print(emotion, accuracy)

# Unweighted mean of the per-emotion scores.
print("Average accuracy: ", np.mean(list(accuracies.values())))

sadness 0.7
joy 0.64
love 0.86
anger 0.42
fear 0.4
surprise 0.8
Average accuracy:  0.6366666666666666
