# Imports

In [8]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pickle
import bigram
from utils import emotion_scores
import warnings
warnings.filterwarnings('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Model Setup

In [3]:
corpus_file_path = '../data/corpus.txt'
corpus = open(corpus_file_path, 'r').read()
print(len(corpus))

238101


In [125]:
model=bigram.BigramLM(corpus)
model.set_token()
model.calculate_bigrams()
laplace_matrix = model.get_laplace_matrix()
kn_matrix = model.set_kn_matrix(d = 0.70)

Model Initialized 🟢
Tokens Set 🟢


Populating Bigram Matrix...: 100%|██████████| 46531/46531 [00:00<00:00, 937091.14it/s]


All Matrices Calculated 🟢


Calculating Kneser-Ney Matrix...: 100%|██████████| 5429/5429 [00:26<00:00, 204.78it/s]


# Generating Emotion for all non-zero Bigrams

In [89]:
tokens = model.get_token()
count_matrix = model.get_count_matrix()

In [90]:
emotions = {}
for i, token in tqdm(enumerate(tokens)):
    for j, token2 in enumerate(tokens):
        if count_matrix[i][j] > 0:
            emotions[(token, token2)] = emotion_scores(str(token + " " + token2))

with open('emotions.pkl', 'wb') as f:
    pickle.dump(emotions, f)

ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

# Generating 50 Sentences each of every emotion


In [126]:
emotions=["sadness","joy","love","anger","fear","surprise"]
for i in emotions:
    #used count matrix use some other matrix like laplace or kneser-nay
    model.generate_sentences(model.get_kn_matrix(),emotion=i,word_limit=15,no_of_sentences=50,alpha = 0.65,beta = 0.35)

Generating sadness Matix: 5429it [00:08, 629.21it/s]
Generating Sentence: 100%|██████████| 50/50 [00:01<00:00, 40.13it/s]


Sentences Generated + Stored 🟢


Generating joy Matix: 5429it [00:09, 562.15it/s]
Generating Sentence: 100%|██████████| 50/50 [00:00<00:00, 69.33it/s]


Sentences Generated + Stored 🟢


Generating love Matix: 5429it [00:08, 649.63it/s]
Generating Sentence: 100%|██████████| 50/50 [00:01<00:00, 43.15it/s]


Sentences Generated + Stored 🟢


Generating anger Matix: 5429it [00:09, 546.09it/s]
Generating Sentence: 100%|██████████| 50/50 [00:00<00:00, 75.28it/s]


Sentences Generated + Stored 🟢


Generating fear Matix: 5429it [00:10, 529.05it/s]
Generating Sentence: 100%|██████████| 50/50 [00:00<00:00, 66.86it/s]


Sentences Generated + Stored 🟢


Generating surprise Matix: 5429it [00:07, 693.46it/s]
Generating Sentence: 100%|██████████| 50/50 [00:00<00:00, 77.17it/s]

Sentences Generated + Stored 🟢





# Training SVC Classifier


In [21]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

In [137]:
corpus_file = "..\data\corpus.txt"
labels_file = "..\data\labels.txt"
with open(corpus_file) as f:
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(f)
    fts = vectorizer.get_feature_names_out()
print(X)

with open(labels_file) as l:
    y = l.read().splitlines()

  (0, 2930)	0.43084376016925796
  (0, 2174)	0.2848855643879678
  (0, 2171)	0.29425228557844385
  (0, 2632)	0.40925149096889096
  (0, 953)	0.38204844892597783
  (0, 3418)	0.32354395172168304
  (0, 739)	0.32721803519831894
  (0, 1653)	0.07607942149964549
  (0, 4271)	0.34006065950204906
  (1, 3245)	0.28043969593136264
  (1, 2608)	0.17304821946074012
  (1, 4166)	0.2749630976018053
  (1, 3049)	0.3318199702304946
  (1, 5093)	0.2942396215728463
  (1, 3624)	0.25201125961701804
  (1, 2016)	0.3318199702304946
  (1, 4928)	0.2749630976018053
  (1, 4732)	0.3318199702304946
  (1, 4536)	0.30339153391615
  (1, 2467)	0.27882214637965586
  (1, 2648)	0.2942396215728463
  (1, 1653)	0.058593563864655224
  (2, 3278)	0.21368799149857234
  (2, 3628)	0.20259410773182832
  (2, 956)	0.20259410773182832
  :	:
  (2395, 1653)	0.09678784003363007
  (2396, 4036)	0.33238746091528043
  (2396, 1082)	0.4728322324776654
  (2396, 1325)	0.4209982460275356
  (2396, 3570)	0.31671943681872267
  (2396, 413)	0.3927566631676539
 

In [138]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [139]:
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10], 'gamma':[0.1, 0.01], "degree": [2,3] }
svc = SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(X, y)

In [140]:
print(clf.best_params_)
print(clf.best_score_)

{'C': 1, 'degree': 2, 'gamma': 0.1, 'kernel': 'linear'}
0.8141666666666667


In [141]:
emotions=["sadness","joy","love","anger","fear","surprise"]
dataset = {}
for emotion in emotions:
    path = ".\emotion_text\gen_"+emotion+".txt"
    with open(path, 'r') as file:
        dataset[emotion] = file.read().splitlines()
            
        

In [142]:
def emotion_accuracy(emotion, dataset, clf):
    X = vectorizer.transform(dataset[emotion])
    y = [emotion]*len(dataset[emotion])
    return clf.score(X, y)

for emotion in emotions:
    print(emotion, emotion_accuracy(emotion, dataset, clf)) 
   
print("Average accuracy: ", np.mean([emotion_accuracy(emotion, dataset, clf) for emotion in emotions]))

sadness 0.62
joy 0.48
love 0.96
anger 0.42
fear 0.44
surprise 0.86
Average accuracy:  0.63
