## Imports

In [1]:
%load_ext autoreload
%autoreload 2
import os
import pickle
import warnings

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from tqdm import tqdm

from bigram import BigramLM
from utils import emotion_scores
warnings.filterwarnings('ignore')



## Model Setup

In [76]:
corpus_file_path = '../data/corpus.txt'

# Pull the raw corpus into memory, one list entry per line of the file.
with open(corpus_file_path, 'r') as file:
    lines = list(file)

# Trim surrounding whitespace from every line and prepend the "$" marker,
# then stitch the marked lines back into a single newline-separated string.
stripped = (raw.strip() for raw in lines)
modified_lines = ['$ ' + text for text in stripped]
joined_string = '\n'.join(modified_lines)

In [77]:
# Expose the "$"-marked text under the name passed to BigramLM.
corpus = joined_string

In [78]:
# Build the bigram language model over the "$"-marked corpus string.
model = BigramLM(corpus)
# The calls below run in this order in the recorded output (tokens are set
# before the bigram matrix is populated); keep the sequence as is.
model.set_token()
model.calculate_bigrams()
# Keep a handle to the Laplace matrix for later cells.
laplace_matrix = model.get_laplace_matrix()
# d=0.75 is presumably the Kneser-Ney absolute discount — TODO confirm in bigram.py.
# NOTE(review): assumes set_kn_matrix returns the matrix it builds; later cells
# fetch it through model.get_kn_matrix() instead of using kn_matrix.
kn_matrix = model.set_kn_matrix(d=0.75)

Model Initialized 🟢
Tokens Set 🟢


Populating Bigram Matrix...: 100%|██████████| 48931/48931 [00:00<00:00, 493398.30it/s]


All Matrices Calculated 🟢


Calculating Kneser-Ney Matrix...: 100%|██████████| 5430/5430 [00:57<00:00, 94.11it/s] 


## Generating Emotion Scores for All Non-Zero Bigrams

In [80]:
# Fetch the model's vocabulary and bigram counts for the emotion-scoring loop.
# NOTE(review): the scoring loop indexes count_matrix[i][j] with i, j being
# positions in `tokens` — presumably row = first word, column = second word; verify.
tokens = model.get_token()
count_matrix = model.get_count_matrix()

In [81]:
# Echo the vocabulary as a sanity check.
# NOTE(review): this displays the entire token list; a slice such as tokens[:20]
# would keep the notebook output short.
tokens

['$',
 'a',
 'aa',
 'aahhh',
 'abandoning',
 'abilities',
 'ability',
 'abit',
 'able',
 'abound',
 'about',
 'above',
 'absolute',
 'absolutely',
 'abt',
 'abundance',
 'abused',
 'abuses',
 'abusive',
 'abyss',
 'academic',
 'academics',
 'accelerated',
 'accent',
 'accept',
 'acceptable',
 'acceptance',
 'accepted',
 'accepting',
 'accessaries',
 'accessibility',
 'accident',
 'accidentally',
 'accomplished',
 'accomplishing',
 'accordance',
 'according',
 'account',
 'accusing',
 'accustomed',
 'ace',
 'ache',
 'ached',
 'achieve',
 'achieved',
 'achieving',
 'aching',
 'acne',
 'acquainted',
 'acronym',
 'across',
 'act',
 'action',
 'actions',
 'active',
 'actively',
 'activist',
 'activities',
 'activity',
 'acts',
 'actual',
 'actually',
 'ad',
 'adams',
 'adapt',
 'add',
 'added',
 'addiction',
 'addictive',
 'adjust',
 'administration',
 'admiration',
 'admire',
 'admired',
 'admiring',
 'admit',
 'admitting',
 'adn',
 'adolescence',
 'adomen',
 'adopt',
 'adore',
 'adoring',

In [29]:
# Score the emotion of every bigram that occurs at least once in the corpus.
# "$" is the marker prepended to each corpus line, so it is never part of the
# text handed to emotion_scores: a bigram with "$" on one side is scored on its
# real word alone, and a ("$", "$") pair is skipped.
emotions = {}
for i, token in tqdm(enumerate(tokens), total=len(tokens)):
    # Visit only the columns with a positive count instead of testing every
    # column of the row in Python.
    for j in np.flatnonzero(np.asarray(count_matrix[i]) > 0):
        token2 = tokens[j]
        if token != '$' and token2 != '$':
            text = token + " " + token2
        elif token != '$':
            text = token
        elif token2 != '$':
            text = token2
        else:
            continue
        emotions[(token, token2)] = emotion_scores(str(text))

# Persist to the same location the loading cell reads from
# ("pickle_files/emotions.pkl"); the old bare 'emotions.pkl' was never read back.
os.makedirs('pickle_files', exist_ok=True)
with open('pickle_files/emotions.pkl', 'wb') as f:
    pickle.dump(emotions, f)

5430it [12:24,  7.29it/s]


In [71]:
# Reload the cached bigram emotion scores. The context manager closes the file
# handle, which the previous bare open() call left open.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open("pickle_files/emotions.pkl", "rb") as f:
    emo = pickle.load(f)

## Generating 50 Sentences for Each Emotion


In [82]:
# Ask the model for 50 sentences (word_limit=15) per target emotion.
# The Kneser-Ney matrix is passed in here; another matrix (e.g. the Laplace
# one) can be substituted for the first argument.
emotions = ["sadness", "joy", "love", "anger", "fear", "surprise"]
for emotion in emotions:
    model.generate_sentences(
        model.get_kn_matrix(),
        emotion=emotion,
        word_limit=15,
        no_of_sentences=50,
        alpha=1,
        beta=0.5,
    )

Generating sadness Matix: 5430it [00:19, 280.76it/s]
Generating Sentence: 100%|██████████| 50/50 [00:02<00:00, 20.26it/s]


Sentences Generated + Stored 🟢


Generating joy Matix: 5430it [00:20, 261.55it/s]
Generating Sentence: 100%|██████████| 50/50 [00:02<00:00, 23.90it/s]


Sentences Generated + Stored 🟢


Generating love Matix: 5430it [00:19, 277.66it/s]
Generating Sentence: 100%|██████████| 50/50 [00:02<00:00, 22.76it/s]


Sentences Generated + Stored 🟢


Generating anger Matix: 5430it [00:18, 291.95it/s]
Generating Sentence: 100%|██████████| 50/50 [00:02<00:00, 24.36it/s]


Sentences Generated + Stored 🟢


Generating fear Matix: 5430it [00:20, 269.13it/s]
Generating Sentence: 100%|██████████| 50/50 [00:01<00:00, 25.35it/s]


Sentences Generated + Stored 🟢


Generating surprise Matix: 5430it [00:20, 260.45it/s]
Generating Sentence: 100%|██████████| 50/50 [00:02<00:00, 22.00it/s]

Sentences Generated + Stored 🟢





## Training SVC Classifier

In [35]:
# Forward slashes resolve on every platform and avoid the invalid "\d" / "\l"
# escape sequences of the previous backslash paths; they also match the corpus
# path used when building the bigram model.
corpus_file = "../data/corpus.txt"
labels_file = "../data/labels.txt"

# Fit TF-IDF on the corpus; iterating the open file yields one document per line.
with open(corpus_file) as f:
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(f)
    fts = vectorizer.get_feature_names_out()

# Read the labels, one per line.
with open(labels_file) as labels_handle:
    y = labels_handle.read().splitlines()

# Fail early with a clear message if the two files are out of step.
assert X.shape[0] == len(y), "corpus and labels files must have the same number of lines"

In [36]:
# Candidate hyper-parameter grid, kept for the (currently disabled) grid search.
parameters = {
    'kernel': ('linear', 'rbf'),
    'C': [1, 3, 5],
    'gamma': [0.1, 0.05, 0.01],
    "degree": [2, 3, 5],
    'tol': [0.001, 0.0001],
}

# Fit one RBF-kernel SVC on the TF-IDF features rather than searching the grid.
clf = SVC(kernel='rbf', C=1.0, gamma='scale', degree=3)
# clf = GridSearchCV(svc, parameters, n_jobs=4)
clf.fit(X, y)

In [146]:
# print(clf.best_params_)
# print(clf.best_score_)

In [37]:
# Accuracy on the same data the classifier was fitted on, so this is a training
# score rather than a held-out estimate. The former bare clf.predict(X) call
# was dropped: its result was discarded and never displayed.
print(clf.score(X, y))

0.9966666666666667


In [83]:
emotions = ["sadness", "joy", "love", "anger", "fear", "surprise"]

# Read the gen_<emotion>.txt file for each emotion, one sentence per line
# (presumably the files written by the sentence-generation cell — verify).
# Forward slashes resolve on every platform and avoid the invalid "\e" / "\g"
# escape sequences of the previous backslash path.
dataset = {}
for emotion in emotions:
    path = "./emotion_text/gen_" + emotion + ".txt"
    with open(path, 'r') as file:
        dataset[emotion] = file.read().splitlines()

In [84]:
def emotion_accuracy(emotion, dataset, clf):
    """Return ``clf.score`` for the sentences stored under ``dataset[emotion]``.

    The sentences are transformed with the notebook-level ``vectorizer`` and
    every one is assigned ``emotion`` as its expected label.
    """
    sentences = dataset[emotion]
    features = vectorizer.transform(sentences)
    expected_labels = [emotion for _ in range(len(sentences))]
    return clf.score(features, expected_labels)

# Score each emotion once and reuse the results for both the per-emotion lines
# and the average; previously every accuracy was computed twice.
accuracies = {emotion: emotion_accuracy(emotion, dataset, clf) for emotion in emotions}

for emotion, accuracy in accuracies.items():
    print(emotion, accuracy)

print("Average accuracy: ", np.mean(list(accuracies.values())))

sadness 0.74
joy 0.68
love 0.96
anger 0.42
fear 0.58
surprise 0.84
Average accuracy:  0.7033333333333333
