## Imports

In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pickle
from bigram import BigramLM
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from utils import emotion_scores
import warnings
warnings.filterwarnings('ignore')



## Model Setup

In [4]:
corpus_file_path = '../data/corpus.txt'

# Read the content of the file
with open(corpus_file_path, 'r') as file:
    lines = file.readlines()

# Add a "$" sign to the beginning of each line
modified_lines = ['$ ' + line.strip() for line in lines]
joined_string = '\n'.join(modified_lines)

In [6]:
corpus=joined_string

In [30]:
model = BigramLM(corpus)
model.set_token()
model.calculate_bigrams()
laplace_matrix = model.get_laplace_matrix()
kn_matrix = model.set_kn_matrix(d=0.75)

Model Initialized 🟢
Tokens Set 🟢


Populating Bigram Matrix...: 100%|██████████| 48931/48931 [00:00<00:00, 464032.78it/s]


All Matrices Calculated 🟢


Calculating Kneser-Ney Matrix...: 100%|██████████| 5430/5430 [01:41<00:00, 53.57it/s]


## Generating Emotion for all non-zero Bigrams

In [28]:
tokens = model.get_token()
count_matrix = model.get_count_matrix()

In [29]:
emotions = {}
for i, token in tqdm(enumerate(tokens)):
    for j, token2 in enumerate(tokens):
        if count_matrix[i][j] > 0 and token!='$' and token2!="$":
            emotions[(token, token2)] = emotion_scores(str(token + " " + token2))
        if count_matrix[i][j] > 0 and token=='$' and token2!="$":
            emotions[(token, token2)]= emotion_scores(str(token2))
        if count_matrix[i][j] > 0 and token!='$' and token2=="$":
            emotions[(token, token2)]= emotion_scores(str(token))

with open('emotions.pkl', 'wb') as f:
    pickle.dump(emotions, f)

5430it [12:24,  7.29it/s]


In [21]:
emo = pickle.load(open("pickle_files/emotions.pkl", "rb"))

In [22]:
emo

{('$', 'a'): [{'label': 'sadness', 'score': 0.136471226811409},
  {'label': 'joy', 'score': 0.45315712690353394},
  {'label': 'love', 'score': 0.02117856778204441},
  {'label': 'anger', 'score': 0.3249291479587555},
  {'label': 'fear', 'score': 0.04795915633440018},
  {'label': 'surprise', 'score': 0.016304833814501762}],
 ('$', 'during'): [{'label': 'sadness', 'score': 0.136471226811409},
  {'label': 'joy', 'score': 0.45315712690353394},
  {'label': 'love', 'score': 0.02117856778204441},
  {'label': 'anger', 'score': 0.3249291479587555},
  {'label': 'fear', 'score': 0.04795915633440018},
  {'label': 'surprise', 'score': 0.016304833814501762}],
 ('$', 'heated'): [{'label': 'sadness', 'score': 0.136471226811409},
  {'label': 'joy', 'score': 0.45315712690353394},
  {'label': 'love', 'score': 0.02117856778204441},
  {'label': 'anger', 'score': 0.3249291479587555},
  {'label': 'fear', 'score': 0.04795915633440018},
  {'label': 'surprise', 'score': 0.016304833814501762}],
 ('$', 'i'): [{'la

## Generating 50 Sentences each of every emotion


In [34]:
emotions=["sadness", "joy", "love", "anger", "fear", "surprise"]
for i in emotions:
    # used count matrix use some other matrix like laplace or kneser-ney
    model.generate_sentences(model.get_kn_matrix(), emotion=i, word_limit=15, no_of_sentences=50, alpha = 1, beta = 0.5)

Generating sadness Matix: 5430it [00:18, 288.36it/s]
Generating Sentence: 100%|██████████| 50/50 [00:02<00:00, 23.03it/s]


Sentences Generated + Stored 🟢


Generating joy Matix: 5430it [00:19, 276.81it/s]
Generating Sentence: 100%|██████████| 50/50 [00:02<00:00, 24.07it/s]


Sentences Generated + Stored 🟢


Generating love Matix: 5430it [00:18, 288.77it/s]
Generating Sentence: 100%|██████████| 50/50 [00:02<00:00, 22.47it/s]


Sentences Generated + Stored 🟢


Generating anger Matix: 5430it [00:19, 272.87it/s]
Generating Sentence: 100%|██████████| 50/50 [00:01<00:00, 26.58it/s]


Sentences Generated + Stored 🟢


Generating fear Matix: 5430it [00:18, 295.44it/s]
Generating Sentence: 100%|██████████| 50/50 [00:02<00:00, 22.92it/s]


Sentences Generated + Stored 🟢


Generating surprise Matix: 5430it [00:17, 312.09it/s]
Generating Sentence: 100%|██████████| 50/50 [00:02<00:00, 23.95it/s]

Sentences Generated + Stored 🟢





## Training SVC Classifier

In [35]:
corpus_file = "..\data\corpus.txt"
labels_file = "..\data\labels.txt"
with open(corpus_file) as f:
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(f)
    fts = vectorizer.get_feature_names_out()
# print(X)

with open(labels_file) as l:
    y = l.read().splitlines()

In [36]:
parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 3, 5], 'gamma': [0.1, 0.05, 0.01], "degree": [2, 3, 5], 'tol': [0.001, 0.0001]}
clf = SVC(C=1.0, kernel='rbf', degree=3, gamma='scale')
# clf = GridSearchCV(svc, parameters, n_jobs=4)
clf.fit(X, y)

In [146]:
# print(clf.best_params_)
# print(clf.best_score_)

In [37]:
clf.predict(X)
print(clf.score(X, y))

0.9966666666666667


In [38]:
emotions=["sadness", "joy", "love", "anger", "fear", "surprise"]
dataset = {}
for emotion in emotions:
    path = ".\emotion_text\gen_"+emotion+".txt"
    with open(path, 'r') as file:
        dataset[emotion] = file.read().splitlines()

In [39]:
def emotion_accuracy(emotion, dataset, clf):
    X = vectorizer.transform(dataset[emotion])
    y = [emotion]*len(dataset[emotion])
    return clf.score(X, y)

for emotion in emotions:
    print(emotion, emotion_accuracy(emotion, dataset, clf)) 
   
print("Average accuracy: ", np.mean([emotion_accuracy(emotion, dataset, clf) for emotion in emotions]))

sadness 0.58
joy 0.7
love 0.88
anger 0.36
fear 0.54
surprise 0.78
Average accuracy:  0.64
