## Imports

In [13]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pickle
from bigram import BigramLM
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from utils import emotion_scores
import warnings
warnings.filterwarnings('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Model Setup

In [5]:
corpus_file_path = '../data/corpus.txt'

with open(corpus_file_path, 'r') as file:
    lines = file.readlines()

modified_lines = ['$ ' + line.strip() for line in lines]
joined_string = '\n'.join(modified_lines)
corpus = joined_string

In [6]:
model = BigramLM(corpus)
model.set_tokens()
model.calculate_bigrams()
model.set_laplace_matrix()
model.set_kn_matrix(d=0.75)

Model initialized 🟢
Tokens set 🟢


Populating bigram matrix...: 100%|██████████| 48931/48931 [00:00<00:00, 539183.28it/s]


All matrices calculated 🟢
Calculating Kneser-Ney matrix...
Kneser-Ney matrix calculated 🟢


In [175]:
# temp=model.get_normal_matrix()
# for i in range(5430):
#     if temp[i,0]==1:
#         print(model.get_tokens()[i])

## Generating Emotion for all non-zero Bigrams

In [80]:
tokens = model.get_tokens()
count_matrix = model.get_count_matrix()

In [29]:
emotions = {}
for i, token in tqdm(enumerate(tokens)):
    for j, token2 in enumerate(tokens):
        if count_matrix[i][j] > 0 and token!='$' and token2!="$":
            emotions[(token, token2)] = emotion_scores(str(token + " " + token2))
        if count_matrix[i][j] > 0 and token=='$' and token2!="$":
            emotions[(token, token2)]= emotion_scores(str(token2))
        if count_matrix[i][j] > 0 and token!='$' and token2=="$":
            emotions[(token, token2)]= emotion_scores(str(token))

with open('emotions.pkl', 'wb') as f:
    pickle.dump(emotions, f)

5430it [12:24,  7.29it/s]


In [71]:
emo = pickle.load(open("pickle_files/emotions.pkl", "rb"))

## Generating 50 Sentences each of every emotion


In [44]:
emotions=["sadness", "joy", "love", "anger", "fear", "surprise"]
for i in emotions:
    # used count matrix use some other matrix like laplace or kneser-ney
    model.generate_sentences(model.get_kn_matrix(), emotion=i, word_limit=30, no_of_sentences=50, alpha = 1, beta = 4)

Generating sadness matrix: 5430it [00:18, 287.89it/s]
Generating sentences: 100%|██████████| 50/50 [00:03<00:00, 13.74it/s]


Sentences generated and stored 🟢


Generating joy matrix: 5430it [00:17, 304.53it/s]
Generating sentences: 100%|██████████| 50/50 [00:04<00:00, 12.26it/s]


Sentences generated and stored 🟢


Generating love matrix: 5430it [00:17, 308.38it/s]
Generating sentences: 100%|██████████| 50/50 [00:02<00:00, 18.73it/s]


Sentences generated and stored 🟢


Generating anger matrix: 5430it [00:16, 332.74it/s]
Generating sentences: 100%|██████████| 50/50 [00:03<00:00, 16.42it/s]


Sentences generated and stored 🟢


Generating fear matrix: 5430it [00:18, 299.67it/s]
Generating sentences: 100%|██████████| 50/50 [00:03<00:00, 13.55it/s]


Sentences generated and stored 🟢


Generating surprise matrix: 5430it [00:16, 328.20it/s]
Generating sentences: 100%|██████████| 50/50 [00:03<00:00, 15.89it/s]

Sentences generated and stored 🟢





## Training SVC Classifier

In [36]:
corpus_file = "..\data\corpus.txt"
labels_file = "..\data\labels.txt"

with open(corpus_file) as f:
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(f)

with open(labels_file) as l:
    y = l.read().splitlines()

In [37]:
parameters = {'kernel': ('poly', 'rbf'), 'C': [2, 3], 'gamma': ['auto', 'scale'], "degree": [2, 3]}
clf = GridSearchCV(SVC(), parameters, n_jobs=4, refit=True)
clf.fit(X, y)
best_model=clf.best_estimator_
print("best parameters:", clf.best_params_)
print("best score:", clf.best_score_)

best parameters: {'C': 2, 'degree': 2, 'gamma': 'scale', 'kernel': 'rbf'}
best score: 0.70875


In [38]:
print(best_model.score(X, y))

0.9991666666666666


In [48]:
emotions=["sadness", "joy", "love", "anger", "fear", "surprise"]
dataset = {}
for emotion in emotions:
    path = ".\emotion_text\gen_"+emotion+".txt"
    with open(path, 'r') as file:
        dataset[emotion] = file.read().splitlines()

In [49]:
def emotion_accuracy(emotion, dataset, best_model):
    X = vectorizer.transform(dataset[emotion])
    y = [emotion]*len(dataset[emotion])
    return best_model.score(X, y)

for emotion in emotions:
    print(emotion, emotion_accuracy(emotion, dataset, best_model)) 
   
print("Average accuracy: ", np.mean([emotion_accuracy(emotion, dataset, best_model) for emotion in emotions]))

sadness 0.96
joy 0.66
love 1.0
anger 0.46
fear 0.86
surprise 1.0
Average accuracy:  0.8233333333333333


In [57]:
matrix = model.get_count_matrix()
matrix = matrix[1:, 1:]
top_5_indices = np.argpartition(matrix.flatten(), -5)[-5:]

# Sort indices and corresponding values together
top_5_indices_sorted = top_5_indices[np.argsort(matrix.flatten()[top_5_indices])]
top_5_values = matrix.flatten()[top_5_indices_sorted]

# Convert flattened indices back to matrix indices
top_5_row_indices, top_5_col_indices = np.unravel_index(top_5_indices_sorted, matrix.shape)

# Print results
print("Top 5 max values:", top_5_values)
print("Their indices:", list(zip(top_5_row_indices, top_5_col_indices)))

Top 5 max values: [ 164.  201.  247.  293. 1017.]
Their indices: [(4767, 2322), (2322, 5194), (1736, 2751), (2322, 152), (2322, 1736)]


In [58]:
for i in range(5):
    print(model.get_tokens()[top_5_row_indices[i]], model.get_tokens()[top_5_col_indices[i]])

thanksgiving hypocrite
hypocrite wars
feefyefo lights
hypocrite always
hypocrite feefyefo
