In [None]:
!pip install transformers
!pip install numpy
!pip install tensorflow
!pip install scikit-learn

In [None]:
import tensorflow
import numpy as np
from transformers import pipeline
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Module-level emotion classifier used by BigramLM_efficient.emotion_scores.
# return_all_scores=True makes every call return one score entry per label
# instead of only the top label; the language model below indexes those
# entries by position.
classifier = pipeline(
    "text-classification",
    model='bhadresh-savani/distilbert-base-uncased-emotion',
    return_all_scores=True,
)

In [None]:
class BigramLM_efficient:
    """Bigram language model over a whitespace-tokenised corpus, stored as dense NumPy matrices.

    Typical use: call ``learn(path)`` to read the corpus and count bigrams,
    then ``build_probability_matrix(mode)`` to fill ``bigram_probabilities``,
    and/or ``generate_samples(emotion_id)`` to sample emotion-biased sentences.

    All probability estimates divide by the *unigram* count of the first word
    (``word_count_matrix``), not by the row sum of ``bigram_counts``.
    """

    # Candidate sentence-initial words for generate_sample and their sampling weights.
    _START_WORDS = ('i', 'im', 'ive')
    _START_WEIGHTS = (0.7, 0.2, 0.1)

    def __init__(self):
        self.vocab_size = 0
        self.vocabulary_index = {}        # word -> integer index
        self.word_count = {}              # word -> number of occurrences
        self.index_vocabulary = {}        # integer index -> word
        self.bigram_counts = None         # (V, V) int matrix, filled by learn()
        self.bigram_probabilities = None  # (V, V) float matrix, filled by build_probability_matrix()
        self.dataset = None               # list of token lists, filled by build_corpus()
        self.word_count_matrix = None     # length-V array of unigram counts, filled by learn()

    def build_corpus(self, file_path):
        """Read ``file_path`` (UTF-8) and store one whitespace-split token list per line."""
        with open(file_path, 'r', encoding='utf-8') as file:
            sentences = [line.strip().split() for line in file]
        self.dataset = sentences

    def build_vocab(self):
        """Assign an index to every distinct word in ``self.dataset`` and count occurrences.

        Indices are handed out in order of first appearance, so the insertion
        order of ``word_count`` matches the index order.
        """
        for line in self.dataset:
            for word in line:
                if word not in self.vocabulary_index:
                    self.vocabulary_index[word] = self.vocab_size
                    self.index_vocabulary[self.vocab_size] = word
                    self.word_count[word] = 0
                    self.vocab_size += 1
                self.word_count[word] += 1

    def build_probability_matrix(self, mode, discount=0, emotion_id=0):
        """Fill ``self.bigram_probabilities`` using the estimator selected by ``mode``.

        Args:
            mode: 0 = unsmoothed relative frequencies, 1 = Laplace (add-one),
                2 = Kneser-Ney style discounting. Any other value yields an
                all-zero matrix (historical behaviour, kept for compatibility).
            discount: Absolute discount, used only when ``mode == 2``.
            emotion_id: Unused; accepted for interface compatibility.

        Raises:
            RuntimeError: If ``learn()`` has not been called yet.
        """
        if self.bigram_counts is None or self.word_count_matrix is None:
            raise RuntimeError("learn() must be called before build_probability_matrix()")

        if mode == 0:
            self.bigram_probabilities = self.calculate_probability_matrix()
        elif mode == 1:
            self.bigram_probabilities = self.laplace_smoothing_matrix()
        elif mode == 2:
            self.bigram_probabilities = self.kneser_ney_smoothing_matrix(discount=discount)
        else:
            # Only allocate the zero matrix when it is actually the result.
            self.bigram_probabilities = np.zeros((self.vocab_size, self.vocab_size), dtype=float)

    def calculate_probability_matrix(self):
        """Return count(w1, w2) / count(w1) for every pair, as a (V, V) float matrix."""
        return self.bigram_counts / self.word_count_matrix[:, np.newaxis]

    def laplace_smoothing_matrix(self):
        """Return the add-one estimate (count(w1, w2) + 1) / (count(w1) + V)."""
        return (self.bigram_counts + 1) / (self.word_count_matrix[:, np.newaxis] + self.vocab_size)

    def kneser_ney_smoothing_matrix(self, discount=0):
        """Return an absolute-discounting estimate with a continuation back-off term.

        For each pair the result is
        ``max(count(w1, w2) - discount, 0) / count(w1)
        + (discount * #distinct successors of w1 / count(w1)) * P_cont(w2)``,
        where ``P_cont(w2)`` is the number of distinct predecessors of ``w2``
        divided by the number of distinct bigram types.
        """
        seen = self.bigram_counts > 0  # computed once; reused for all three type counts
        discounted_probs = np.maximum(self.bigram_counts - discount, 0) / self.word_count_matrix[:, np.newaxis]
        alpha_word1 = (discount * np.sum(seen, axis=1)) / self.word_count_matrix
        # max(..., 1) avoids 0/0 = NaN when the corpus contains no bigram at all.
        cont_probs = np.sum(seen, axis=0) / max(np.sum(seen), 1)
        return discounted_probs + alpha_word1[:, np.newaxis] * cont_probs

    def calculate_probability_emotion_row(self, first_wrod, emotion_id=0):
        """Return unnormalised next-word weights for ``first_wrod`` biased towards one emotion.

        For every word that was observed after ``first_wrod`` the weight is
        ``count(w1, w2) / count(w1)`` plus the classifier score of the text
        ``"w1 w2"`` at position ``emotion_id``; every other word gets 0.

        Only the requested row and score column are computed, instead of
        dividing the whole (V, V) count matrix on every call.

        Args:
            first_wrod: The conditioning word; must be in the vocabulary.
            emotion_id: Position of the wanted emotion in the classifier output
                (assumes the classifier returns labels in a fixed order - confirm
                against the pipeline's label mapping).

        Returns:
            A length-V float array.

        Raises:
            KeyError: If ``first_wrod`` is not in the vocabulary.
        """
        row_index = self.vocabulary_index[first_wrod]
        count_row = self.bigram_counts[row_index, :]

        emotion_column = np.zeros(self.vocab_size)
        for second_word_index in np.nonzero(count_row)[0]:
            scores = self.emotion_scores(first_wrod + " " + self.index_vocabulary[second_word_index])
            emotion_column[second_word_index] = scores[emotion_id]['score']

        return count_row / self.word_count_matrix[row_index] + emotion_column

    def emotion_scores(self, sample):
        """Run the module-level ``classifier`` on ``sample`` and return the first element of its result."""
        emotion = classifier(sample)
        return emotion[0]

    def learn(self, file_path):
        """Load the corpus at ``file_path``, build the vocabulary and count all adjacent word pairs."""
        self.build_corpus(file_path)
        self.build_vocab()

        self.bigram_counts = np.zeros((self.vocab_size, self.vocab_size), dtype=int)
        # dict preserves insertion order, which build_vocab keeps aligned with the word indices.
        self.word_count_matrix = np.array(list(self.word_count.values()))

        for line in self.dataset:
            for index in range(len(line) - 1):
                first_word_index = self.vocabulary_index[line[index]]
                second_word_index = self.vocabulary_index[line[index + 1]]
                self.bigram_counts[first_word_index, second_word_index] += 1

    def generate_samples(self, emotion_id = 0, num_samples = 50):
        """Return a list of ``num_samples`` sentences produced by ``generate_sample(emotion_id)``."""
        return [self.generate_sample(emotion_id) for _ in range(num_samples)]

    def generate_sample(self, emotion_id = 0, max_length = 10):
        """Sample one sentence of at most ``max_length`` words biased towards ``emotion_id``.

        The first word is drawn from the class-level start words that occur in
        the vocabulary (weights renormalised if some are missing). Each next
        word is drawn from the normalised output of
        ``calculate_probability_emotion_row``; generation stops early when the
        current word has no observed successor. Uses NumPy's global RNG.

        Raises:
            ValueError: If none of the start words is in the vocabulary.
        """
        candidates = [
            (word, weight)
            for word, weight in zip(self._START_WORDS, self._START_WEIGHTS)
            if word in self.vocabulary_index
        ]
        if not candidates:
            raise ValueError(
                f"none of the start words {list(self._START_WORDS)} occurs in the vocabulary"
            )
        start_words = [word for word, _ in candidates]
        start_weights = [weight for _, weight in candidates]
        if len(candidates) < len(self._START_WORDS):
            # Renormalise only when a start word was dropped, so the common
            # case draws with exactly the original weights.
            total_weight = sum(start_weights)
            start_weights = [weight / total_weight for weight in start_weights]

        current_word = np.random.choice(start_words, p = start_weights)
        sample = [current_word]

        for _ in range(max_length - 1):
            probabilities = self.calculate_probability_emotion_row(current_word, emotion_id)

            if np.all(probabilities == 0):
                break

            probabilities /= probabilities.sum()

            next_word_index = np.random.choice(self.vocab_size, p = probabilities)
            next_word = self.index_vocabulary[next_word_index]

            sample.append(next_word)
            current_word = next_word

        return ' '.join(sample)

    def find_top_bigrams(self, num_top_bigrams=5):
        """Return the ``num_top_bigrams`` most probable bigrams as ``("w1 w2", probability)`` pairs.

        Pairs are ordered by descending probability; ties keep row-major
        (first word index, then second word index) order.

        Raises:
            RuntimeError: If ``build_probability_matrix()`` has not been called yet.
        """
        if self.bigram_probabilities is None:
            raise RuntimeError("build_probability_matrix() must be called before find_top_bigrams()")

        probabilities = np.asarray(self.bigram_probabilities)
        # A stable sort on the negated values gives descending order while
        # keeping equal probabilities in their original index order.
        order = np.argsort(-probabilities.ravel(), kind='stable')[:num_top_bigrams]

        top_bigrams = []
        for flat_index in order:
            i, j = divmod(int(flat_index), probabilities.shape[1])
            bigram = f"{self.index_vocabulary[i]} {self.index_vocabulary[j]}"
            top_bigrams.append((bigram, probabilities[i, j]))

        return top_bigrams

In [None]:
# Train the bigram model: learn() reads the corpus file, builds the vocabulary
# and fills the bigram count matrix.
corpus_path = 'corpus.txt'
bigram_model = BigramLM_efficient()
bigram_model.learn(corpus_path)

In [None]:
# Mode 0 selects the unsmoothed relative-frequency estimate.
bigram_model.build_probability_matrix(mode=0)
top_bigrams = bigram_model.find_top_bigrams(num_top_bigrams=5)

# Heading followed by one blank line, then one line per bigram.
print("Top 5 Bigrams (Before smoothing):", end="\n\n")
for bigram, prob in top_bigrams:
    print(f"Bigram: '{bigram}',   Probability: {prob:.4f}")

Top 5 Bigrams (Before smoothing):

Bigram: 'href http',   Probability: 1.0000
Bigram: 'tychelle to',   Probability: 1.0000
Bigram: 'hang out',   Probability: 1.0000
Bigram: 'nonexistent social',   Probability: 1.0000
Bigram: 'alex and',   Probability: 1.0000


In [None]:
# Mode 1 selects Laplace (add-one) smoothing.
bigram_model.build_probability_matrix(mode=1)
top_bigrams = bigram_model.find_top_bigrams(num_top_bigrams=5)

# Heading followed by one blank line, then one line per bigram.
print("Top 5 Bigrams (After Laplace smoothing):", end="\n\n")
for bigram, prob in top_bigrams:
    print(f"Bigram: '{bigram}',   Probability: {prob:.4f}")

Top 5 Bigrams (After Laplace smoothing):

Bigram: 'i feel',   Probability: 0.1104
Bigram: 'feel like',   Probability: 0.0351
Bigram: 'i am',   Probability: 0.0319
Bigram: 'that i',   Probability: 0.0265
Bigram: 'and i',   Probability: 0.0231


In [None]:
# Mode 2 selects the Kneser-Ney style estimate; 0.5 is the absolute discount.
bigram_model.build_probability_matrix(mode=2, discount=0.5)
top_bigrams = bigram_model.find_top_bigrams(num_top_bigrams=5)

# Heading followed by one blank line, then one line per bigram.
print("Top 5 Bigrams (After Kneser Ney smoothing):", end="\n\n")
for bigram, prob in top_bigrams:
    print(f"Bigram: '{bigram}',   Probability: {prob:.4f}")

Top 5 Bigrams (After Kneser Ney smoothing):

Bigram: 'href http',   Probability: 0.9800
Bigram: 'don t',   Probability: 0.9746
Bigram: 'didn t',   Probability: 0.9722
Bigram: 'sort of',   Probability: 0.9710
Bigram: 'supposed to',   Probability: 0.9456


In [None]:
# Emotion names; a name's position in this list is passed to the model as
# emotion_id (assumes this order matches the classifier's label order - confirm).
emotions = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']

# generate_sample draws from NumPy's global RNG; seed it so the generated
# files (and every metric computed from them later) are reproducible.
np.random.seed(42)

# Write 50 generated sentences per emotion to gen_<emotion>.txt, one per line.
for emotion_id, emotion_name in enumerate(emotions):
    generated_samples = bigram_model.generate_samples(emotion_id=emotion_id, num_samples=50)
    output_file = f'gen_{emotion_name}.txt'

    with open(output_file, 'w', encoding='utf-8') as file:
        file.writelines(f"{sample}\n" for sample in generated_samples)

In [None]:
# Training data: one text per line of corpus.txt, with its label on the same
# line number of labels.txt.
corpus_path = 'corpus.txt'
labels_path = 'labels.txt'

with open(corpus_path, 'r', encoding='utf-8') as file:
    texts = [line.strip() for line in file]

with open(labels_path, 'r', encoding='utf-8') as file:
    labels = [line.strip() for line in file]

# Texts and labels are paired by position, so fail early with a clear message
# if the two files do not line up.
if len(texts) != len(labels):
    raise ValueError(
        f"{corpus_path} has {len(texts)} lines but {labels_path} has {len(labels)}"
    )

X_train, y_train = texts, labels

# Test data: the sentences generated for each emotion, labelled with the
# emotion whose file they were read from.
emotions = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
testing_samples = []  # not used by the cells visible here; kept in case later cells read it

X_test = []
y_test = []

for emotion in emotions:
    emotion_file_path = f'gen_{emotion}.txt'
    with open(emotion_file_path, 'r', encoding='utf-8') as file:
        emotion_samples = [line.strip() for line in file]

    X_test.extend(emotion_samples)
    y_test.extend([emotion] * len(emotion_samples))

In [None]:
# Fit the TF-IDF vocabulary on the training texts only, then reuse it to
# vectorise the generated test sentences.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Baseline SVM with hand-picked hyperparameters.
# NOTE(review): per the scikit-learn docs gamma applies to the rbf/poly/sigmoid
# kernels, so it presumably has no effect with kernel='linear' - confirm.
svc_model = SVC(
    kernel='linear',
    C=120,
    gamma=0.002,
    break_ties=True,
    probability=True,
)
svc_model.fit(X_train_tfidf, y_train)

# Score the classifier on the generated sentences.
y_pred = svc_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy * 100)

Accuracy: 74.0


In [None]:
# Hyperparameter search space, split per kernel: gamma is only crossed with
# the rbf kernel, so the linear kernel is fitted once per C value instead of
# once per (C, gamma) pair.
param_grid = [
    {'kernel': ['linear'], 'C': [100, 115, 125]},
    {'kernel': ['rbf'], 'C': [100, 115, 125], 'gamma': [0.001, 0.007, 0.0096]},
]

svc_model = SVC(break_ties = True, probability = True)

# 5-fold cross-validation on the training set, selecting by accuracy.
grid_search = GridSearchCV(estimator = svc_model, param_grid = param_grid, cv = 5, scoring = 'accuracy')

grid_search.fit(X_train_tfidf, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Evaluate the selected model on the generated test sentences.
y_pred_grid = best_model.predict(X_test_tfidf)

accuracy_grid = accuracy_score(y_test, y_pred_grid)
classification_rep = classification_report(y_test, y_pred_grid)

print("Best Parameters:", best_params)
print("Accuracy with Grid Search:", accuracy_grid * 100)
print("Classification Report:\n", classification_rep)

Best Parameters: {'C': 115, 'gamma': 0.0096, 'kernel': 'rbf'}
Accuracy with Grid Search: 75.33333333333333
Classification Report:
               precision    recall  f1-score   support

       anger       0.84      0.42      0.56        50
        fear       0.89      0.68      0.77        50
         joy       0.59      0.68      0.63        50
        love       0.85      0.90      0.87        50
     sadness       0.59      0.86      0.70        50
    surprise       0.92      0.98      0.95        50

    accuracy                           0.75       300
   macro avg       0.78      0.75      0.75       300
weighted avg       0.78      0.75      0.75       300

