In [1]:
# !pip install transformers
# !pip install numpy
# !pip install tensorflow

In [81]:
import tensorflow
import numpy as np 
from transformers import pipeline

classifier = pipeline("text-classification",model='bhadresh-savani/distilbert-base-uncased-emotion', return_all_scores=True,)

Some layers from the model checkpoint at bhadresh-savani/distilbert-base-uncased-emotion were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at bhadresh-savani/distilbert-base-uncased-emotion and are newly initialized: ['dropout_79']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [82]:

class BigramLM:
    def __init__(self):
        self.vocab_size = 0
        self.vocabulary_index = {}
        self.word_count = {}
        self.index_vocabulary = {}
        self.bigram_counts = None
        self.bigram_probabilities = None
        self.dataset = None


    def build_corpus(self, file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            sentences = [line.strip().split() for line in file]
        self.dataset =  sentences
    
    def build_vocab(self):
        for line in self.dataset:
            for word in line:
                if word not in self.vocabulary_index:
                    self.vocabulary_index[word] = self.vocab_size
                    self.index_vocabulary[self.vocab_size] = word
                    self.word_count[word] = 0
                    self.vocab_size += 1
                self.word_count[word] += 1
    def learn(self, file_path):
        # Build vocabulary and initialize bigram counts
        self.build_corpus(file_path)
        self.build_vocab()

        self.bigram_counts = np.zeros((self.vocab_size, self.vocab_size), dtype=int)

        for line in self.dataset:
            for index in range(len(line) - 1):
                first_word_index = self.vocabulary_index[line[index]]
                second_word_index = self.vocabulary_index[line[index + 1]]
                self.bigram_counts[first_word_index, second_word_index] += 1

    def calculate_probability(self, word1, word2):
        return self.bigram_counts[self.vocabulary_index[word1], self.vocabulary_index[word2]]/self.word_count[word1]
    
    def laplace_smoothing(self, word1, word2):
        return (self.bigram_counts[self.vocabulary_index[word1], self.vocabulary_index[word2]] + 1)/(self.word_count[word1] + self.vocab_size)

    def kneser_ney_smoothing(self, word1, word2, discount = 0):
        discounted_prob = max(self.bigram_counts[self.vocabulary_index[word1], self.vocabulary_index[word2]]-discount, 0)/self.word_count[word1]
        alpha_word1 = (discount* np.sum(self.bigram_counts[self.vocabulary_index[word1], :] > 0))/self.word_count[word1]
        cont_prob = np.sum(self.bigram_counts[:, self.vocabulary_index[word2]] > 0)/np.sum(self.bigram_counts > 0)
        print("alpha_word1", alpha_word1)
        print("discounted_prob",discounted_prob )
        print("self.word_count[word1]", self.word_count[word1])
        print("np.sum(self.bigram_counts[self.vocabulary_index[word1], :] > 0))", np.sum(self.bigram_counts[self.vocabulary_index[word1], :] > 0))
        print("np.sum(self.bigram_counts[:, self.vocabulary_index[word2]] > 0)", np.sum(self.bigram_counts[:, self.vocabulary_index[word2]] > 0))
        print("np.sum(self.bigram_counts > 0)", np.sum(self.bigram_counts > 0))

        return discounted_prob + alpha_word1*cont_prob
    
    def emotion_scores(self, sample): 
        emotion=classifier(sample)
        return emotion[0]

    def calculate_probability_emotion(self, word1, word2, emotion_id):
        emo_prob = self.emotion_scores(word1 + " " +word2)
        return self.bigram_counts[self.vocabulary_index[word1], self.vocabulary_index[word2]]/self.word_count[word1] + emo_prob[emotion_id]['score']
    
    def build_probability_matrix(self, mode, discount = 0, emotion_id = 0):
        self.bigram_probabilities =  np.zeros((self.vocab_size, self.vocab_size), dtype=float)
        if mode == 0:
            for i in range(self.vocab_size):
                for j in range(self.vocab_size):
                    self.bigram_probabilities[i, j] =  self.calculate_probability(self.index_vocabulary[i], self.index_vocabulary[j])

        elif mode == 1:
            for i in range(self.vocab_size):
                for j in range(self.vocab_size):
                    self.bigram_probabilities[i, j] =  self.laplace_smoothing(self.index_vocabulary[i], self.index_vocabulary[j])
                    
        elif mode == 2:
            for i in range(self.vocab_size):
                for j in range(self.vocab_size):
                    print(i, j)
                    self.bigram_probabilities[i, j] =  self.kneser_ney_smoothing(self.index_vocabulary[i], self.index_vocabulary[j], discount= discount) 

        else:
            for i in range(self.vocab_size):
                for j in range(self.vocab_size):
                    print(i, j)
                    self.bigram_probabilities[i, j] =  self.calculate_probability_emotion(self.index_vocabulary[i], self.index_vocabulary[j], emotion_id= emotion_id)                              
# Display the formed corpus


In [79]:
bigramLM = BigramLM()
bigramLM.learn("corpus.txt")


In [68]:
total = 5429*5429

In [69]:
bigramLM.vocabulary_index["feel"]

3

In [83]:
bigramLM.bigram_counts[0,113]

4

In [84]:
total - np.count_nonzero(bigramLM.bigram_counts == 0)

24436

In [85]:
bigramLM.emotion_scores('violent')
# 0 -> sadness
# 1 -> joy
# 2 ->'love
# 3 -> 'anger
# 4-> fear
# 5-> surprise


[{'label': 'sadness', 'score': 0.0006333347409963608},
 {'label': 'joy', 'score': 0.00038153710193000734},
 {'label': 'love', 'score': 0.00023734646674711257},
 {'label': 'anger', 'score': 0.9974260926246643},
 {'label': 'fear', 'score': 0.0011390680447220802},
 {'label': 'surprise', 'score': 0.00018260569777339697}]

In [86]:
print(bigramLM.vocabulary_index)
print(bigramLM.word_count["i"])
print(bigramLM.vocab_size)

# print(bigramLM.bigram_counts[0,2])
# print(bigramLM.calculate_probability("i","i"))
# print(bigramLM.laplace_smoothing("i","here"))
print(bigramLM.kneser_ney_smoothing("i","here", discount=0.5))
# print(bigramLM.calculate_probability_emotion("i", "feel", 0))

3789
5429
alpha_word1 0.05806281340723146
discounted_prob 0.0
self.word_count[word1] 3789
np.sum(self.bigram_counts[self.vocabulary_index[word1], :] > 0)) 440
np.sum(self.bigram_counts[:, self.vocabulary_index[word2]] > 0) 32
np.sum(self.bigram_counts > 0) 24436
7.603576808935205e-05


In [100]:
class BigramLM_efficient:
    def __init__(self):
        self.vocab_size = 0
        self.vocabulary_index = {}
        self.word_count = {}
        self.index_vocabulary = {}
        self.bigram_counts = None
        self.bigram_probabilities = None
        self.dataset = None


    def build_corpus(self, file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            sentences = [line.strip().split() for line in file]
        self.dataset =  sentences
    
    def build_vocab(self):
        for line in self.dataset:
            for word in line:
                if word not in self.vocabulary_index:
                    self.vocabulary_index[word] = self.vocab_size
                    self.index_vocabulary[self.vocab_size] = word
                    self.word_count[word] = 0
                    self.vocab_size += 1
                self.word_count[word] += 1

    def build_probability_matrix(self, mode, discount=0, emotion_id=0):
        self.bigram_probabilities = np.zeros((self.vocab_size, self.vocab_size), dtype=float)

        if mode == 0:
            self.bigram_probabilities = self.calculate_probability_matrix()
        elif mode == 1:
            self.bigram_probabilities = self.laplace_smoothing_matrix()
        elif mode == 2:
            self.bigram_probabilities = self.kneser_ney_smoothing_matrix(discount=discount)
        # else:
        #     self.bigram_probabilities = self.calculate_probability_emotion_matrix(emotion_id=emotion_id)

    def calculate_probability_matrix(self):
        # Calculate bigram probabilities without smoothing or emotion
        return self.bigram_counts / self.word_count_matrix[:, np.newaxis]

    def laplace_smoothing_matrix(self):
        # Apply Laplace smoothing to bigram probabilities
        return (self.bigram_counts + 1) / (self.word_count_matrix[:, np.newaxis] + self.vocab_size)

    def kneser_ney_smoothing_matrix(self, discount=0):
        # Apply Kneser-Ney smoothing to bigram probabilities
        discounted_probs = np.maximum(self.bigram_counts - discount, 0) / self.word_count_matrix[:, np.newaxis]
        alpha_word1 = (discount * np.sum(self.bigram_counts > 0, axis=1)) / self.word_count_matrix
        cont_probs = np.sum(self.bigram_counts > 0, axis=0) / np.sum(self.bigram_counts > 0)
        return discounted_probs + alpha_word1[:, np.newaxis] * cont_probs

    def calculate_probability_emotion_row(self, first_wrod, emotion_id=0):
        # Calculate bigram probabilities with emotion scores
        non_zero_indices = np.nonzero(self.bigram_counts[self.vocabulary_index[first_wrod], :])[0]
        emo_probs = np.zeros((self.vocab_size, 6))
        for second_word_index in non_zero_indices:
            k = self.emotion_scores(first_wrod + " " +  self.index_vocabulary[second_word_index])
            prob_score = []
            for label_score in range(6):
                prob_score.append(k[label_score]['score'])
            emo_probs[second_word_index] = np.array(prob_score)
        first_word_mat = self.bigram_counts / self.word_count_matrix[:, np.newaxis]
        return first_word_mat[self.vocabulary_index[first_wrod],:] + emo_probs[:, emotion_id]
    
    # def calculate_emo_matrix(self, emotion_id = 0):
    #     emo_prob_matrix = np.zeros((self.vocab_size, self.vocab_size))
    #     for i in range(self.vocab_size):
    #         print(i)
    #         emo_prob_matrix[i] = self.calculate_probability_emotion_row(self.index_vocabulary[i], emotion_id)
    #     return emo_prob_matrix

    # def emotion_scores_matrix(self,emotion_id):
    #     # Calculate emotion scores for all bigrams
    #     all_bigrams = [f"{self.index_vocabulary[i]} {self.index_vocabulary[j]}" for i in range(self.vocab_size) for j in range(self.vocab_size)]
    #     num_bigrams = len(all_bigrams)
    #     # print(all_bigrams[0], len(all_bigrams))
    #     emo_probs = np.zeros((num_bigrams, 6))

    #     for idx, bigram in enumerate(all_bigrams):
    #         print(idx, "/", num_bigrams)
    #         k = self.emotion_scores(bigram)
    #         prob_score = []
    #         for label_score in range(6):
    #             prob_score.append(k[label_score]['score'])

    #         emo_probs[idx] =  np.array(prob_score)
    #         # print(emo_probs[idx])
    #         # print(prob_score)

    #     print(emo_probs.shape)        
    #     final =  emo_probs.reshape((self.vocab_size, self.vocab_size,6))
    #     np.save("emo_bigram.npy", final)
    #     return final

    def emotion_scores(self, sample): 
        emotion=classifier(sample)
        return emotion[0]

    def learn(self, file_path):
        # Build vocabulary and initialize bigram counts
        self.build_corpus(file_path)
        self.build_vocab()

        self.bigram_counts = np.zeros((self.vocab_size, self.vocab_size), dtype=int)
        self.word_count_matrix = np.array(list(self.word_count.values()))

        for line in self.dataset:
            for index in range(len(line) - 1):
                first_word_index = self.vocabulary_index[line[index]]
                second_word_index = self.vocabulary_index[line[index + 1]]
                self.bigram_counts[first_word_index, second_word_index] += 1
                
    def generate_samples(self, emotion_id = 0, num_samples = 50):
            generated_samples = []
            for _ in range(num_samples):
                sample = self.generate_sample(emotion_id)
                generated_samples.append(sample)
            return generated_samples

    def generate_sample(self, emotion_id = 0, max_length = 3):

        start_word = np.random.choice(['i', 'im', 'ive'], p=[0.7, 0.2, 0.1])
        current_word = start_word
        sample = [current_word]

        for _ in range(max_length - 1):
            current_word_index = self.vocabulary_index[current_word]

            probabilities = self.calculate_probability_emotion_row(current_word, emotion_id)

            # Check if all probabilities are zero
            if np.all(probabilities == 0):
                break

            # Normalize the probabilities only if they are not all zero
            probabilities /= probabilities.sum()

            # Sample the next word probabilistically
            next_word_index = np.random.choice(self.vocab_size, p=probabilities)
            next_word = self.index_vocabulary[next_word_index]

            # Append the next word to the sample
            sample.append(next_word)
            current_word = next_word

        return ' '.join(sample)                
# Example usage:
corpus_path = 'corpus.txt'
bigram_model = BigramLM_efficient()
bigram_model.learn(corpus_path)
# bigram_model.build_probability_matrix(mode=1, discount=0.5)


In [74]:
bigram_model.bigram_counts.shape

(5429, 5429)

In [75]:
total = 5429*5429


In [76]:
np.count_nonzero(bigram_model.bigram_counts==0)

29449605

In [102]:
emotions = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']

for emotion in range(len(emotions)):
    generated_samples = bigram_model.generate_samples(num_samples = 10, emotion_id=emotion)
    output_file = f'gen_3_{emotions[emotion]}.txt'

    with open(output_file, 'w', encoding='utf-8') as file:
        for sample in generated_samples:
            file.write(sample + '\n')

In [64]:
bigram_model.emotion_scores("i caught in shock at whats so helpless in europe")

[{'label': 'sadness', 'score': 0.2016565352678299},
 {'label': 'joy', 'score': 0.0017053603660315275},
 {'label': 'love', 'score': 0.0007887427927926183},
 {'label': 'anger', 'score': 0.002077906858175993},
 {'label': 'fear', 'score': 0.7899318933486938},
 {'label': 'surprise', 'score': 0.003839528188109398}]

In [15]:
bigram_model.bigram_probabilities[0,2]

7.603576808935205e-05