In [1]:
import numpy as np
import numpy as np
from transformers import pipeline
from tqdm import tqdm

classifier = pipeline("text-classification",model='bhadresh-savani/distilbert-base-uncased-emotion', return_all_scores=True,)

def emotion_scores(sample): 
    emotion=classifier(sample)
    return emotion[0]

Downloading config.json:   0%|          | 0.00/768 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



In [2]:
import random
import numpy as np

In [3]:
class BigramLM:
    def __init__(self, vocab):
        self.vocab = vocab
        self.vocab_size = len(vocab)
        self.word_idx = {}
        for i in range(self.vocab_size):
            self.word_idx[self.vocab[i]] = i
        self.bigram_counts = np.zeros((self.vocab_size, self.vocab_size))
        self.unigram_counts = np.zeros(self.vocab_size)
        self.unigram_prob = np.zeros(self.vocab_size)
        self.bigram_prob = np.zeros((self.vocab_size, self.vocab_size))
        self.emotion_matrix = np.zeros((6,self.vocab_size,self.vocab_size))
        self.unigram_emotions = np.zeros((6,self.vocab_size))
        self.emotions = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
        self.stop_words = []

    def calc_emotion(self):
        for i in tqdm(range(self.vocab_size)):
            scores = emotion_scores(self.vocab[i])
            for j in range(6):
                self.unigram_emotions[j][i] = scores[j]['score']

    def learn_from_dataset(self, dataset):
        for sentence in dataset:
            for word in sentence:
                word_idx = self.word_idx[word]
                self.unigram_counts[word_idx] += 1

            for i in range(1, len(sentence)):
                prev_word_idx = self.word_idx[sentence[i-1]]
                curr_word_idx = self.word_idx[sentence[i]]
                
                self.bigram_counts[prev_word_idx, curr_word_idx] += 1

        self.unigram_prob = self.unigram_counts/np.sum(self.unigram_counts)
        self.calc_emotion()

        #generating list of stop words
        for i in dataset:
            self.stop_words.append(i[-1])
        freq_word = {}
        for word in self.stop_words:
            if word not in freq_word:
                freq_word[word] = 1
            else:
                freq_word[word] += 1

        
        sorted_dict = sorted(freq_word.items(), key=lambda item: item[1], reverse=True)[:100]
        top_words = [item[0] for item in sorted_dict]
        self.stop_words = top_words

        for i in range(len(self.bigram_counts)):
            self.bigram_prob[i] = self.bigram_counts[i] / self.unigram_counts[i]
        
        for i in range(6):
            for j in tqdm(range(self.vocab_size)):
                for k in range(self.vocab_size):
                    self.emotion_matrix[i][j][k] = self.bigram_prob[j][k] + self.unigram_emotions[i][k]
                self.emotion_matrix[i][j] /= np.sum(self.emotion_matrix[i][j])

        return self.emotion_matrix
    
    def laplace_smoothing(self,debug=False):
        smoothed_probabilities = np.zeros((self.vocab_size, self.vocab_size))

        for i in range(len(self.bigram_counts)):
            smoothed_probabilities[i] = (self.bigram_counts[i] + 1) / (self.unigram_counts[i] + self.vocab_size)

        if(debug):
            print(smoothed_probabilities)
        return smoothed_probabilities

    def kneser_ney_smoothing(self, discount=0.75, debug=True):
    
        # Initialize the KN smoothed probability matrix
        kneser_ney_prob = np.zeros((self.vocab_size, self.vocab_size))

        # Calculate total number of bigrams
        total_bigrams = np.sum(self.bigram_counts > 0)

        # Calculate continuation probabilities
        continuation_prob = np.zeros(self.vocab_size)
        for word_idx in range(self.vocab_size):
            continuation_prob[word_idx] = len(np.where(self.bigram_counts[:, word_idx] > 0)[0]) / total_bigrams

        # Calculate lambda for each word
        lambda_ = np.zeros(self.vocab_size)

        for prev_word_idx in range(self.vocab_size):
            # The number of word types that can follow w_(i-1)
            continuation_types = len(np.where(self.bigram_counts[prev_word_idx, :] > 0)[0])
            bigram_count_sum = np.sum(self.bigram_counts[prev_word_idx, :])
            if bigram_count_sum > 0:
                lambda_[prev_word_idx] = (discount * continuation_types) / bigram_count_sum
            else:
                lambda_[prev_word_idx] = 0  # Avoid division by zero if there are no following words

        # Calculate adjusted probabilities for each bigram
        for i in range(self.vocab_size):
            for j in range(self.vocab_size):
                if self.unigram_counts[i] > 0:
                    # Apply the discount and divide by the unigram count
                    kneser_ney_prob[i, j] = max(self.bigram_counts[i, j] - discount, 0) / self.unigram_counts[i]
                    
                    # Add the lambda times the continuation probability for the next word
                    kneser_ney_prob[i, j] += lambda_[i] * continuation_prob[j]

        # Update the class property with the new probabilities
        self.bigram_prob = kneser_ney_prob

        return kneser_ney_prob  
    
    
    def generate_next_word(self, current_word, emotion,k = 4):
        if current_word not in self.vocab:
            raise ValueError("Word not in vocabulary")
        
        if emotion not in self.emotions:
            raise ValueError("Invalid emotion")
        
        current_word_idx = self.word_idx[current_word]
        emotion_idx = self.emotions.index(emotion)
        next_word_probs = self.emotion_matrix[emotion_idx][current_word_idx]

        sorted_indices = np.argsort(next_word_probs)
        topk = sorted_indices[-k:]
        next_word_idx = np.random.choice(topk)
        return self.vocab[next_word_idx]
    
    def generate_sentence(self, initial_word, emotion, length = 6, k=4):
        sentence = [initial_word]
        while(length):
            next = self.generate_next_word(sentence[-1],emotion=emotion,k=k)
            sentence.append(next)
            if next in self.stop_words and len(sentence)>2:
                break
            length-=1
        return ' '.join(sentence)

file_path = 'corpus.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    dataset = []
    for line in file:
        dataset.append(line.lower().split())

flat_dataset = [word for sentence in dataset for word in sentence]

vocab = []

for sentence in dataset:
    for word in sentence:
        if word not in vocab:
            vocab.append(word)

bigram_model = BigramLM(vocab)
bigram_model.learn_from_dataset(dataset)

100%|██████████| 5429/5429 [01:32<00:00, 58.73it/s]
100%|██████████| 5429/5429 [00:15<00:00, 352.20it/s]
100%|██████████| 5429/5429 [00:15<00:00, 350.95it/s]
100%|██████████| 5429/5429 [00:15<00:00, 353.32it/s]
100%|██████████| 5429/5429 [00:14<00:00, 365.17it/s]
100%|██████████| 5429/5429 [00:15<00:00, 356.31it/s]
100%|██████████| 5429/5429 [00:15<00:00, 349.61it/s]


array([[[9.01217817e-05, 4.77863333e-05, 1.10238852e-04, ...,
         1.06249687e-04, 7.52789725e-05, 9.02315019e-05],
        [8.95600024e-05, 4.66631467e-05, 3.23000841e-04, ...,
         1.06249388e-04, 7.52787612e-05, 9.02312486e-05],
        [4.34679927e-04, 4.66738823e-05, 1.10263905e-04, ...,
         1.06273833e-04, 7.52960803e-05, 9.02520077e-05],
        ...,
        [8.95600024e-05, 4.66631467e-05, 1.10238543e-04, ...,
         1.06249388e-04, 7.52787612e-05, 9.02312486e-05],
        [8.95600024e-05, 4.66631467e-05, 1.10238543e-04, ...,
         1.06249388e-04, 7.52787612e-05, 9.02312486e-05],
        [8.95600024e-05, 4.66631467e-05, 1.10238543e-04, ...,
         1.06249388e-04, 7.52787612e-05, 9.02312486e-05]],

       [[4.43062072e-05, 5.63281391e-05, 1.57215585e-04, ...,
         3.30189130e-05, 4.35571113e-05, 9.46376114e-06],
        [4.41311630e-05, 5.59780789e-05, 2.23525008e-04, ...,
         3.30188842e-05, 4.35570731e-05, 9.46375286e-06],
        [1.51671054e-04, 

In [4]:
k = bigram_model.kneser_ney_smoothing(debug=True)
print(np.sum(k[0]))
l = bigram_model.laplace_smoothing(debug=False)
print(np.sum(l[0]))

0.9987954728191153
0.9994575829898025


In [5]:
start_words = set()
for i in dataset:
    start_words.add(i[0])
start_words = list(start_words)
print(len(start_words))

16


In [6]:
bigram_model.generate_sentence('i','surprise',length=10, k=10)

'i amazing surprising amount surprised amazing'

In [8]:
import random
emotions = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
file = open('gen.txt', 'w')
label = open('gen_labels.txt', 'w')
for emotion in emotions:
    emotion_file = open(f"gen_{emotion}.txt", "w")
    for i in range(50):
        startword = random.choice(start_words)
        x = bigram_model.generate_sentence(startword, emotion, 10, 108)
        print(emotion_scores(x))
        file.write(x+'\n')
        emotion_file.write(x+'\n')
        label.write(emotion+'\n')
    emotion_file.close()
label.close()
file.close()

[{'label': 'sadness', 'score': 0.9988045692443848}, {'label': 'joy', 'score': 0.0003073704428970814}, {'label': 'love', 'score': 0.0001781464961823076}, {'label': 'anger', 'score': 0.00032467665732838213}, {'label': 'fear', 'score': 0.00022172930766828358}, {'label': 'surprise', 'score': 0.00016358279390260577}]
[{'label': 'sadness', 'score': 0.2093583345413208}, {'label': 'joy', 'score': 0.0024985293857753277}, {'label': 'love', 'score': 0.0030538619030267}, {'label': 'anger', 'score': 0.7817633152008057}, {'label': 'fear', 'score': 0.00277901953086257}, {'label': 'surprise', 'score': 0.0005469870520755649}]
[{'label': 'sadness', 'score': 0.9976517558097839}, {'label': 'joy', 'score': 0.0004407524538692087}, {'label': 'love', 'score': 0.0006268264260143042}, {'label': 'anger', 'score': 0.00039638395537622273}, {'label': 'fear', 'score': 0.000706476450432092}, {'label': 'surprise', 'score': 0.00017779548943508416}]
[{'label': 'sadness', 'score': 0.9986181259155273}, {'label': 'joy', 's