In [3]:
import numpy as np
import numpy as np
from transformers import pipeline
from tqdm import tqdm

classifier = pipeline("text-classification",model='bhadresh-savani/distilbert-base-uncased-emotion', return_all_scores=True,)

def emotion_scores(sample): 
    emotion=classifier(sample)
    return emotion[0]

2024-02-05 22:51:30.242747: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-05 22:51:30.299244: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-05 22:51:30.573934: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-05 22:51:30.574005: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-05 22:51:30.630059: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to

In [4]:
import random
import numpy as np

In [17]:
class BigramLM:
    def __init__(self, vocab):
        self.vocab = vocab
        self.vocab_size = len(vocab)
        self.word_idx = {}
        for i in range(self.vocab_size):
            self.word_idx[self.vocab[i]] = i
        self.bigram_counts = np.zeros((self.vocab_size, self.vocab_size))
        self.unigram_counts = np.zeros(self.vocab_size)
        self.unigram_prob = np.zeros(self.vocab_size)
        self.bigram_prob = np.zeros((self.vocab_size, self.vocab_size))
        self.emotion_matrix = np.zeros((6,self.vocab_size,self.vocab_size))
        self.unigram_emotions = np.zeros((6,self.vocab_size))
        self.emotions = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
        self.stop_words = []

    def top_bigrams(self, probability_matrix):
        # Flatten the matrix and sort by probability
        flat_probs = probability_matrix.flatten()
        top_indices = np.argsort(flat_probs)[::-1][:5]  # Get indices of top 5 probabilities
        top_probs = flat_probs[top_indices]
        
        # Convert flat indices back to bigram indices
        top_bigrams = [(self.vocab[i // self.vocab_size], self.vocab[i % self.vocab_size])
                       for i in top_indices]
        return top_bigrams, top_probs

    def calc_emotion(self):
        for i in tqdm(range(self.vocab_size)):
            scores = emotion_scores(self.vocab[i])
            for j in range(6):
                self.unigram_emotions[j][i] = scores[j]['score']

    def learn_from_dataset(self, dataset):
        for sentence in dataset:
            for word in sentence:
                word_idx = self.word_idx[word]
                self.unigram_counts[word_idx] += 1

            for i in range(1, len(sentence)):
                prev_word_idx = self.word_idx[sentence[i-1]]
                curr_word_idx = self.word_idx[sentence[i]]
                
                self.bigram_counts[prev_word_idx, curr_word_idx] += 1

        self.unigram_prob = self.unigram_counts/np.sum(self.unigram_counts)
        self.calc_emotion()

        #generating list of stop words
        for i in dataset:
            self.stop_words.append(i[-1])
        freq_word = {}
        for word in self.stop_words:
            if word not in freq_word:
                freq_word[word] = 1
            else:
                freq_word[word] += 1

        
        sorted_dict = sorted(freq_word.items(), key=lambda item: item[1], reverse=True)[:100]
        top_words = [item[0] for item in sorted_dict]
        self.stop_words = top_words

        for i in range(len(self.bigram_counts)):
            self.bigram_prob[i] = self.bigram_counts[i] / self.unigram_counts[i]
        
        for i in range(6):
            for j in tqdm(range(self.vocab_size)):
                for k in range(self.vocab_size):
                    self.emotion_matrix[i][j][k] = self.bigram_prob[j][k] + self.unigram_emotions[i][k]
                self.emotion_matrix[i][j] /= np.sum(self.emotion_matrix[i][j])

        return self.emotion_matrix
    
    def laplace_smoothing(self,debug=False):
        smoothed_probabilities = np.zeros((self.vocab_size, self.vocab_size))

        for i in range(len(self.bigram_counts)):
            smoothed_probabilities[i] = (self.bigram_counts[i] + 1) / (self.unigram_counts[i] + self.vocab_size)

        if(debug):
            print(smoothed_probabilities)
        return smoothed_probabilities

    def kneser_ney_smoothing(self, discount=0.75, debug=True):
    
        # Initialize the KN smoothed probability matrix
        kneser_ney_prob = np.zeros((self.vocab_size, self.vocab_size))

        # Calculate total number of bigrams
        total_bigrams = np.sum(self.bigram_counts > 0)

        # Calculate continuation probabilities
        continuation_prob = np.zeros(self.vocab_size)
        for word_idx in range(self.vocab_size):
            continuation_prob[word_idx] = len(np.where(self.bigram_counts[:, word_idx] > 0)[0]) / total_bigrams

        # Calculate lambda for each word
        lambda_ = np.zeros(self.vocab_size)

        for prev_word_idx in range(self.vocab_size):
            # The number of word types that can follow w_(i-1)
            continuation_types = len(np.where(self.bigram_counts[prev_word_idx, :] > 0)[0])
            bigram_count_sum = np.sum(self.bigram_counts[prev_word_idx, :])
            if bigram_count_sum > 0:
                lambda_[prev_word_idx] = (discount * continuation_types) / bigram_count_sum
            else:
                lambda_[prev_word_idx] = 0  # Avoid division by zero if there are no following words

        # Calculate adjusted probabilities for each bigram
        for i in range(self.vocab_size):
            for j in range(self.vocab_size):
                if self.unigram_counts[i] > 0:
                    # Apply the discount and divide by the unigram count
                    kneser_ney_prob[i, j] = max(self.bigram_counts[i, j] - discount, 0) / self.unigram_counts[i]
                    
                    # Add the lambda times the continuation probability for the next word
                    kneser_ney_prob[i, j] += lambda_[i] * continuation_prob[j]

        # Update the class property with the new probabilities
        self.bigram_prob = kneser_ney_prob

        return kneser_ney_prob  
    
    
    def generate_next_word(self, current_word, emotion,k = 4):
        if current_word not in self.vocab:
            raise ValueError("Word not in vocabulary")
        
        if emotion not in self.emotions:
            raise ValueError("Invalid emotion")
        
        current_word_idx = self.word_idx[current_word]
        emotion_idx = self.emotions.index(emotion)
        next_word_probs = self.emotion_matrix[emotion_idx][current_word_idx]

        sorted_indices = np.argsort(next_word_probs)
        topk = sorted_indices[-k:]
        next_word_idx = np.random.choice(topk)
        return self.vocab[next_word_idx]
    
    def generate_sentence(self, initial_word, emotion, length = 6, k=4):
        sentence = [initial_word]
        while(length):
            next = self.generate_next_word(sentence[-1],emotion=emotion,k=k)
            sentence.append(next)
            if next in self.stop_words and len(sentence)>2:
                break
            length-=1
        return ' '.join(sentence)

file_path = 'corpus.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    dataset = []
    for line in file:
        dataset.append(line.lower().split())

flat_dataset = [word for sentence in dataset for word in sentence]

vocab = []

for sentence in dataset:
    for word in sentence:
        if word not in vocab:
            vocab.append(word)

bigram_model = BigramLM(vocab)
bigram_model.learn_from_dataset(dataset)

100%|██████████| 5429/5429 [01:31<00:00, 59.62it/s]
100%|██████████| 5429/5429 [00:19<00:00, 274.40it/s]
100%|██████████| 5429/5429 [00:20<00:00, 262.63it/s]
100%|██████████| 5429/5429 [00:23<00:00, 234.64it/s]
100%|██████████| 5429/5429 [00:23<00:00, 230.70it/s]
100%|██████████| 5429/5429 [00:24<00:00, 219.36it/s]
100%|██████████| 5429/5429 [00:23<00:00, 233.59it/s]


array([[[9.01219461e-05, 4.77863917e-05, 1.10238787e-04, ...,
         1.06249795e-04, 7.52790104e-05, 9.02314998e-05],
        [8.95601668e-05, 4.66632050e-05, 3.23000770e-04, ...,
         1.06249497e-04, 7.52787991e-05, 9.02312465e-05],
        [4.34680083e-04, 4.66739407e-05, 1.10263839e-04, ...,
         1.06273941e-04, 7.52961182e-05, 9.02520056e-05],
        ...,
        [8.95601668e-05, 4.66632050e-05, 1.10238477e-04, ...,
         1.06249497e-04, 7.52787991e-05, 9.02312465e-05],
        [8.95601668e-05, 4.66632050e-05, 1.10238477e-04, ...,
         1.06249497e-04, 7.52787991e-05, 9.02312465e-05],
        [8.95601668e-05, 4.66632050e-05, 1.10238477e-04, ...,
         1.06249497e-04, 7.52787991e-05, 9.02312465e-05]],

       [[4.43063252e-05, 5.63282125e-05, 1.57215494e-04, ...,
         3.30189472e-05, 4.35571107e-05, 9.46374619e-06],
        [4.41312810e-05, 5.59781523e-05, 2.23524916e-04, ...,
         3.30189183e-05, 4.35570726e-05, 9.46373791e-06],
        [1.51671170e-04, 

In [6]:
k = bigram_model.kneser_ney_smoothing(debug=True)
print(np.sum(k[0]))
l = bigram_model.laplace_smoothing(debug=False)
print(np.sum(l[0]))

0.9987954728191153
0.9994575829898025


In [7]:
start_words = set()
for i in dataset:
    start_words.add(i[0])
start_words = list(start_words)
print(len(start_words))

16


In [8]:
bigram_model.generate_sentence('i','surprise',length=10, k=10)

'i enthralled shocked'

In [10]:
import random
emotions = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
file = open('gen.txt', 'w')
label = open('gen_labels.txt', 'w')
for emotion in emotions:
    emotion_file = open(f"gen_{emotion}.txt", "w")
    for i in range(50):
        startword = random.choice(start_words)
        x = bigram_model.generate_sentence(startword, emotion, 10, 108)
        # print(emotion_scores(x))
        file.write(x+'\n')
        emotion_file.write(x+'\n')
        label.write(emotion+'\n')
    emotion_file.close()
label.close()
file.close()

In [15]:
bigram_model.bigram_prob-bigram_model.laplace_smoothing()

array([[ 1.34794585e-03,  3.30032321e-05,  5.72095524e-06, ...,
        -1.04914516e-04, -1.04914516e-04, -1.04914516e-04],
       [ 7.55063277e-03, -3.65337058e-05,  2.52215798e-02, ...,
        -1.65441874e-04, -1.65441874e-04, -1.65441874e-04],
       [ 1.49595757e-01, -1.18156411e-05,  5.01584856e-04, ...,
        -1.61557453e-04, -1.61557453e-04, -1.61557453e-04],
       ...,
       [ 1.27066548e-02,  6.13773055e-05,  7.97995410e-04, ...,
        -1.53469642e-04, -1.53469642e-04, -1.53469642e-04],
       [ 1.27066548e-02,  6.13773055e-05,  7.97995410e-04, ...,
        -1.53469642e-04, -1.53469642e-04, -1.53469642e-04],
       [ 1.27066548e-02,  6.13773055e-05,  7.97995410e-04, ...,
        -1.53469642e-04, -1.53469642e-04, -1.53469642e-04]])

In [19]:


# Raw probabilities
raw_probabilities = bigram_model.bigram_counts / bigram_model.unigram_counts[:, None]
raw_top_bigrams, raw_top_probs = bigram_model.top_bigrams(raw_probabilities)

# Laplace probabilities
laplace_prob = bigram_model.laplace_smoothing()
laplace_top_bigrams, laplace_top_probs = bigram_model.top_bigrams(laplace_prob)

# Kneser-Ney probabilities
kneser_ney_prob = bigram_model.kneser_ney_smoothing(discount=0.75)
kneser_ney_top_bigrams, kneser_ney_top_probs = bigram_model.top_bigrams(kneser_ney_prob)

# Print top 5 bigrams before smoothing
print("Top 5 bigrams before smoothing:")
for bigram, prob in zip(raw_top_bigrams, raw_top_probs):
    print(f"{bigram}: {prob}")

# Print top 5 bigrams after Laplace smoothing
print("\nTop 5 bigrams after Laplace smoothing:")
for bigram, prob in zip(laplace_top_bigrams, laplace_top_probs):
    print(f"{bigram}: {prob}")

# Print top 5 bigrams after Kneser-Ney smoothing
print("\nTop 5 bigrams after Kneser-Ney smoothing:")
for bigram, prob in zip(kneser_ney_top_bigrams, kneser_ney_top_probs):
    print(f"{bigram}: {prob}")

Top 5 bigrams before smoothing:
('kg', 'for'): 1.0
('slopes', 'thats'): 1.0
('gods', 'plan'): 1.0
('dust', 'to'): 1.0
('uw', 'school'): 1.0

Top 5 bigrams after Laplace smoothing:
('i', 'feel'): 0.11043610327619874
('feel', 'like'): 0.0350976507217662
('i', 'am'): 0.03189412019960946
('that', 'i'): 0.02650602409638554
('and', 'i'): 0.023103748910200523

Top 5 bigrams after Kneser-Ney smoothing:
('don', 't'): 0.9703597914718767
('href', 'http'): 0.9700024553936815
('didn', 't'): 0.9583674360233536
('sort', 'of'): 0.956541337274802
('supposed', 'to'): 0.9184706989687346
