## Import Libraries

In [701]:
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfTransformer,
    CountVectorizer,
)
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import pandas as pd
import spacy
import re
import math
import random
from nltk import FreqDist
from nltk.util import bigrams
from nltk import bigrams, FreqDist
import numpy as np


## Data Preprocessing

load the data

In [702]:
data = pd.read_csv("./../Q1/data.csv")

In [703]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1157 entries, 0 to 1156
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   postUrl             1157 non-null   object
 1   id                  1157 non-null   int64 
 2   text                1156 non-null   object
 3   ownerUsername       1157 non-null   object
 4   ownerProfilePicUrl  1157 non-null   object
 5   timestamp           1157 non-null   object
 6   likesCount          1157 non-null   int64 
dtypes: int64(2), object(5)
memory usage: 63.4+ KB


In [704]:
data.head()

Unnamed: 0,postUrl,id,text,ownerUsername,ownerProfilePicUrl,timestamp,likesCount
0,https://www.instagram.com/p/Cz67N84Pezn/,17981320055390052,,n4i1er,https://scontent-dub4-1.cdninstagram.com/v/t51...,2023-12-09T01:46:41.000Z,0
1,https://www.instagram.com/p/Cz67N84Pezn/,18225095332247021,😍😍🔥🔥🔥,farid.zand1997,https://scontent-dub4-1.cdninstagram.com/v/t51...,2023-12-09T09:30:36.000Z,0
2,https://www.instagram.com/p/Cz67N84Pezn/,18016249762974314,patm,andreprivet_,https://scontent-dub4-1.cdninstagram.com/v/t51...,2023-12-09T12:32:20.000Z,0
3,https://www.instagram.com/p/Cz67N84Pezn/,17980123316614267,@hoccein_hemati68 این کوصکش تو ایران بود نهایت...,kurd___boy666,https://scontent-lga3-2.cdninstagram.com/v/t51...,2023-12-09T13:14:43.000Z,0
4,https://www.instagram.com/p/Cz67N84Pezn/,18036756691566765,@amir_niarashid 💩🤣🖕,kurd___boy666,https://scontent-lga3-2.cdninstagram.com/v/t51...,2023-12-09T13:15:08.000Z,0


Remove duplicate comments

In [705]:
data = data.drop_duplicates(subset=["text"])

Remove comments with empty text

In [706]:
data = data[data['text'].notna()]

Remove text that is not related to English

In [707]:
nlp = spacy.load("en_core_web_sm")

In [708]:
data = data[
    data["text"].apply(lambda x: all([token.pos_ != "FOREIGN" for token in nlp(x)]))
    == True
]

Convert timestamp to a standard format

In [709]:
data["timestamp"] = pd.to_datetime(data["timestamp"])

Remove non-ASCII characters

In [710]:
data["text"] = data["text"].apply(lambda x: "".join([ch for ch in x if ord(ch) < 128]))

Convert text to lowercase

In [711]:
data["text"] = data["text"].str.lower()

Remove punctuation

In [712]:
data["text"] = data["text"].str.replace("[^\w\s]", "")

Remove stop words

In [713]:
stop_words = set(stopwords.words("english"))
data["text"] = data["text"].apply(
    lambda x: " ".join([word for word in x.split() if word not in stop_words])
)

In [714]:
data = data[data["text"].str.strip() != ""]

Remove rows where the length of text is not greater than 1

In [715]:
data = data[data['text'].apply(lambda x: len(str(x)) > 1)]

In [716]:
def tokenize_the_text(text):
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalnum()]
    tokens = [word.lower() for word in tokens]
    return tokens

In [717]:
corpus = [word for sublist in preprocessed_data for word in sublist]


## Unigram Language Model

### Introduction 

A unigram language model is a simple yet powerful statistical model used in natural language processing (NLP) to predict the probability of occurrence of a word in a sequence of words. It assumes that the probability of a word appearing in a sentence is independent of the context in which it appears. This assumption, while not entirely accurate, often provides a reasonable approximation of the true word distribution in a language.

### Implementation

In [718]:
import random
from collections import Counter
import math

class UnigramModel:
    def __init__(self, smoothing=True):
        self.word_counts = Counter()
        self.total_words = 0
        self.vocab_size = 0
        self.smoothing = smoothing

    def train(self, corpus):
        # Count the frequency of each word in the corpus
        self.word_counts = Counter(corpus)
        self.total_words = len(corpus)
        self.vocab_size = len(set(corpus))

    def generate_sentence(self, length=10):
        # Generate a random sentence based on the unigram model
        sentence = [random.choice(list(self.word_counts.keys())) for _ in range(length)]
        return ' '.join(sentence)

    def probability(self, word):
        # Calculate the probability of a word based on the unigram model with smoothing
        if self.total_words == 0:
            return 0

        if self.smoothing:
            return (self.word_counts[word] + 1) / (self.total_words + self.vocab_size)
        else:
            return self.word_counts[word] / self.total_words

    def perplexity(self, sentence):
        # Calculate perplexity of a sentence
        words = sentence.split()
        word_probabilities = [self.probability(word) for word in words]
        perplexity = math.exp(-sum(math.log(prob, 2) for prob in word_probabilities) / len(words))
        return perplexity

In [719]:
model = UnigramModel(smoothing=True)
model.train(corpus)

 Generate 5 sentences and calculate perplexity
	

In [720]:
for _ in range(5):
    generated_sentence = model.generate_sentence()
    perplexity = model.perplexity(generated_sentence)
    print(f"Generated Sentence: {generated_sentence}.\nPerplexity: {perplexity:.2f}")

Generated Sentence: bring full te gene deixem pasa gratulations zz na vahid.
Perplexity: 55689.98
Generated Sentence: marinhosjulio sin leostronda glen title international convidas mesi happiest thanks.
Perplexity: 106200.94
Generated Sentence: fucking parece veteran fogo thanks multiverse monstra levanta rico vida.
Perplexity: 74899.53
Generated Sentence: permitir dirty fdss frontal imo think qualquer engordaria dica mm.
Perplexity: 100166.79
Generated Sentence: respect children sound vou fome bonito hip 2am easy zinda.
Perplexity: 87763.34


## bigram Language Model

### Introduction

A bigram language model is a statistical language model that predicts the probability of a word appearing in a sequence of words based on the word that precedes it. Unlike unigram models, which assume that words occur independently of each other, bigram models take into account the sequential nature of language.

### Implementation

In [721]:
class BigramModel:
    def __init__(self, smoothing=True):
        self.bigram_counts = Counter()
        self.unigram_counts = Counter()
        self.smoothing = smoothing

    def train(self, corpus):
        # Tokenize and count unigrams and bigrams in the corpus
        tokenized_corpus = [word_tokenize(' '.join(sentence)) for sentence in corpus]
        self.unigram_counts = Counter([word for sentence in tokenized_corpus for word in sentence])
        self.bigram_counts = Counter(bigrams([word for sentence in tokenized_corpus for word in sentence]))

    def generate_sentence(self, length=10, start_word=None):
        # Generate a random sentence based on the bigram model
        sentence = []
        current_word = start_word if start_word else random.choice(list(self.unigram_counts.keys()))

        for _ in range(length):
            sentence.append(current_word)
            next_word_candidates = [word for word in self.unigram_counts.keys() if (current_word, word) in self.bigram_counts]
            if next_word_candidates:
                current_word = random.choice(next_word_candidates)
            else:
                break

        return ' '.join(sentence)

    def probability(self, word, previous_word=None):
        # Calculate the probability of a word based on the bigram model with smoothing
        if self.unigram_counts[previous_word] == 0:
            return 0

        if self.smoothing:
            bigram_count = self.bigram_counts[(previous_word, word)]
            unigram_count = self.unigram_counts[previous_word]
            return (bigram_count + 1) / (unigram_count + len(self.unigram_counts))
        else:
            if (previous_word, word) in self.bigram_counts:
                return self.bigram_counts[(previous_word, word)] / self.unigram_counts[previous_word]
            else:
                return 0

    def perplexity(self, sentence):
        # Calculate perplexity of a sentence
        words = word_tokenize(sentence)
        word_probabilities = [self.probability(words[i], words[i - 1]) for i in range(1, len(words))]
        word_probabilities = [prob if prob > 0 else 1e-10 for prob in word_probabilities]  # Avoid zero probabilities
        perplexity = math.exp(-sum(math.log(prob, 2) for prob in word_probabilities) / (len(words) - 1)) if sum(word_probabilities) > 0 else float('inf')
        return perplexity

In [722]:
# Bigram model
bigram_model = BigramModel(smoothing=True)
bigram_model.train(data["text"])

# Generate and print sentences along with perplexity for both models
for _ in range(5):
    generated_bigram_sentence = bigram_model.generate_sentence(length=8, start_word="I")
    perplexity_bigram = bigram_model.perplexity(generated_bigram_sentence)
    print("Bigram Generated Sentence:", generated_bigram_sentence)
    print("Bigram Perplexity:", perplexity_bigram)
    print("---")

Bigram Generated Sentence: I
Bigram Perplexity: inf
---
Bigram Generated Sentence: I
Bigram Perplexity: inf
---
Bigram Generated Sentence: I
Bigram Perplexity: inf
---
Bigram Generated Sentence: I
Bigram Perplexity: inf
---
Bigram Generated Sentence: I
Bigram Perplexity: inf
---


## trigram Language Model

### Introduction

A trigram language model is a statistical language model that predicts the probability of a word appearing in a sequence of words based on the two preceding words. It takes into account the sequential nature of language by considering the dependencies between three consecutive words.

### Implementation

In [723]:
def trigramModel(token1, token2, token3, dataset, alpha=1):
    token_count = Counter(dataset)
    trigram_count = Counter(zip(dataset, dataset[1:], dataset[2:]))
    bigram_count = Counter(zip(dataset, dataset[1:]))
    return (trigram_count[(token1, token2, token3)] + alpha) / (bigram_count[(token1, token2)] + alpha * vocab_size)

calculate perplexity base on model

In [724]:
def calculate_perplexity(model, n, dataset, alpha=1):
    probabilities = [model(*dataset[i-n+1:i+1], dataset[:i], alpha) for i in range(n-1, len(dataset))]
    perplexity = np.exp(-np.mean(np.log(probabilities)))
    return perplexity

generate sentence base on the model

In [725]:
def generate_sentence(model, n, dataset, length=10):
    sentence = []
    for _ in range(length):
        if n == 1:
            next_token = random.choice(dataset)
        else:
            prev_tokens = sentence[-n+1:] if n > 1 else []
            candidates = [token for token in set(dataset) if model(*(prev_tokens + [token]), dataset) > 0]
            next_token = random.choice(candidates)
        sentence.append(next_token)
    return ' '.join(sentence)

calculate sentence perplexity base on the model

In [726]:
def calculate_sentence_perplexity(model, n, sentence, dataset, alpha=1):
    tokens = tokenize_the_text(sentence)
    probabilities = [model(*tokens[i-n+1:i+1], dataset, alpha) for i in range(n-1, len(tokens))]
    perplexity = np.exp(-np.mean(np.log(probabilities)))
    return perplexity

testing part

In [727]:
def generate_and_report(model, n, dataset, alpha=1, num_sentences=5):
    print(f"Generating and evaluating {num_sentences} sentences for {n}-gram model:")
    for i in range(num_sentences):
        generated_sentence = generate_sentence(model, n, dataset)
        perplexity = calculate_sentence_perplexity(model, n, generated_sentence, dataset, alpha)
        print(f"Generated Sentence {i+1}: {generated_sentence}")
        print(f"Perplexity: {perplexity}")
        print()


unigram model sentence generation

In [728]:
generate_and_report(unigramModel, 1, total_tokens)

Generating and evaluating 5 sentences for 1-gram model:
Generated Sentence 1: quase cbum homem terrified digo faa fazer cbum eat cbum
Perplexity: 823.8398049227533

Generated Sentence 2: vou um aqui ya stage cbum pradocamanda ta meto kenniemalm
Perplexity: 1160.4304994252616

Generated Sentence 3: fair long shape engaged um competir esqueciiiiiii elvis lives bumesteeeeeeeed
Perplexity: 1557.8236700403875

Generated Sentence 4: time se 5 nos expressar el enough perfeita welt todos
Perplexity: 1194.433395457165

Generated Sentence 5: tanta youre quiser went please esse pois timo mas fans
Perplexity: 1299.4920094390195



bigram model sentence generation

In [729]:
generate_and_report(bigramModel, 2, total_tokens)

Generating and evaluating 5 sentences for 2-gram model:


TypeError: bigramModel() missing 1 required positional argument: 'dataset'

trigram model sentence generation

In [None]:
# generate_and_report(trigramModel, 3, total_tokens)