## Import Libraries

In [59]:

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import pandas as pd
import spacy
import math
import random
from nltk import FreqDist
from nltk.util import bigrams
from nltk import bigrams, FreqDist
import numpy as np


## Data Preprocessing

load the data

In [60]:
data = pd.read_csv("./../Q1/data.csv")

In [61]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1157 entries, 0 to 1156
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   postUrl             1157 non-null   object
 1   id                  1157 non-null   int64 
 2   text                1156 non-null   object
 3   ownerUsername       1157 non-null   object
 4   ownerProfilePicUrl  1157 non-null   object
 5   timestamp           1157 non-null   object
 6   likesCount          1157 non-null   int64 
dtypes: int64(2), object(5)
memory usage: 63.4+ KB


In [62]:
data.head()

Unnamed: 0,postUrl,id,text,ownerUsername,ownerProfilePicUrl,timestamp,likesCount
0,https://www.instagram.com/p/Cz67N84Pezn/,17981320055390052,,n4i1er,https://scontent-dub4-1.cdninstagram.com/v/t51...,2023-12-09T01:46:41.000Z,0
1,https://www.instagram.com/p/Cz67N84Pezn/,18225095332247021,😍😍🔥🔥🔥,farid.zand1997,https://scontent-dub4-1.cdninstagram.com/v/t51...,2023-12-09T09:30:36.000Z,0
2,https://www.instagram.com/p/Cz67N84Pezn/,18016249762974314,patm,andreprivet_,https://scontent-dub4-1.cdninstagram.com/v/t51...,2023-12-09T12:32:20.000Z,0
3,https://www.instagram.com/p/Cz67N84Pezn/,17980123316614267,@hoccein_hemati68 این کوصکش تو ایران بود نهایت...,kurd___boy666,https://scontent-lga3-2.cdninstagram.com/v/t51...,2023-12-09T13:14:43.000Z,0
4,https://www.instagram.com/p/Cz67N84Pezn/,18036756691566765,@amir_niarashid 💩🤣🖕,kurd___boy666,https://scontent-lga3-2.cdninstagram.com/v/t51...,2023-12-09T13:15:08.000Z,0


Remove duplicate comments

In [63]:
data = data.drop_duplicates(subset=["text"])

Remove comments with empty text

In [64]:
data = data[data['text'].notna()]

Remove text that is not related to English

In [65]:
nlp = spacy.load("en_core_web_sm")

In [66]:
data = data[
    data["text"].apply(lambda x: all([token.pos_ != "FOREIGN" for token in nlp(x)]))
    == True
]

Convert timestamp to a standard format

In [67]:
data["timestamp"] = pd.to_datetime(data["timestamp"])

Remove non-ASCII characters

In [68]:
data["text"] = data["text"].apply(lambda x: "".join([ch for ch in x if ord(ch) < 128]))

Convert text to lowercase

In [69]:
data["text"] = data["text"].str.lower()

Remove punctuation

In [70]:
data["text"] = data["text"].str.replace("[^\w\s]", "")

Remove stop words

In [71]:
stop_words = set(stopwords.words("english"))
data["text"] = data["text"].apply(
    lambda x: " ".join([word for word in x.split() if word not in stop_words])
)

In [72]:
data = data[data["text"].str.strip() != ""]

Remove rows where the length of text is not greater than 1

In [73]:
data = data[data['text'].apply(lambda x: len(str(x)) > 1)]

In [74]:
data

Unnamed: 0,postUrl,id,text,ownerUsername,ownerProfilePicUrl,timestamp,likesCount
2,https://www.instagram.com/p/Cz67N84Pezn/,18016249762974314,patm,andreprivet_,https://scontent-dub4-1.cdninstagram.com/v/t51...,2023-12-09 12:32:20+00:00,0
3,https://www.instagram.com/p/Cz67N84Pezn/,17980123316614267,@hoccein_hemati68 !,kurd___boy666,https://scontent-lga3-2.cdninstagram.com/v/t51...,2023-12-09 13:14:43+00:00,0
4,https://www.instagram.com/p/Cz67N84Pezn/,18036756691566765,@amir_niarashid,kurd___boy666,https://scontent-lga3-2.cdninstagram.com/v/t51...,2023-12-09 13:15:08+00:00,0
5,https://www.instagram.com/p/Cz67N84Pezn/,18118091365334462,nice arms,st3ph_3s0n,https://scontent-dub4-1.cdninstagram.com/v/t51...,2023-12-09 20:37:29+00:00,0
7,https://www.instagram.com/p/Cz67N84Pezn/,18247697080171707,gostoso,pedr0_hgc,https://scontent-dub4-1.cdninstagram.com/v/t51...,2023-12-10 01:28:48+00:00,0
...,...,...,...,...,...,...,...
1151,https://www.instagram.com/p/C17pLDogJUV/,18009715463508895,@cbum transformation lost 25 kg year @klemensh...,klemensheidorn,https://instagram.ferz2-1.fna.fbcdn.net/v/t51....,2024-01-17 12:35:51+00:00,1
1152,https://www.instagram.com/p/C17pLDogJUV/,17996526734407530,get favor,aftabkhan_editz,https://instagram.ferz2-1.fna.fbcdn.net/v/t51....,2024-01-17 12:37:10+00:00,0
1153,https://www.instagram.com/p/C17pLDogJUV/,18032966065693847,eu consigo coar costas,jpzff_11,https://instagram.ferz2-1.fna.fbcdn.net/v/t51....,2024-01-17 13:05:14+00:00,0
1154,https://www.instagram.com/p/C17pLDogJUV/,17911811417866846,@zeuswithcan sen daha iyisin,emr3c3lik,https://instagram.ferz2-1.fna.fbcdn.net/v/t51....,2024-01-17 13:26:09+00:00,0


In [75]:
def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalnum()]
    tokens = [word.lower() for word in tokens]
    return tokens

In [76]:
corpus = [preprocess_text(entry) for entry in data['text'] if pd.notna(entry) and isinstance(entry, str)]


In [77]:
corpus = [item for sublist in corpus for item in sublist]

## Unigram Language Model

### Introduction 

A unigram language model is a simple yet powerful statistical model used in natural language processing (NLP) to predict the probability of occurrence of a word in a sequence of words. It assumes that the probability of a word appearing in a sentence is independent of the context in which it appears. This assumption, while not entirely accurate, often provides a reasonable approximation of the true word distribution in a language.

### Implementation

In [78]:
import random
from collections import Counter

class UnigramModel:
    def __init__(self, smoothing=True):
        self.word_counts = Counter()
        self.total_words = 0
        self.vocab_size = 0
        self.smoothing = smoothing

    def train(self, corpus):
        # Count the frequency of each word in the corpus
        self.word_counts = Counter(corpus)
        self.total_words = len(corpus)
        self.vocab_size = len(set(corpus))

    def generate_sentence(self, length=10):
        # Generate a random sentence based on the unigram model
        sentence = [random.choice(list(self.word_counts.keys())) for _ in range(length)]
        return ' '.join(sentence)

    def probability(self, word):
        # Calculate the probability of a word based on the unigram model with smoothing
        if self.total_words == 0:
            return 0

        if self.smoothing:
            return (self.word_counts[word] + 1) / (self.total_words + self.vocab_size)
        else:
            return self.word_counts[word] / self.total_words

    def perplexity(self, sentence):
        # Calculate perplexity of a sentence
        words = sentence.split()
        word_probabilities = [self.probability(word) for word in words]
        perplexity = math.exp(-sum(math.log(prob, 2) for prob in word_probabilities) / len(words))
        return perplexity

In [79]:
model = UnigramModel(smoothing=True)
model.train(corpus)

 Generate 5 sentences and calculate perplexity
	

In [80]:
for _ in range(5):
    generated_sentence = model.generate_sentence()
    perplexity = model.perplexity(generated_sentence)
    print(f"Generated Sentence: {generated_sentence}.\nPerplexity: {perplexity:.2f}")

Generated Sentence: bros viratkohli miles gnio brazo hacen support rara hairline mama.
Perplexity: 98348.58
Generated Sentence: stiff he involucran segurar dear mais prprio filter mi espalda.
Perplexity: 64185.04
Generated Sentence: tarining pictures attractive von lateral multiverse sensao ni hwa menos.
Perplexity: 91361.27
Generated Sentence: rodrigoamgoes ama segundo kkkk q personality hide feioooooo marido lagi.
Perplexity: 69896.42
Generated Sentence: whos it chega paga legenda sentiu tigre roids sos classic.
Perplexity: 74800.28


## bigram Language Model

### Introduction

A bigram language model is a statistical language model that predicts the probability of a word appearing in a sequence of words based on the word that precedes it. Unlike unigram models, which assume that words occur independently of each other, bigram models take into account the sequential nature of language.

### Implementation

In [81]:
class BigramModel:
    def __init__(self, smoothing=True):
        self.bigram_counts = Counter()
        self.unigram_counts = Counter()
        self.smoothing = smoothing

	 # Tokenize and count unigrams and bigrams in the corpus
    def train(self, corpus):  
        tokenized_corpus = [word_tokenize(' '.join(sentence)) for sentence in corpus]
        self.unigram_counts = Counter([word for sentence in tokenized_corpus for word in sentence])
        self.bigram_counts = Counter(bigrams([word for sentence in tokenized_corpus for word in sentence]))

	# Generate a random sentence based on the bigram model
    def generate_sentence(self, length=10, start_word=None):
        sentence = []
        current_word = start_word if start_word else random.choice(list(self.unigram_counts.keys()))
        for _ in range(length):
            sentence.append(current_word)
            next_word_candidates = [word for word in self.unigram_counts.keys() if (current_word, word) in self.bigram_counts]
            if next_word_candidates:
                current_word = random.choice(next_word_candidates)
            else:
                break
        return ' '.join(sentence)

	# Calculate the probability of a word based on the bigram model with smoothing
    def probability(self, word, previous_word=None):
        if self.unigram_counts[previous_word] == 0:
            return 0

        if self.smoothing:
            bigram_count = self.bigram_counts[(previous_word, word)]
            unigram_count = self.unigram_counts[previous_word]
            return (bigram_count + 1) / (unigram_count + len(self.unigram_counts))
        else:
            if (previous_word, word) in self.bigram_counts:
                return self.bigram_counts[(previous_word, word)] / self.unigram_counts[previous_word]
            else:
                return 0

	# Calculate perplexity of a sentence
    def perplexity(self, sentence):
        
        words = word_tokenize(sentence)
        word_probabilities = [self.probability(words[i], words[i - 1]) for i in range(1, len(words))]
        word_probabilities = [prob if prob > 0 else 1e-10 for prob in word_probabilities]  # Avoid zero probabilities
        perplexity = math.exp(-sum(math.log(prob, 2) for prob in word_probabilities) / (len(words) - 1)) if sum(word_probabilities) > 0 else float('inf')
        return perplexity

## trigram Language Model

### Introduction

A trigram language model is a statistical language model that predicts the probability of a word appearing in a sequence of words based on the two preceding words. It takes into account the sequential nature of language by considering the dependencies between three consecutive words.

### Implementation

In [82]:
def trigramModel(token1, token2, token3, dataset, alpha=1):
    token_count = Counter(dataset)
    trigram_count = Counter(zip(dataset, dataset[1:], dataset[2:]))
    bigram_count = Counter(zip(dataset, dataset[1:]))
    return (trigram_count[(token1, token2, token3)] + alpha) / (bigram_count[(token1, token2)] + alpha * vocab_size)

calculate perplexity base on model

In [83]:
def calculate_perplexity(model, n, dataset, alpha=1):
    probabilities = [model(*dataset[i-n+1:i+1], dataset[:i], alpha) for i in range(n-1, len(dataset))]
    perplexity = np.exp(-np.mean(np.log(probabilities)))
    return perplexity

generate sentence base on the model

In [84]:
def generate_sentence(model, n, dataset, length=10):
    sentence = []
    for _ in range(length):
        if n == 1:
            next_token = random.choice(dataset)
        else:
            prev_tokens = sentence[-n+1:] if n > 1 else []
            candidates = [token for token in set(dataset) if model(*(prev_tokens + [token]), dataset) > 0]
            next_token = random.choice(candidates)
        sentence.append(next_token)
    return ' '.join(sentence)

calculate sentence perplexity base on the model

In [85]:
def calculate_sentence_perplexity(model, n, sentence, dataset, alpha=1):
    tokens = tokenize_the_text(sentence)
    probabilities = [model(*tokens[i-n+1:i+1], dataset, alpha) for i in range(n-1, len(tokens))]
    perplexity = np.exp(-np.mean(np.log(probabilities)))
    return perplexity

testing part

In [86]:
def generate_and_report(model, n, dataset, alpha=1, num_sentences=5):
    print(f"Generating and evaluating {num_sentences} sentences for {n}-gram model:")
    for i in range(num_sentences):
        generated_sentence = generate_sentence(model, n, dataset)
        perplexity = calculate_sentence_perplexity(model, n, generated_sentence, dataset, alpha)
        print(f"Generated Sentence {i+1}: {generated_sentence}")
        print(f"Perplexity: {perplexity}")
        print()


unigram model sentence generation

##  pay attention : i got error in developing bigram and trigram model , so i have remove it's model's code