## Import Libraries

In [517]:
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfTransformer,
    CountVectorizer,
)
from sklearn.preprocessing import Normalizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from collections import Counter
from bs4 import BeautifulSoup
import pandas as pd
import spacy
import re
import itertools
from langdetect import detect
import langdetect
from collections import Counter
import math
import random
import nltk
from nltk import FreqDist
from nltk.corpus import reuters
from nltk.util import bigrams
from nltk.probability import ConditionalFreqDist
from nltk import bigrams, FreqDist
from nltk.probability import LidstoneProbDist
from nltk.tokenize import word_tokenize
import numpy as np
import demoji


## Data Preprocessing

load the data

In [518]:
data = pd.read_csv("./../Q1/data.csv")

In [519]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1157 entries, 0 to 1156
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   postUrl             1157 non-null   object
 1   id                  1157 non-null   int64 
 2   text                1156 non-null   object
 3   ownerUsername       1157 non-null   object
 4   ownerProfilePicUrl  1157 non-null   object
 5   timestamp           1157 non-null   object
 6   likesCount          1157 non-null   int64 
dtypes: int64(2), object(5)
memory usage: 63.4+ KB


In [520]:
data.head()

Unnamed: 0,postUrl,id,text,ownerUsername,ownerProfilePicUrl,timestamp,likesCount
0,https://www.instagram.com/p/Cz67N84Pezn/,17981320055390052,,n4i1er,https://scontent-dub4-1.cdninstagram.com/v/t51...,2023-12-09T01:46:41.000Z,0
1,https://www.instagram.com/p/Cz67N84Pezn/,18225095332247021,😍😍🔥🔥🔥,farid.zand1997,https://scontent-dub4-1.cdninstagram.com/v/t51...,2023-12-09T09:30:36.000Z,0
2,https://www.instagram.com/p/Cz67N84Pezn/,18016249762974314,patm,andreprivet_,https://scontent-dub4-1.cdninstagram.com/v/t51...,2023-12-09T12:32:20.000Z,0
3,https://www.instagram.com/p/Cz67N84Pezn/,17980123316614267,@hoccein_hemati68 این کوصکش تو ایران بود نهایت...,kurd___boy666,https://scontent-lga3-2.cdninstagram.com/v/t51...,2023-12-09T13:14:43.000Z,0
4,https://www.instagram.com/p/Cz67N84Pezn/,18036756691566765,@amir_niarashid 💩🤣🖕,kurd___boy666,https://scontent-lga3-2.cdninstagram.com/v/t51...,2023-12-09T13:15:08.000Z,0


Remove duplicate comments

In [521]:
data = data.drop_duplicates(subset=["text"])

Remove comments with empty text

In [522]:
data = data[data['text'].notna()]

Remove text that is not related to English

In [523]:
nlp = spacy.load("en_core_web_sm")

In [524]:
data = data[
    data["text"].apply(lambda x: all([token.pos_ != "FOREIGN" for token in nlp(x)]))
    == True
]

Convert timestamp to a standard format

In [525]:
data["timestamp"] = pd.to_datetime(data["timestamp"])

Remove non-ASCII characters

In [526]:
data["text"] = data["text"].apply(lambda x: "".join([ch for ch in x if ord(ch) < 128]))

Convert text to lowercase

In [527]:
data["text"] = data["text"].str.lower()

Remove punctuation

In [528]:
data["text"] = data["text"].str.replace("[^\w\s]", "")

Remove stop words

In [529]:
stop_words = set(stopwords.words("english"))
data["text"] = data["text"].apply(
    lambda x: " ".join([word for word in x.split() if word not in stop_words])
)

In [530]:
data = data[data["text"].str.strip() != ""]

In [531]:
def tokenize_the_text(text):
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalnum()]
    tokens = [word.lower() for word in tokens]
    return tokens

In [532]:
preprocessed_data = data["text"].apply(tokenize_the_text).tolist()


In [533]:
total_tokens = [token for entry_tokens in preprocessed_data for token in entry_tokens]

In [534]:
vocab_size = len(set(total_tokens))

## perplexity

In [535]:
def calculate_perplexity(model, n, dataset, alpha=1):
    probabilities = [model(*dataset[i-n+1:i+1], dataset[:i], alpha) for i in range(n-1, len(dataset))]
    perplexity = np.exp(-np.mean(np.log(probabilities)))
    return perplexity

## Unigram Language Model

### Introduction 

A unigram language model is a simple yet powerful statistical model used in natural language processing (NLP) to predict the probability of occurrence of a word in a sequence of words. It assumes that the probability of a word appearing in a sentence is independent of the context in which it appears. This assumption, while not entirely accurate, often provides a reasonable approximation of the true word distribution in a language.

### Implementation

In [536]:
def unigramModel(token, dataset, alpha=1):
    token_count = Counter(dataset)
    return (token_count[token] + alpha) / (len(dataset) + alpha * vocab_size)

## bigram Language Model

### Introduction

A bigram language model is a statistical language model that predicts the probability of a word appearing in a sequence of words based on the word that precedes it. Unlike unigram models, which assume that words occur independently of each other, bigram models take into account the sequential nature of language.

### Implementation

In [537]:
def bigramModel(token1, token2, dataset, alpha=1):
    token_count = Counter(dataset)
    bigram_count = Counter(zip(dataset, dataset[1:]))
    return (bigram_count[(token1, token2)] + alpha) / (token_count[token1] + alpha * vocab_size)

## trigram Language Model

### Introduction

A trigram language model is a statistical language model that predicts the probability of a word appearing in a sequence of words based on the two preceding words. It takes into account the sequential nature of language by considering the dependencies between three consecutive words.

### Implementation

In [538]:
def trigramModel(token1, token2, token3, dataset, alpha=1):
    token_count = Counter(dataset)
    trigram_count = Counter(zip(dataset, dataset[1:], dataset[2:]))
    bigram_count = Counter(zip(dataset, dataset[1:]))
    return (trigram_count[(token1, token2, token3)] + alpha) / (bigram_count[(token1, token2)] + alpha * vocab_size)

calculate perplexity base on model

In [539]:
def calculate_perplexity(model, n, dataset, alpha=1):
    probabilities = [model(*dataset[i-n+1:i+1], dataset[:i], alpha) for i in range(n-1, len(dataset))]
    perplexity = np.exp(-np.mean(np.log(probabilities)))
    return perplexity

generate sentence base on the model

In [540]:
def generate_sentence(model, n, dataset, length=10):
    sentence = []
    for _ in range(length):
        if n == 1:
            next_token = random.choice(dataset)
        else:
            prev_tokens = sentence[-n+1:] if n > 1 else []
            candidates = [token for token in set(dataset) if model(*(prev_tokens + [token]), dataset) > 0]
            next_token = random.choice(candidates)
        sentence.append(next_token)
    return ' '.join(sentence)

calculate sentence perplexity base on the model

In [541]:
def calculate_sentence_perplexity(model, n, sentence, dataset, alpha=1):
    tokens = tokenize_the_text(sentence)
    probabilities = [model(*tokens[i-n+1:i+1], dataset, alpha) for i in range(n-1, len(tokens))]
    perplexity = np.exp(-np.mean(np.log(probabilities)))
    return perplexity

testing part

In [557]:
def generate_and_report(model, n, dataset, alpha=1, num_sentences=5):
    print(f"Generating and evaluating {num_sentences} sentences for {n}-gram model:")
    for i in range(num_sentences):
        generated_sentence = generate_sentence(model, n, dataset)
        perplexity = calculate_sentence_perplexity(model, n, generated_sentence, dataset, alpha)
        print(f"Generated Sentence {i+1}: {generated_sentence}")
        print(f"Perplexity: {perplexity}")
        print()


unigram model sentence generation

In [559]:
generate_and_report(unigramModel, 1, total_tokens)

Generating and evaluating 5 sentences for 1-gram model:
Generated Sentence 1: groceries mi coming gym lplucasdiasoficial lagta done llama taking https
Perplexity: 1791.447914483821

Generated Sentence 2: first quiser natural umas para awesome creatino like saying para
Perplexity: 1017.2669044640194

Generated Sentence 3: viratkohli win saying potencial attractive grvido peitudo indian implants forc
Perplexity: 2339.0550062222874

Generated Sentence 4: kkkk pra kenniemalm place water rara vegan dont lembra compared
Perplexity: 1335.419213049373

Generated Sentence 5: pasa looking cara tenha tem goes de see una ser
Perplexity: 830.5323327830434



bigram model sentence generation

In [560]:
generate_and_report(bigramModel, 2, total_tokens)

Generating and evaluating 5 sentences for 2-gram model:


TypeError: bigramModel() missing 1 required positional argument: 'dataset'

trigram model sentence generation

In [546]:
# generate_and_report(trigramModel, 3, total_tokens)

Generating and evaluating 5 sentences for 3-gram model:


TypeError: trigramModel() missing 2 required positional arguments: 'token3' and 'dataset'