## Import Libraries

In [174]:
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfTransformer,
    CountVectorizer,
)
from sklearn.preprocessing import Normalizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from collections import Counter
from bs4 import BeautifulSoup
import pandas as pd
import spacy
import nltk
import re
import itertools
from langdetect import detect
import langdetect

## Data Preprocessing

load the data

In [175]:
data = pd.read_csv("./../Q1/data.csv")

In [176]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1157 entries, 0 to 1156
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   postUrl             1157 non-null   object
 1   id                  1157 non-null   int64 
 2   text                1156 non-null   object
 3   ownerUsername       1157 non-null   object
 4   ownerProfilePicUrl  1157 non-null   object
 5   timestamp           1157 non-null   object
 6   likesCount          1157 non-null   int64 
dtypes: int64(2), object(5)
memory usage: 63.4+ KB


In [177]:
data.head()

Unnamed: 0,postUrl,id,text,ownerUsername,ownerProfilePicUrl,timestamp,likesCount
0,https://www.instagram.com/p/Cz67N84Pezn/,17981320055390052,,n4i1er,https://scontent-dub4-1.cdninstagram.com/v/t51...,2023-12-09T01:46:41.000Z,0
1,https://www.instagram.com/p/Cz67N84Pezn/,18225095332247021,😍😍🔥🔥🔥,farid.zand1997,https://scontent-dub4-1.cdninstagram.com/v/t51...,2023-12-09T09:30:36.000Z,0
2,https://www.instagram.com/p/Cz67N84Pezn/,18016249762974314,patm,andreprivet_,https://scontent-dub4-1.cdninstagram.com/v/t51...,2023-12-09T12:32:20.000Z,0
3,https://www.instagram.com/p/Cz67N84Pezn/,17980123316614267,@hoccein_hemati68 این کوصکش تو ایران بود نهایت...,kurd___boy666,https://scontent-lga3-2.cdninstagram.com/v/t51...,2023-12-09T13:14:43.000Z,0
4,https://www.instagram.com/p/Cz67N84Pezn/,18036756691566765,@amir_niarashid 💩🤣🖕,kurd___boy666,https://scontent-lga3-2.cdninstagram.com/v/t51...,2023-12-09T13:15:08.000Z,0


Remove duplicate comments

In [178]:
data = data.drop_duplicates(subset=["text"])

Remove comments with empty text

In [179]:
data = data[data['text'].notna()]

Remove text that is not related to English

In [180]:
nlp = spacy.load("en_core_web_sm")

In [181]:
data = data[
    data["text"].apply(lambda x: all([token.pos_ != "FOREIGN" for token in nlp(x)]))
    == True
]

Convert timestamp to a standard format

In [182]:
data["timestamp"] = pd.to_datetime(data["timestamp"])

Remove non-ASCII characters

In [183]:
data["text"] = data["text"].apply(lambda x: "".join([ch for ch in x if ord(ch) < 128]))

In [542]:
data.head()

Unnamed: 0,text
0,
1,😍😍🔥🔥🔥
2,patm
3,@hoccein_hemati68 این کوصکش تو ایران بود نهایت...
4,@amir_niarashid 💩🤣🖕


Convert text to lowercase

In [184]:
data["text"] = data["text"].str.lower()

Remove punctuation

In [185]:
data["text"] = data["text"].str.replace("[^\w\s]", "")

Remove stop words

In [186]:
stop_words = set(stopwords.words("english"))
data["text"] = data["text"].apply(
    lambda x: " ".join([word for word in x.split() if word not in stop_words])
)

In [210]:
data = data[data["text"].str.strip() != ""]

In [211]:
data.head()

Unnamed: 0,postUrl,id,text,ownerUsername,ownerProfilePicUrl,timestamp,likesCount
2,https://www.instagram.com/p/Cz67N84Pezn/,18016249762974314,patm,andreprivet_,https://scontent-dub4-1.cdninstagram.com/v/t51...,2023-12-09 12:32:20+00:00,0
3,https://www.instagram.com/p/Cz67N84Pezn/,17980123316614267,@hoccein_hemati68 !,kurd___boy666,https://scontent-lga3-2.cdninstagram.com/v/t51...,2023-12-09 13:14:43+00:00,0
4,https://www.instagram.com/p/Cz67N84Pezn/,18036756691566765,@amir_niarashid,kurd___boy666,https://scontent-lga3-2.cdninstagram.com/v/t51...,2023-12-09 13:15:08+00:00,0
5,https://www.instagram.com/p/Cz67N84Pezn/,18118091365334462,nice arms,st3ph_3s0n,https://scontent-dub4-1.cdninstagram.com/v/t51...,2023-12-09 20:37:29+00:00,0
7,https://www.instagram.com/p/Cz67N84Pezn/,18247697080171707,gostoso,pedr0_hgc,https://scontent-dub4-1.cdninstagram.com/v/t51...,2023-12-10 01:28:48+00:00,0


## perplexity

In [555]:
def calculate_perplexity(probabilities):
    if not isinstance(probabilities, list):
        raise TypeError("Input must be a list of probabilities.")

    if not probabilities:
        raise ValueError("The list of probabilities is empty.")

    if not all(0 <= prob <= 1 for prob in probabilities):
        raise ValueError("All probabilities must be in the range [0, 1].")

    N = len(probabilities)
    log_prob_sum = sum(math.log2(prob) for prob in probabilities if prob > 0)

    if log_prob_sum == 0:
        raise ValueError(
            "All probabilities are zero, resulting in infinite perplexity."
        )

    perplexity = 2 ** (-1 / N * log_prob_sum)

    return perplexity

## Unigram Language Model

### Introduction 

A unigram language model is a simple yet powerful statistical model used in natural language processing (NLP) to predict the probability of occurrence of a word in a sequence of words. It assumes that the probability of a word appearing in a sentence is independent of the context in which it appears. This assumption, while not entirely accurate, often provides a reasonable approximation of the true word distribution in a language.

### Implementation

Create a CountVectorizer

In [188]:
vectorizer = CountVectorizer()

Fit and transform the text data

In [189]:
X = vectorizer.fit_transform(data["text"])

Get the feature names (words)

In [190]:
feature_names = vectorizer.get_feature_names_out()

Get the counts for each word (unigram)

In [191]:
word_counts = X.sum(axis=0)

Create a DataFrame to display the word counts

In [192]:
unigram_data = pd.DataFrame({"Word": feature_names, "Count": word_counts.A1})

Sort the DataFrame by word counts

In [193]:
unigram_data = unigram_data.sort_values(by="Count", ascending=False)

Display the top words

In [194]:
unigram_data.head(10)

Unnamed: 0,Word,Count
401,cbum,72
1831,que,55
592,de,42
325,bro,32
1946,se,32
1314,like,29
2248,um,25
725,el,23
2008,sim,18
727,ele,18


### unigram Smoothing

Get the counts for each unigram

In [563]:
unigram_counts = X.sum(axis=0)

Add-one (Laplace) smoothing

In [564]:
total_features = len(feature_names)
smoothed_counts = unigram_counts + 1
total_tokens = smoothed_counts.sum()

Calculate smoothed probabilities

In [565]:
smoothed_probabilities = smoothed_counts / total_tokens

Create a DataFrame to display the smoothed unigram probabilities

In [566]:
smoothed_unigram_model = pd.DataFrame(
    {
        "Unigram": feature_names,
        "Count": smoothed_counts.A1,
        "Probability": smoothed_probabilities.A1,
    }
)

Sort the DataFrame by unigram counts

In [567]:
smoothed_unigram_model = smoothed_unigram_model.sort_values(by="Count", ascending=False)

Display the top unigrams with smoothed probabilities

In [568]:
smoothed_unigram_model.head(10)

Unnamed: 0,Unigram,Count,Probability
316,cbum,72,0.011091
1653,que,56,0.008626
491,de,43,0.006624
244,bro,35,0.005391
1765,se,32,0.004929
1164,like,31,0.004775
1191,look,29,0.004467
2036,um,26,0.004005
615,el,24,0.003697
1199,love,21,0.003235


## bigram Language Model

### Introduction

A bigram language model is a statistical language model that predicts the probability of a word appearing in a sequence of words based on the word that precedes it. Unlike unigram models, which assume that words occur independently of each other, bigram models take into account the sequential nature of language.

### Implementation

Create a CountVectorizer for bigrams

In [195]:
vectorizer = CountVectorizer(ngram_range=(2, 2))

Fit and transform the text data

In [196]:
X = vectorizer.fit_transform(data["text"])

Get the feature names (bigrams)

In [197]:
feature_names = vectorizer.get_feature_names_out()

Get the counts for each bigram

In [198]:
bigram_counts = X.sum(axis=0)

Create a DataFrame to display the bigram counts

In [199]:
bigram_model = pd.DataFrame({"Bigram": feature_names, "Count": bigram_counts.A1})

Sort the DataFrame by bigram counts

In [200]:
bigram_model = bigram_model.sort_values(by="Count", ascending=False)

Display the top bigrams

In [201]:
bigram_model.head(10)

Unnamed: 0,Bigram,Count
2533,se quiser,15
2414,quiser sim,14
2637,sim mano,11
1860,meu comeo,7
1731,looks like,7
2958,um dia,6
1721,look like,5
1925,mr olympia,4
2773,tem potencial,4
1352,hair transplant,4


### bigram Smoothing

Get the counts for each bigram

In [576]:
bigram_counts = X.sum(axis=0)

Add-one (Laplace) smoothing

In [577]:
total_bigrams = len(feature_names)
smoothed_counts = bigram_counts + 1
total_tokens = smoothed_counts.sum()

Calculate smoothed probabilities

In [578]:
smoothed_probabilities = smoothed_counts / total_tokens

 Create a DataFrame to display the smoothed bigram probabilities

In [579]:
smoothed_bigram_model = pd.DataFrame(
    {
        "Bigram": feature_names,
        "Count": smoothed_counts.A1,
        "Probability": smoothed_probabilities.A1,
    }
)

Sort the DataFrame by bigram counts

In [580]:
smoothed_bigram_model = smoothed_bigram_model.sort_values(by="Count", ascending=False)

Display the top bigrams with smoothed probabilities

In [581]:
smoothed_bigram_model.head(10)

Unnamed: 0,Bigram,Count,Probability
2420,se quiser,16,0.002474
2301,quiser sim,15,0.002319
1639,look like,14,0.002165
2516,sim mano,12,0.001856
1758,meu comeo,8,0.001237
2830,um dia,7,0.001082
1185,gon na,6,0.000928
2257,que lo,5,0.000773
2736,todo lo,5,0.000773
1819,mr olympia,5,0.000773


## trigram Language Model

### Introduction

A trigram language model is a statistical language model that predicts the probability of a word appearing in a sequence of words based on the two preceding words. It takes into account the sequential nature of language by considering the dependencies between three consecutive words.

### Implementation

Create a CountVectorizer for trigrams

In [202]:
vectorizer = CountVectorizer(ngram_range=(3, 3))

Fit and transform the text data

In [203]:
X = vectorizer.fit_transform(data["text"])

Get the feature names (trigrams)

In [204]:
feature_names = vectorizer.get_feature_names_out()

Get the counts for each trigram

In [205]:
trigram_counts = X.sum(axis=0)

Create a DataFrame to display the trigram counts

In [206]:
trigram_df = pd.DataFrame({"Trigram": feature_names, "Count": trigram_counts.A1})

Sort the DataFrame by trigram counts

In [207]:
trigram_df = trigram_df.sort_values(by="Count", ascending=False)

Display the top trigrams

In [208]:
trigram_df.head(10)

Unnamed: 0,Trigram,Count
2092,se quiser sim,13
1988,quiser sim mano,11
1321,lembra muito meu,3
1769,pensou em competir,3
1320,lembra meu comeo,3
1945,que los anabolicos,2
307,cabeza de bolo,2
2611,work people use,2
62,activates hormone destroys,2
2367,told would work,2


### trigram smoothing

Create a CountVectorizer for trigrams

In [589]:
vectorizer = CountVectorizer(ngram_range=(3, 3))

Fit and transform the text data

In [590]:
X = vectorizer.fit_transform(data["text"])

Get the feature names (trigrams)

In [591]:
feature_names = vectorizer.get_feature_names_out()

Get the counts for each trigram

In [592]:
trigram_counts = X.sum(axis=0)

Add-one (Laplace) smoothing

In [593]:
total_trigrams = len(feature_names)
smoothed_counts = trigram_counts + 1
total_tokens = smoothed_counts.sum()

Calculate smoothed probabilities

In [594]:
smoothed_probabilities = smoothed_counts / total_tokens

Create a DataFrame to display the smoothed trigram probabilities

In [595]:
smoothed_trigram_model = pd.DataFrame(
    {
        "Trigram": feature_names,
        "Count": smoothed_counts.A1,
        "Probability": smoothed_probabilities.A1,
    }
)

Sort the DataFrame by trigram counts

In [596]:
smoothed_trigram_model = smoothed_trigram_model.sort_values(by="Count", ascending=False)

Display the top trigrams with smoothed probabilities

In [597]:
smoothed_trigram_model.head(10)

Unnamed: 0,Trigram,Count,Probability
2012,se quiser sim,14,0.002666
1910,quiser sim mano,12,0.002285
1253,lembra muito meu,4,0.000762
1252,lembra meu comeo,4,0.000762
1694,pensou em competir,4,0.000762
1515,mujer la madr,3,0.000571
1081,hill canada costco,3,0.000571
1992,say smith machin,3,0.000571
2449,violenc costco richmond,3,0.000571
2052,ser esa mujer,3,0.000571
