## Import Libraries

In [432]:
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfTransformer,
    CountVectorizer,
)
from sklearn.preprocessing import Normalizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
from bs4 import BeautifulSoup
import pandas as pd
import nltk
import re

## Data Preprocessing

load the data

In [433]:
data = pd.read_csv("./../Q1/data.csv")

In [434]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1157 entries, 0 to 1156
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   postUrl             1157 non-null   object
 1   id                  1157 non-null   int64 
 2   text                1156 non-null   object
 3   ownerUsername       1157 non-null   object
 4   ownerProfilePicUrl  1157 non-null   object
 5   timestamp           1157 non-null   object
 6   likesCount          1157 non-null   int64 
dtypes: int64(2), object(5)
memory usage: 63.4+ KB


In [435]:
data.head()

Unnamed: 0,postUrl,id,text,ownerUsername,ownerProfilePicUrl,timestamp,likesCount
0,https://www.instagram.com/p/Cz67N84Pezn/,17981320055390052,,n4i1er,https://scontent-dub4-1.cdninstagram.com/v/t51...,2023-12-09T01:46:41.000Z,0
1,https://www.instagram.com/p/Cz67N84Pezn/,18225095332247021,😍😍🔥🔥🔥,farid.zand1997,https://scontent-dub4-1.cdninstagram.com/v/t51...,2023-12-09T09:30:36.000Z,0
2,https://www.instagram.com/p/Cz67N84Pezn/,18016249762974314,patm,andreprivet_,https://scontent-dub4-1.cdninstagram.com/v/t51...,2023-12-09T12:32:20.000Z,0
3,https://www.instagram.com/p/Cz67N84Pezn/,17980123316614267,@hoccein_hemati68 این کوصکش تو ایران بود نهایت...,kurd___boy666,https://scontent-lga3-2.cdninstagram.com/v/t51...,2023-12-09T13:14:43.000Z,0
4,https://www.instagram.com/p/Cz67N84Pezn/,18036756691566765,@amir_niarashid 💩🤣🖕,kurd___boy666,https://scontent-lga3-2.cdninstagram.com/v/t51...,2023-12-09T13:15:08.000Z,0


Drop duplicate rows if any

In [436]:
data.drop_duplicates(inplace=True)

Keep only the necessary columns for your NLP task

In [437]:
data = data[["text"]]

Convert text to lowercase

In [438]:
data["text"] = data["text"].str.lower()

Remove URLs

In [439]:
data["text"] = data["text"].apply(
    lambda x: re.sub(r"https?://\S+|www\.\S+", "", str(x))
)

Remove HTML tags

In [440]:
data["text"] = data["text"].apply(lambda x: BeautifulSoup(x, "html.parser").get_text())

  data["text"] = data["text"].apply(lambda x: BeautifulSoup(x, "html.parser").get_text())


Remove special characters, numbers, and punctuations

In [441]:
data["text"] = data["text"].apply(lambda x: re.sub(r"[^a-zA-Z\s]", "", x))

Tokenization

In [442]:
data["text"] = data["text"].apply(lambda x: word_tokenize(x))

Remove stop words

In [443]:
stop_words = set(stopwords.words("english"))
data["text"] = data["text"].apply(
    lambda x: [word for word in x if word not in stop_words]
)

Stemming (you can also consider lemmatization)

In [444]:
ps = PorterStemmer()
data["text"] = data["text"].apply(lambda x: [ps.stem(word) for word in x])

Combine tokens back into text

In [445]:
data["text"] = data["text"].apply(lambda x: " ".join(x))

Save the preprocessed data to a new CSV file

In [446]:
data.to_csv("preprocessed_data.csv", index=False)

## Unigram Language Model

### Introduction 

A unigram language model is a simple yet powerful statistical model used in natural language processing (NLP) to predict the probability of occurrence of a word in a sequence of words. It assumes that the probability of a word appearing in a sentence is independent of the context in which it appears. This assumption, while not entirely accurate, often provides a reasonable approximation of the true word distribution in a language.

### Implementation

Create a CountVectorizer

In [447]:
vectorizer = CountVectorizer()

Fit and transform the text data

In [448]:
X = vectorizer.fit_transform(data["text"])

Get the feature names (words)

In [449]:
feature_names = vectorizer.get_feature_names_out()

Get the counts for each word (unigram)

In [450]:
word_counts = X.sum(axis=0)

Create a DataFrame to display the word counts

In [451]:
unigram_data = pd.DataFrame({"Word": feature_names, "Count": word_counts.A1})

Sort the DataFrame by word counts

In [452]:
unigram_data = unigram_data.sort_values(by="Count", ascending=False)

Display the top words

In [453]:
unigram_data.head(10)

Unnamed: 0,Word,Count
316,cbum,71
1654,que,55
491,de,42
244,bro,34
1766,se,31
1164,like,30
1191,look,28
2037,um,25
615,el,23
1199,love,20


## bigram Language Model

### Introduction

A bigram language model is a statistical language model that predicts the probability of a word appearing in a sequence of words based on the word that precedes it. Unlike unigram models, which assume that words occur independently of each other, bigram models take into account the sequential nature of language.

### Implementation

Create a CountVectorizer for bigrams

In [454]:
vectorizer = CountVectorizer(ngram_range=(2, 2))

Fit and transform the text data

In [455]:
X = vectorizer.fit_transform(data["text"])

Get the feature names (bigrams)

In [456]:
feature_names = vectorizer.get_feature_names_out()

Get the counts for each bigram

In [457]:
bigram_counts = X.sum(axis=0)

Create a DataFrame to display the bigram counts

In [458]:
bigram_model = pd.DataFrame({"Bigram": feature_names, "Count": bigram_counts.A1})

Sort the DataFrame by bigram counts

In [459]:
bigram_model = bigram_model.sort_values(by="Count", ascending=False)

Display the top bigrams

In [460]:
bigram_model.head(10)

Unnamed: 0,Bigram,Count
2420,se quiser,15
2301,quiser sim,14
1639,look like,13
2516,sim mano,11
1758,meu comeo,7
2830,um dia,6
1185,gon na,5
2257,que lo,4
2736,todo lo,4
1819,mr olympia,4


## three gram Language Model

### Introduction

A trigram language model is a statistical language model that predicts the probability of a word appearing in a sequence of words based on the two preceding words. It takes into account the sequential nature of language by considering the dependencies between three consecutive words.

### Implementation

Create a CountVectorizer for trigrams

In [461]:
vectorizer = CountVectorizer(ngram_range=(3, 3))

Fit and transform the text data

In [462]:
X = vectorizer.fit_transform(data["text"])

Get the feature names (trigrams)

In [463]:
feature_names = vectorizer.get_feature_names_out()

Get the counts for each trigram

In [464]:
trigram_counts = X.sum(axis=0)

Create a DataFrame to display the trigram counts

In [465]:
trigram_df = pd.DataFrame({"Trigram": feature_names, "Count": trigram_counts.A1})

Sort the DataFrame by trigram counts

In [466]:
trigram_df = trigram_df.sort_values(by="Count", ascending=False)

Display the top trigrams

In [467]:
trigram_df.head(10)

Unnamed: 0,Trigram,Count
2012,se quiser sim,13
1910,quiser sim mano,11
1253,lembra muito meu,3
1252,lembra meu comeo,3
1694,pensou em competir,3
1515,mujer la madr,2
1081,hill canada costco,2
1992,say smith machin,2
2449,violenc costco richmond,2
2052,ser esa mujer,2
