## Import Libraries

In [1]:
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfTransformer,
    CountVectorizer,
)
from sklearn.preprocessing import Normalizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
from bs4 import BeautifulSoup
import pandas as pd
import nltk
import re
import itertools
from langdetect import detect

## Data Preprocessing

load the data

In [2]:
data = pd.read_csv("./../Q1/data.csv")

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1157 entries, 0 to 1156
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   postUrl             1157 non-null   object
 1   id                  1157 non-null   int64 
 2   text                1156 non-null   object
 3   ownerUsername       1157 non-null   object
 4   ownerProfilePicUrl  1157 non-null   object
 5   timestamp           1157 non-null   object
 6   likesCount          1157 non-null   int64 
dtypes: int64(2), object(5)
memory usage: 63.4+ KB


In [4]:
data.head()

Unnamed: 0,postUrl,id,text,ownerUsername,ownerProfilePicUrl,timestamp,likesCount
0,https://www.instagram.com/p/Cz67N84Pezn/,17981320055390052,,n4i1er,https://scontent-dub4-1.cdninstagram.com/v/t51...,2023-12-09T01:46:41.000Z,0
1,https://www.instagram.com/p/Cz67N84Pezn/,18225095332247021,😍😍🔥🔥🔥,farid.zand1997,https://scontent-dub4-1.cdninstagram.com/v/t51...,2023-12-09T09:30:36.000Z,0
2,https://www.instagram.com/p/Cz67N84Pezn/,18016249762974314,patm,andreprivet_,https://scontent-dub4-1.cdninstagram.com/v/t51...,2023-12-09T12:32:20.000Z,0
3,https://www.instagram.com/p/Cz67N84Pezn/,17980123316614267,@hoccein_hemati68 این کوصکش تو ایران بود نهایت...,kurd___boy666,https://scontent-lga3-2.cdninstagram.com/v/t51...,2023-12-09T13:14:43.000Z,0
4,https://www.instagram.com/p/Cz67N84Pezn/,18036756691566765,@amir_niarashid 💩🤣🖕,kurd___boy666,https://scontent-lga3-2.cdninstagram.com/v/t51...,2023-12-09T13:15:08.000Z,0


Drop duplicate rows if any

In [26]:
data.drop_duplicates(inplace=True , subset=["text"])

Unnamed: 0,text


Keep only the necessary columns for your NLP task

In [6]:
data = data[["text"]]

Convert text to lowercase

In [7]:
data["text"] = data["text"].str.lower()

Remove URLs

In [8]:
data["text"] = data["text"].apply(
    lambda x: re.sub(r"https?://\S+|www\.\S+", "", str(x))
)

Remove HTML tags

In [9]:
data["text"] = data["text"].apply(lambda x: BeautifulSoup(x, "html.parser").get_text())

  data["text"] = data["text"].apply(lambda x: BeautifulSoup(x, "html.parser").get_text())


Remove special characters, numbers, and punctuations

In [10]:
data["text"] = data["text"].apply(lambda x: re.sub(r"[^a-zA-Z\s]", "", x))

Tokenization

In [11]:
data["text"] = data["text"].apply(lambda x: word_tokenize(x))

Remove stop words

In [12]:
stop_words = set(nltk.corpus.stopwords.words("english"))

In [13]:
data["text"] = data["text"].apply(
    lambda x: " ".join([word for word in str(x).split() if word not in stop_words])
)

Stemming (you can also consider lemmatization)

In [14]:
ps = PorterStemmer()
data["text"] = data["text"].apply(lambda x: [ps.stem(word) for word in x])

Combine tokens back into text

In [15]:
data["text"] = data["text"].apply(lambda x: " ".join(x))

Remove mentions from the texts

In [16]:

mentions_pattern = (r"(@\S+)",)
data["text"] = data["text"].astype(str)
data = data[data["text"].str.islower()]
data["text"] = data["text"].str.replace(mentions_pattern, "")

Remove texts that are written in a language other than English

In [17]:
def is_english(text):
    if pd.isna(text):  # Check for NaN values
        return False
    try:
        lang = detect(text)
        return lang == "en"
    except:
        return False

In [18]:
data["is_english"] = data["text"].apply(is_english)
data = data[data["is_english"]]
data = data.drop(columns=["is_english"])

Save the preprocessed data to a new CSV file

In [19]:
# data.to_csv("preprocessed_data.csv", index=False)
data.head()

Unnamed: 0,text


## Unigram Language Model

### Introduction 

A unigram language model is a simple yet powerful statistical model used in natural language processing (NLP) to predict the probability of occurrence of a word in a sequence of words. It assumes that the probability of a word appearing in a sentence is independent of the context in which it appears. This assumption, while not entirely accurate, often provides a reasonable approximation of the true word distribution in a language.

### Implementation

Create a CountVectorizer

In [20]:
vectorizer = CountVectorizer()

Fit and transform the text data

In [21]:
X = vectorizer.fit_transform(data["text"])

ValueError: empty vocabulary; perhaps the documents only contain stop words

Get the feature names (words)

In [None]:
feature_names = vectorizer.get_feature_names_out()

Get the counts for each word (unigram)

In [None]:
word_counts = X.sum(axis=0)

Create a DataFrame to display the word counts

In [None]:
unigram_data = pd.DataFrame({"Word": feature_names, "Count": word_counts.A1})

Sort the DataFrame by word counts

In [None]:
unigram_data = unigram_data.sort_values(by="Count", ascending=False)

Display the top words

In [None]:
unigram_data.head(10)

## bigram Language Model

### Introduction

A bigram language model is a statistical language model that predicts the probability of a word appearing in a sequence of words based on the word that precedes it. Unlike unigram models, which assume that words occur independently of each other, bigram models take into account the sequential nature of language.

### Implementation

Create a CountVectorizer for bigrams

In [None]:
vectorizer = CountVectorizer(ngram_range=(2, 2))

Fit and transform the text data

In [None]:
X = vectorizer.fit_transform(data["text"])

Get the feature names (bigrams)

In [None]:
feature_names = vectorizer.get_feature_names_out()

Get the counts for each bigram

In [None]:
bigram_counts = X.sum(axis=0)

Create a DataFrame to display the bigram counts

In [None]:
bigram_model = pd.DataFrame({"Bigram": feature_names, "Count": bigram_counts.A1})

Sort the DataFrame by bigram counts

In [None]:
bigram_model = bigram_model.sort_values(by="Count", ascending=False)

Display the top bigrams

In [None]:
bigram_model.head(10)

## three gram Language Model

### Introduction

A trigram language model is a statistical language model that predicts the probability of a word appearing in a sequence of words based on the two preceding words. It takes into account the sequential nature of language by considering the dependencies between three consecutive words.

### Implementation

Create a CountVectorizer for trigrams

In [None]:
vectorizer = CountVectorizer(ngram_range=(3, 3))

Fit and transform the text data

In [None]:
X = vectorizer.fit_transform(data["text"])

Get the feature names (trigrams)

In [None]:
feature_names = vectorizer.get_feature_names_out()

Get the counts for each trigram

In [None]:
trigram_counts = X.sum(axis=0)

Create a DataFrame to display the trigram counts

In [None]:
trigram_df = pd.DataFrame({"Trigram": feature_names, "Count": trigram_counts.A1})

Sort the DataFrame by trigram counts

In [None]:
trigram_df = trigram_df.sort_values(by="Count", ascending=False)

Display the top trigrams

In [None]:
trigram_df.head(10)