## Import Libraries

In [79]:
import pandas as pd
import numpy as np
import re
import nltk
from langdetect import detect_langs
import langid

## Data Preprocessing

load the data

In [80]:
data = pd.read_csv("./Project/data.csv")

In [81]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1131 entries, 0 to 1130
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   postUrl             1131 non-null   object
 1   id                  1131 non-null   int64 
 2   text                1130 non-null   object
 3   ownerUsername       1131 non-null   object
 4   ownerProfilePicUrl  1131 non-null   object
 5   timestamp           1131 non-null   object
 6   likesCount          1131 non-null   int64 
dtypes: int64(2), object(5)
memory usage: 62.0+ KB


In [82]:
data.head()

Unnamed: 0,postUrl,id,text,ownerUsername,ownerProfilePicUrl,timestamp,likesCount
0,https://www.instagram.com/p/Cvxj4Rcga_i/,18000717011493106,Después los “profes” te dicen que no hagas la ...,nicopetacchi,https://instagram.fbts7-1.fna.fbcdn.net/v/t51....,2023-11-03T16:44:14.000Z,0
1,https://www.instagram.com/p/Cvxj4Rcga_i/,18242597854171531,@bruno_viotti e natural,allanzinho.05,https://instagram.fbts7-1.fna.fbcdn.net/v/t51....,2023-11-03T20:29:05.000Z,0
2,https://www.instagram.com/p/Cvxj4Rcga_i/,18028561504679119,😍😍😍😍🔥🔥🔥🔥🔥,amandamoon764,https://instagram.fbts7-1.fna.fbcdn.net/v/t51....,2023-11-03T21:39:47.000Z,1
3,https://www.instagram.com/p/Cvxj4Rcga_i/,17989361699457631,,uzakbay02,https://instagram.fbts7-1.fna.fbcdn.net/v/t51....,2023-11-04T04:46:50.000Z,0
4,https://www.instagram.com/p/Cvxj4Rcga_i/,17909218049778385,@yonatanbargay @adam_mayer_21 סמית זה עדיין חרא?,yuval_viener,https://instagram.fbts7-1.fna.fbcdn.net/v/t51....,2023-11-04T13:57:58.000Z,0


Remove duplicate texts

In [83]:
data = data.drop_duplicates(subset=["text"])

Remove texts that are empty or contain only whitespace

In [84]:
data = data[data["text"].str.strip() != ""]

Convert all comments to lowercase

In [85]:
data["text"] = data["text"].str.lower()

Remove punctuation and special characters from the comments

In [86]:
data["text"] = data["text"].str.replace("[^\w\s]", "")

Remove stop words from the texts

In [87]:
stop_words = set(nltk.corpus.stopwords.words("english"))

In [88]:
data["text"] = data["text"].apply(
    lambda x: " ".join([word for word in str(x).split() if word not in stop_words])
)

Remove texts that are less than a certain length

In [89]:
min_length = 3
data = data[data["text"].str.len() >= min_length]

Remove texts that contain certain keywords

In [90]:
keywords = ["spam", "advertisement", "offensive"]
data = data[~data["text"].str.contains("|".join(keywords))]

Remove texts that are written in a language other than English

In [91]:
def is_english(text):
    lang, _ = langid.classify(text)
    return lang == "en"

In [92]:
data["is_english"] = data["text"].apply(is_english)
data = data[data["is_english"]]

In [93]:
data = data.drop(columns=["is_english"])

Remove mentions from the texts

In [94]:
mentions_pattern = r"(@\w+)"
data["text"] = data["text"].str.replace(mentions_pattern, "")

In [96]:
data.head()

Unnamed: 0,postUrl,id,text,ownerUsername,ownerProfilePicUrl,timestamp,likesCount
3,https://www.instagram.com/p/Cvxj4Rcga_i/,17989361699457631,,uzakbay02,https://instagram.fbts7-1.fna.fbcdn.net/v/t51....,2023-11-04T04:46:50.000Z,0
8,https://www.instagram.com/p/Cvxj4Rcga_i/,17996899703086912,gym good music whatttttt,esmesiren,https://instagram.fbts7-1.fna.fbcdn.net/v/t51....,2023-11-04T20:29:03.000Z,0
13,https://www.instagram.com/p/Cvxj4Rcga_i/,18058055113471101,want put guy badass movie,fleury_muhire,https://instagram.fbts7-1.fna.fbcdn.net/v/t51....,2023-11-06T16:13:11.000Z,0
32,https://www.instagram.com/p/Cvxj4Rcga_i/,17874358640947700,cold winds blows back 🙌,emile_girardin,https://instagram.fbts7-1.fna.fbcdn.net/v/t51....,2023-11-08T04:27:32.000Z,0
33,https://www.instagram.com/p/Cvxj4Rcga_i/,18021367558894735,@kenniemalm,ryanaryan,https://instagram.fbts7-1.fna.fbcdn.net/v/t51....,2023-11-15T22:11:38.000Z,0
