In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize 
from nltk import FreqDist
from nltk.corpus import stopwords
import demoji
import string

from nltk.stem.porter import *
from nltk.stem.snowball import SnowballStemmer

##### Regex patterns used for data cleaning

In [2]:
# URL: an http://, https:// or ftp:// scheme followed by a run of non-whitespace characters.
url = r'(https?|ftp)://([^\s]+)'
# Hashtag: '#' followed by one or more non-whitespace characters.
hashtag = r'(#[^\s]+)'
# Mention: '@' followed by zero or more non-whitespace characters (so a lone '@' matches too).
mention = r'(@[^\s]*)'

#### Create the DataFrame from the CSV file

In [3]:
# Load the raw tweets from disk.
tw = pd.read_csv('tweets_v8.csv')
# Snapshot taken before any cleaning, so the untouched data stays available as `df`.
df = tw.copy()
# Preview the first rows; this must be the cell's last expression for Jupyter to render it.
tw.head(20)

##### Normalize the data

In [4]:
# Lower-case the tweet text.
tw['text'] = tw['text'].str.lower()

#### Strip leading and trailing whitespace from the text

In [5]:
# Trim leading and trailing whitespace from each tweet.
tw['text'] = tw['text'].str.strip()

##### Removing hashtags, URLs and mentions from the text column

In [6]:
# Delete every match of the `url` pattern from the tweet text.
tw['text'] = tw['text'].replace(to_replace=url, value='', regex=True)

In [7]:
# Delete every match of the `hashtag` pattern from the tweet text.
tw['text'] = tw['text'].replace(to_replace=hashtag, value='', regex=True)

In [8]:
# Delete every match of the `mention` pattern from the tweet text.
tw['text'] = tw['text'].replace(to_replace=mention, value='', regex=True)

#### Removing Emojis

In [9]:
# Run demoji.replace over each tweet; only the text is passed, so its default replacement applies.
tw['text'] = tw['text'].apply(demoji.replace)

#### Removing stopwords from the text


In [10]:
# English stop words held in a set for fast membership checks.
stop_words = {word for word in stopwords.words('english')}

In [11]:
def remove_stopwords(text):
    """Return ``text`` with every token found in ``stop_words`` dropped.

    The text is lower-cased and split with ``word_tokenize``; the tokens that
    survive the filter are re-joined with single spaces.
    """
    tokens = word_tokenize(text.lower())
    kept = [token for token in tokens if token not in stop_words]
    return ' '.join(kept)

In [12]:
# Filter the stop words out of every tweet.
tw['text'] = tw['text'].apply(remove_stopwords)

#### Removing punctuation

In [13]:
# ASCII punctuation plus the curly single and double quotation marks.
punc = ''.join((string.punctuation, "‘’“”"))

In [14]:
def remove_punc(text, chars=None):
    """Return ``text`` with every punctuation character deleted.

    Parameters
    ----------
    text : str
        The string to clean.
    chars : str, optional
        Characters to delete. When omitted, the module-level ``punc``
        string is used, which keeps existing one-argument calls unchanged.

    Returns
    -------
    str
        ``text`` without any character contained in ``chars``.
    """
    if chars is None:
        chars = punc
    # One translate pass deletes all listed characters at once, instead of
    # rescanning the whole string with str.replace for each punctuation hit.
    return text.translate(str.maketrans('', '', chars))

In [15]:
# Delete punctuation characters from every tweet.
tw['text'] = tw['text'].apply(remove_punc)

#### Stripping the data

In [16]:
# Trim the whitespace left at the ends of each tweet by the removals above.
tw['text'] = tw['text'].str.strip()

#### Stemming the data

In [17]:
# Porter stemmer instance; the stemming cell below calls its .stem() on each token.
strem1 = PorterStemmer()

In [18]:
# Stem each whitespace-separated token and store the re-joined result in a 'PorterStem' column.
tw.loc[:, 'PorterStem'] = tw['text'].apply(
    lambda tweet: ' '.join(strem1.stem(token) for token in tweet.split())
)

In [19]:
# Show the cleaned text next to its stemmed version.
tw[['text', 'PorterStem']]

Unnamed: 0,text,PorterStem
0,life hits time poverty strikes gong yoo lets ...,life hit time poverti strike gong yoo let play...
1,marble episode ruined,marbl episod ruin
2,time,time
3,blood 1st slide m joining squidgame thing m a...,blood 1st slide m join squidgam thing m alread...
4,two first games players killed mask guys blo...,two first game player kill mask guy bloodi nig...
...,...,...
80014,yes yes yes,ye ye ye
80015,squid game reviewed revaaa review anything any...,squid game review revaaa review anyth anyon an...
80016,back amp forth squid game creator amp le...,back amp forth squid game creator amp lebron j...
80017,sort games think ll play inevitably make 2,sort game think ll play inevit make 2


In [23]:
# Persist the cleaned frame; to_csv also writes the row index unless index=False is passed.
tw.to_csv(path_or_buf='cleaned_tweets.csv')