In [79]:
import pandas as pd
from nltk.tokenize import word_tokenize 
from nltk import FreqDist
from nltk.corpus import stopwords
import demoji
import string

##### regex patterns used for the data cleaning

In [39]:
url = r'(https?|ftp)://([^\s]+)'
hashtag = r'(#[^\s]+)'
mention = r'(@[^\s]*)'

In [40]:
# txt = 'https://t.co/BQgwd91ODK'
# hash = '#IlovePythoon17665'
# tag = '@m0hit.028'
# import re
# re.findall(url,txt)

#### Create the data frame from csv file

In [41]:
tw = pd.read_csv('tweets_v8.csv')
tw.head(20)

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,source,is_retweet
0,the _√ªnd√´r-rat√®d nigg√°hüëäüèæ,,@ManUtd die hard‚ù§Ô∏è‚ù§Ô∏èüí™üèøüí™üèø\n\n\nYOLO\n\n\nJ'ai b...,2019-09-06 19:24:57+00:00,581,1035,8922,False,2021-10-06 12:05:38+00:00,When life hits and the same time poverty strik...,Twitter for Android,False
1,Best uncle on planet earth,,,2013-05-08 19:35:26+00:00,741,730,8432,False,2021-10-06 12:05:22+00:00,That marble episode of #SquidGame ruined me. üò≠üò≠üò≠,Twitter for Android,False
2,marcie,,animal crossing. chicken nuggets. baby yoda. s...,2009-02-21 10:31:30+00:00,562,1197,62732,False,2021-10-06 12:05:22+00:00,#Squidgame time,Twitter Web App,False
3,YoMo.Mdp,Any pronouns,Where the heck is the karma\nI'm going on my s...,2021-02-14 13:21:22+00:00,3,277,1341,False,2021-10-06 12:05:04+00:00,//Blood on 1st slide\nI'm joining the squidgam...,Twitter Web App,False
4,Laura Reactions,France,I talk and I make reactions videos about shows...,2018-12-19 20:38:28+00:00,330,152,2278,False,2021-10-06 12:05:00+00:00,"The two first games, players were killed by th...",Twitter Web App,False
5,Peyman üÖöüÖêüÖò,United Kingdom,Official @KardiaChain $KAI Ambassador\nMarketi...,2018-01-27 12:07:31+00:00,546,318,6265,False,2021-10-06 12:04:54+00:00,$THG\nGoing to explode to 4B Marketcap very so...,Twitter for Android,False
6,Aeriaaaa‚ô°,,Fujoshi üôà/ Thai BL-obsessed/Always distracted ...,2021-06-01 14:08:10+00:00,14,110,518,False,2021-10-06 12:04:45+00:00,@B_hundred_Hyun pls use that gun on me. üò© \n\n...,Twitter for Android,False
7,BarBiE F√°bregas üáøüá¶,South Africa,Legal Administratorüë©üèæ‚Äçüíª|Soccer Fanatic‚öΩÔ∏è #Dail...,2011-03-28 18:56:28+00:00,1877,2057,33186,False,2021-10-06 12:04:26+00:00,Please vote in my daily poll. \nThanks. üòä\n\nD...,Twitter for iPhone,False
8,Joel D. Parker,,Ph.D. in history of 20th century Levant. Somet...,2010-07-01 07:26:44+00:00,621,1346,5345,False,2021-10-06 12:04:22+00:00,I've seen bi-lingual Korean speakers slam the ...,Twitter Web App,False
9,Kevin Franco,"Calgary, Canada",When it comes to describing myself in a one li...,2009-05-11 20:14:51+00:00,1027,278,20902,False,2021-10-06 12:04:10+00:00,I discovered English audio after 5 episodes of...,Twitter Web App,False


##### Normalize the data

In [55]:
tw.text = tw.text.str.lower()

#### strip the text data

In [42]:
tw.text = tw.text.str.strip()

##### Removing the hashtags, Urls and mentions from the text column

In [43]:
# removing the urls from the text
tw.text = tw.text.str.replace(url,'',regex=True)

In [44]:
# removing hashtags from the text
tw.text = tw.text.replace(hashtag,'',regex=True)

In [45]:
# removing mentions from the text
tw.text = tw.text.replace(mention,'',regex=True)

##### Removing emojis

In [49]:
tw.text = tw.text.apply(lambda x: demoji.replace(x))

##### Removing stopwords

In [54]:
stop_words = set(stopwords.words('english'))

In [56]:
def remove_stopwords(text):
    return " ".join([i for i in word_tokenize(text.lower()) if i not in stop_words])

In [57]:
tw.text = tw.text.apply(remove_stopwords)

##### Removing punctuations

In [70]:
punc = string.punctuation + "‚Äò‚Äô‚Äú‚Äù"
punc


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~‚Äò‚Äô‚Äú‚Äù'

In [71]:
def remove_punc(text):
    
    for ch in text:
        if ch in punc:
            text = text.replace(ch,'')
    
    return text

In [72]:
tw.text = tw.text.apply(remove_punc)

In [76]:
def find_punc(text):
    
    for ch in text:
        if ch in punc:
            return True

In [77]:
tw.text.apply(find_punc).any()

False

#### strip the data again

In [78]:
tw.text = tw.text.str.strip()

##### Count the most freq words from the dataset

In [82]:
tokens = []

for i in tw.text:
    tokens += word_tokenize(i)
    
tokens

['life',
 'hits',
 'time',
 'poverty',
 'strikes',
 'gong',
 'yoo',
 'lets',
 'play',
 'game',
 'marble',
 'episode',
 'ruined',
 'time',
 'blood',
 '1st',
 'slide',
 'm',
 'joining',
 'squidgame',
 'thing',
 'm',
 'already',
 'dead',
 'sugar',
 'honeycomb',
 'ofc',
 'two',
 'first',
 'games',
 'players',
 'killed',
 'mask',
 'guys',
 'bloody',
 'night',
 'third',
 'game',
 'killed',
 'o‚Ä¶',
 'thg',
 'going',
 'explode',
 '4b',
 'marketcap',
 'soon',
 'world',
 'first',
 'moba',
 'game',
 'another',
 'level',
 'pls',
 'use',
 'gun',
 'please',
 'vote',
 'daily',
 'poll',
 'thanks',
 'think',
 'donny',
 'van',
 'de',
 'beek',
 'leave',
 'manchester',
 'united',
 'yes',
 '‚Ä¶',
 've',
 'seen',
 'bilingual',
 'korean',
 'speakers',
 'slam',
 'translation',
 'could',
 'feel',
 'quality',
 'writing',
 'd‚Ä¶',
 'discovered',
 'english',
 'audio',
 '5',
 'episodes',
 'squid',
 'game',
 'korean',
 'subtitles',
 'm',
 'used',
 'watching',
 'foreign',
 'fi‚Ä¶',
 'struggle',
 'real',
 '‚Äî',
 's

In [84]:
common = FreqDist(tokens).most_common(20)
common

[('game', 14550),
 ('squid', 12459),
 ('s', 5148),
 ('episode', 4278),
 ('watching', 3990),
 ('like', 3960),
 ('project', 3906),
 ('nt', 3376),
 ('one', 3214),
 ('show', 3004),
 ('watch', 3004),
 ('good', 2718),
 ('light', 2531),
 ('finished', 2496),
 ('watched', 2466),
 ('get', 2435),
 ('netflix', 2322),
 ('‚Ä¶', 2223),
 ('games', 2045),
 ('would', 2016)]