In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize 
from nltk import FreqDist
from nltk.corpus import stopwords
import demoji
import string

##### regex patterns used for the data cleaning

In [2]:
url = r'(https?|ftp)://([^\s]+)'
hashtag = r'(#[^\s]+)'
mention = r'(@[^\s]*)'

In [3]:
# txt = 'https://t.co/BQgwd91ODK'
# hash = '#IlovePythoon17665'
# tag = '@m0hit.028'
# import re
# re.findall(url,txt)

#### Create the data frame from csv file

In [4]:
tw = pd.read_csv('tweets_v8.csv')
tw.head(20)
df = tw.copy()

##### Normalize the data

In [5]:
tw.text = tw.text.str.lower()

#### strip the text data

In [6]:
tw.text = tw.text.str.strip()

##### Removing the hashtags, Urls and mentions from the text column

In [7]:
# removing the urls from the text
tw.text = tw.text.str.replace(url,'',regex=True)

In [8]:
# removing hashtags from the text
tw.text = tw.text.replace(hashtag,'',regex=True)

In [9]:
# removing mentions from the text
tw.text = tw.text.replace(mention,'',regex=True)

##### Removing emojis

In [10]:
tw.text = tw.text.apply(lambda x: demoji.replace(x))

##### Removing stopwords

In [11]:
stop_words = set(stopwords.words('english'))

In [12]:
def remove_stopwords(text):
    return " ".join([i for i in word_tokenize(text.lower()) if i not in stop_words])

In [13]:
tw.text = tw.text.apply(remove_stopwords)

##### Removing punctuations

In [14]:
punc = string.punctuation + "‘’“”"
punc


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~‘’“”'

In [15]:
def remove_punc(text):
    
    for ch in text:
        if ch in punc:
            text = text.replace(ch,'')
    
    return text

In [16]:
tw.text = tw.text.apply(remove_punc)

In [17]:
def find_punc(text):
    
    for ch in text:
        if ch in punc:
            return True

In [18]:
tw.text.apply(find_punc).any()

False

#### strip the data again

In [19]:
tw.text = tw.text.str.strip()

##### Count the most freq words from the dataset

In [20]:
tokens = []

for i in tw.text:
    tokens += word_tokenize(i)
    
tokens

['life',
 'hits',
 'time',
 'poverty',
 'strikes',
 'gong',
 'yoo',
 'lets',
 'play',
 'game',
 'marble',
 'episode',
 'ruined',
 'time',
 'blood',
 '1st',
 'slide',
 'm',
 'joining',
 'squidgame',
 'thing',
 'm',
 'already',
 'dead',
 'sugar',
 'honeycomb',
 'ofc',
 'two',
 'first',
 'games',
 'players',
 'killed',
 'mask',
 'guys',
 'bloody',
 'night',
 'third',
 'game',
 'killed',
 'o…',
 'thg',
 'going',
 'explode',
 '4b',
 'marketcap',
 'soon',
 'world',
 'first',
 'moba',
 'game',
 'another',
 'level',
 'pls',
 'use',
 'gun',
 'please',
 'vote',
 'daily',
 'poll',
 'thanks',
 'think',
 'donny',
 'van',
 'de',
 'beek',
 'leave',
 'manchester',
 'united',
 'yes',
 '…',
 've',
 'seen',
 'bilingual',
 'korean',
 'speakers',
 'slam',
 'translation',
 'could',
 'feel',
 'quality',
 'writing',
 'd…',
 'discovered',
 'english',
 'audio',
 '5',
 'episodes',
 'squid',
 'game',
 'korean',
 'subtitles',
 'm',
 'used',
 'watching',
 'foreign',
 'fi…',
 'struggle',
 'real',
 '—',
 'squid',
 'g

In [21]:
common = FreqDist(tokens).most_common(20)
common

[('game', 14550),
 ('squid', 12459),
 ('s', 5148),
 ('episode', 4278),
 ('watching', 3990),
 ('like', 3960),
 ('project', 3906),
 ('nt', 3376),
 ('one', 3214),
 ('show', 3004),
 ('watch', 3004),
 ('good', 2718),
 ('light', 2531),
 ('finished', 2496),
 ('watched', 2466),
 ('get', 2435),
 ('netflix', 2322),
 ('…', 2223),
 ('games', 2045),
 ('would', 2016)]

In [22]:
tw.text.str.find('‘')

0       -1
1       -1
2       -1
3       -1
4       -1
        ..
80014   -1
80015   -1
80016   -1
80017   -1
80018   -1
Name: text, Length: 80019, dtype: int64

In [23]:
tw.text.to_string(index=False)



In [24]:
tw.text[tw.text.str.find('nt') != -1]

14                saga continues get data cheap message us
21                          could nt resist sketching loml
31                         nt mean push  squid game roblox
33       ladies gentlemen   s chibi art kang saebyeok s...
39       esp  tug war  episode brilliant writing screen...
                               ...                        
79969    finally watched bad  twist end brilliant  dedu...
79972    excellent project  good luck team  believe fai...
79996                            know girl  please comment
80012                        sort parent let s child watch
80016    back  amp  forth  squid game  creator  amp  le...
Name: text, Length: 15774, dtype: object

In [25]:
df.loc[80016,'text']

'Back &amp; Forth Between ‘Squid Game’ Creator &amp; Lebron James Have Gotten The Attention Of Netizens! \n\n#kwave #kdrama… https://t.co/oKinHECBDP'

In [26]:
tw.loc[80016,'text']

'back  amp  forth  squid game  creator  amp  lebron james gotten attention netizens'

In [27]:
v = df.loc[31, 'text']
word_tokenize(v)
v.split()

['I',
 "Didn't",
 'mean',
 'to',
 'push',
 'her',
 'off',
 '-',
 'Squid',
 'Game',
 'Roblox',
 'https://t.co/J6olOsZMj5',
 '@youtube',
 '@YouTubeGaming',
 '@roblox',
 '#roblox…',
 'https://t.co/oE9P1UU2iE']

In [28]:
' didn\'t' in stop_words

False

In [29]:
'couldn\'t' in stop_words

True

In [30]:
'didnt' in stop_words

False

In [31]:
s = " couldn't come hasn't haven't tomorrow aren't"
import re
re.sub(r"\s[\w]+n't[\s]*", '', s)

"comehaven't tomorrow"

In [32]:
'come' in stop_words

False

In [33]:
'not' in stop_words

True