In [None]:
!pip install spacy

In [6]:
#Importing necessary libraries
import numpy as np
import pandas as pd
import re
import nltk
import spacy
import string
# Load the demonetization tweets CSV from the working directory into a DataFrame.
# The file is decoded with the 'unicode_escape' codec.
# NOTE(review): presumably chosen to get past bytes that fail UTF-8 decoding - confirm.
df = pd.read_csv("demonetization-tweets.csv", encoding= 'unicode_escape')
# Preview the first rows to check which columns were read.
df.head()

Unnamed: 0,X,text,favorited,favoriteCount,replyToSN,created,truncated,replyToSID,id,replyToUID,statusSource,screenName,retweetCount,isRetweet,retweeted
0,1,RT @rssurjewala: Critical question: Was PayTM ...,False,0,,11/23/16 18:40,False,,8.01496e+17,,"<a href=""http://twitter.com/download/android"" ...",HASHTAGFARZIWAL,331,True,False
1,2,RT @Hemant_80: Did you vote on #Demonetization...,False,0,,11/23/16 18:40,False,,8.01496e+17,,"<a href=""http://twitter.com/download/android"" ...",PRAMODKAUSHIK9,66,True,False
2,3,"RT @roshankar: Former FinSec, RBI Dy Governor,...",False,0,,11/23/16 18:40,False,,8.01496e+17,,"<a href=""http://twitter.com/download/android"" ...",rahulja13034944,12,True,False
3,4,RT @ANI_news: Gurugram (Haryana): Post office ...,False,0,,11/23/16 18:39,False,,8.01496e+17,,"<a href=""http://twitter.com/download/android"" ...",deeptiyvd,338,True,False
4,5,RT @satishacharya: Reddy Wedding! @mail_today ...,False,0,,11/23/16 18:39,False,,8.01495e+17,,"<a href=""http://cpimharyana.com"" rel=""nofollow...",CPIMBadli,120,True,False


Lower Casing
It is the most common and simplest text preprocessing technique, applicable to most text mining and NLP problems. The main goal is to convert the text to lower case so that ‘apple’, ‘Apple’ and ‘APPLE’ are treated the same way.

In [7]:
# Lower casing: store a lower-cased copy of 'text' in a new column, 'text_lower'.
# The original 'text' column is left untouched.
df['text_lower']  = df['text'].str.lower()
df['text_lower'].head()

0    rt @rssurjewala: critical question: was paytm ...
1    rt @hemant_80: did you vote on #demonetization...
2    rt @roshankar: former finsec, rbi dy governor,...
3    rt @ani_news: gurugram (haryana): post office ...
4    rt @satishacharya: reddy wedding! @mail_today ...
Name: text_lower, dtype: object

In [9]:
# Removal of punctuation
# Delete every character that is neither a word character (\w) nor whitespace (\s)
# and store the result in a new column called 'text_punct'.
# regex=True is passed explicitly: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, which would leave the text unchanged.
# The raw string keeps \w and \s from being parsed as (invalid) string escapes.
df['text_punct'] = df['text_lower'].str.replace(r'[^\w\s]', '', regex=True)
df['text_punct'].head()

0    rt rssurjewala critical question was paytm inf...
1    rt hemant_80 did you vote on demonetization on...
2    rt roshankar former finsec rbi dy governor cbd...
3    rt ani_news gurugram haryana post office emplo...
4    rt satishacharya reddy wedding mail_today cart...
Name: text_punct, dtype: object

Stop-word removal
Stop words are a set of commonly used words in a language. Examples of stop words in English are “a”, “we”, “the”, “is” and “are”. The idea behind removing stop words is that, by dropping low-information words from the text, we can focus on the important words instead. We can either create a custom list of stop words ourselves (based on the use case) or use a predefined list from a library.

In [10]:
# Load NLTK's English stop-word list once, as a set for fast membership tests.
# The corpus reader is imported under an alias so that the `stopwords` function
# defined below does not rebind the name the list is loaded from.
from nltk.corpus import stopwords as nltk_stopwords
STOPWORDS = set(nltk_stopwords.words('english'))


def stopwords(text):
    """Return `text` with every token found in STOPWORDS removed.

    Args:
        text: Value to clean; it is coerced with str() before processing.

    Returns:
        The whitespace-separated tokens of `text` that are not in STOPWORDS,
        re-joined with single spaces.
    """
    return " ".join(word for word in str(text).split() if word not in STOPWORDS)


# Apply the stop-word filter to 'text_punct' and store the result in 'text_stop'
df["text_stop"] = df["text_punct"].apply(stopwords)
df["text_stop"].head()

0    rt rssurjewala critical question paytm informe...
1     rt hemant_80 vote demonetization modi survey app
2    rt roshankar former finsec rbi dy governor cbd...
3    rt ani_news gurugram haryana post office emplo...
4    rt satishacharya reddy wedding mail_today cart...
Name: text_stop, dtype: object

Common word removal
We can also remove commonly occurring words from our text data. First, let’s check the 10 most frequently occurring words in our text data.

In [11]:
# Tally how often each whitespace-separated token occurs across 'text_stop'
from collections import Counter
cnt = Counter()
for tweet in df["text_stop"].values:
    # Counter.update adds one count per token in the list
    cnt.update(tweet.split())

# Show the 10 most frequent tokens together with their counts
cnt.most_common(10)

[('demonetization', 13956),
 ('rt', 11059),
 ('india', 2766),
 ('modi', 2759),
 ('pm', 2730),
 ('narendra', 1566),
 ('rich', 1509),
 ('find', 1422),
 ('dear', 1411),
 ('implement', 1399)]

Now we can remove the frequent words from the given corpus. This is taken care of automatically if we use tf-idf.

In [12]:
# Collect the 10 most frequent tokens (counts are discarded) into a set
freq = {token for token, _count in cnt.most_common(10)}


def freqwords(text):
    """Return `text` without the tokens contained in the module-level `freq`.

    The input is coerced with str(), split on whitespace, filtered and
    re-joined with single spaces. `freq` is looked up when the function is
    called, so rebinding `freq` later changes which tokens are removed.
    """
    kept = []
    for token in str(text).split():
        if token not in freq:
            kept.append(token)
    return " ".join(kept)


# Filter 'text_stop' row by row and store the result in 'text_common'
df["text_common"] = df["text_stop"].apply(freqwords)
df["text_common"].head()

0    rssurjewala critical question paytm informed e...
1                            hemant_80 vote survey app
2    roshankar former finsec rbi dy governor cbdt c...
3    ani_news gurugram haryana post office employee...
4    satishacharya reddy wedding mail_today cartoon...
Name: text_common, dtype: object

In [67]:
# OTHER OPTION - using NLTK to get word frequencies

In [43]:
# CREATE TOKENS
# Importing necessary library
import pandas as pd
import numpy as np
import nltk
import os
import nltk.corpus
# Sample text used to demonstrate tokenization
text = "In Brazil they drive on the right-hand side of the road. Brazil has a large coastline on the easternside of South America"
# word_tokenize is provided by NLTK's tokenize package
from nltk.tokenize import word_tokenize
# Split the sample string into a list of tokens; in the displayed result the
# sentence-ending '.' is a token of its own and 'right-hand' stays in one piece
token = word_tokenize(text)
token

['In',
 'Brazil',
 'they',
 'drive',
 'on',
 'the',
 'right-hand',
 'side',
 'of',
 'the',
 'road',
 '.',
 'Brazil',
 'has',
 'a',
 'large',
 'coastline',
 'on',
 'the',
 'easternside',
 'of',
 'South',
 'America']

In [44]:
# Count how many times each distinct token occurs.
# FreqDist is imported from nltk.probability and built directly from the token list.
from nltk.probability import FreqDist
fdist = FreqDist(token)
fdist

FreqDist({'the': 3, 'Brazil': 2, 'on': 2, 'of': 2, 'In': 1, 'they': 1, 'drive': 1, 'right-hand': 1, 'side': 1, 'road': 1, ...})

In [45]:
# Take the 10 most frequent tokens as a list of (token, count) pairs
fdist1 = fdist.most_common(10)
fdist1

[('the', 3),
 ('Brazil', 2),
 ('on', 2),
 ('of', 2),
 ('In', 1),
 ('they', 1),
 ('drive', 1),
 ('right-hand', 1),
 ('side', 1),
 ('road', 1)]

Rare word removal
This is very intuitive: words that are very rare, such as names, brands and product names, as well as noise characters such as HTML leftovers, often need to be removed for different NLP tasks. We can also use word length as a criterion, removing words that are very short or very long.

In [13]:
# Removal of the 10 rarest words, stored in a new column called 'text_rare'.
# Count every token across 'text_common'; value_counts() sorts by descending
# frequency, so the last 10 index entries are the least frequent tokens.
token_counts = pd.Series(' '.join(df['text_common']).split()).value_counts()
# A dedicated set is used instead of rebinding `freq`: `freqwords` reads `freq`
# at call time, so overwriting it here would change what that function removes.
# A set also makes each membership test constant-time.
rare_words = set(token_counts.index[-10:])
df['text_rare'] = df['text_common'].apply(
    lambda text: " ".join(word for word in text.split() if word not in rare_words)
)
df['text_rare'].head()

0    rssurjewala critical question paytm informed e...
1                            hemant_80 vote survey app
2    roshankar former finsec rbi dy governor cbdt c...
3    ani_news gurugram haryana post office employee...
4    satishacharya reddy wedding mail_today cartoon...
Name: text_rare, dtype: object

Spelling Correction
Social media data is always messy and contains spelling mistakes. Hence, spelling correction is a useful pre-processing step because it helps us avoid multiple variants of the same word. For example, “text” and “txt” will be treated as different words even if they are used in the same sense. This can be done with the textblob library.


In [14]:
# Spell-correct only the first 5 rows with TextBlob; the result is displayed, not stored.
# In the displayed output some domain words are altered as well
# (e.g. 'paytm' -> 'part', 'haryana' -> 'havana').
from textblob import TextBlob
df['text_rare'].iloc[:5].apply(lambda tweet: str(TextBlob(tweet).correct()))

0    rssurjewala critical question part informed ed...
1                             hemant_80 vote survey pp
2    roshankar former finsen roi dy governor but ch...
3    ani_news gurugram havana post office employees...
4    satishacharya ready wedding mail_today cartoon...
Name: text_rare, dtype: object

In [62]:
# Pattern used for emoji removal, compiled once instead of on every call.
# The Basic Multilingual Plane part is limited to symbol blocks on purpose:
# a single wide range such as U+24C2-U+1F251 would also span CJK ideographs,
# kana and Hangul and therefore delete ordinary text written in those scripts.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"             # circled latin capital letter M
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U00003030\U0000303D\U00003297\U00003299"  # CJK-block symbols used as emoji
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "\U0001F000-\U0001F251"  # mahjong, dominoes, cards, enclosed characters
    "]+",
    flags=re.UNICODE,
)


# Function to remove emoji.
def emoji(string):
    """Return `string` with emoji characters removed.

    Args:
        string: Text to clean. The parameter name is kept for compatibility
            with existing callers, although it shadows the `string` module
            inside this function.

    Returns:
        `string` with every run of characters matched by `_EMOJI_PATTERN`
        deleted; all other characters, including non-Latin scripts, are kept.
    """
    return _EMOJI_PATTERN.sub(r'', string)
emoji("Hi, I am Emoji 😉")


'Hi, I am Emoji '

In [63]:
# Strip emoji from every row of 'text_rare' by applying emoji() element-wise;
# the column is overwritten with the cleaned text.
df['text_rare'] = df['text_rare'].apply(emoji)

In [64]:
# Requires the third-party `emot` package (pip install emot)
from emot.emo_unicode import UNICODE_EMO, EMOTICONS


def remove_emoticons(text):
    """Delete every emoticon listed in the EMOTICONS mapping from `text`.

    The mapping's keys are joined with '|' into a single alternation group
    (they are inserted into the regex as-is, without escaping) and every
    match is replaced by the empty string.
    """
    alternation = u'|'.join(EMOTICONS)
    return re.sub(u'(' + alternation + u')', r'', text)


remove_emoticons("Hello :-)")


'Hello '

In [65]:
# Strip emoticons from every row of 'text_rare' by applying remove_emoticons()
# element-wise; the column is overwritten with the cleaned text.
df['text_rare'] = df['text_rare'].apply(remove_emoticons)

Converting Emoji and Emoticons to words
In sentiment analysis, emojis and emoticons express an emotion. Hence, removing them might not be a good solution.

In [46]:
from emot.emo_unicode import UNICODE_EMO, EMOTICONS
# Converting emojis to words

def convert_emojis(text):
    """Replace each emoji found in `text` with a word token built from UNICODE_EMO.

    For every entry of UNICODE_EMO the description has its commas and colons
    removed and its whitespace-separated words joined with underscores; that
    token is substituted for each occurrence of the emoji.
    """
    for symbol, description in UNICODE_EMO.items():
        words = description.replace(",", "").replace(":", "").split()
        text = text.replace(symbol, "_".join(words))
    return text
# Function for converting emoticons into words
def convert_emoticons(text):
    """Replace each emoticon found in `text` with a word token built from EMOTICONS.

    Every key of EMOTICONS is wrapped in a group and used as a regex pattern;
    its description has commas removed and its whitespace-separated words
    joined with underscores, and that token replaces each match.
    """
    for pattern, meaning in EMOTICONS.items():
        label = "_".join(meaning.replace(",", "").split())
        text = re.sub(u'(' + pattern + ')', label, text)
    return text

In [47]:
# Example: both ':-)' occurrences are replaced by the same underscore-joined description
text = "Hello :-) :-)"
convert_emoticons(text)


'Hello Happy_face_smiley Happy_face_smiley'

In [48]:
# Example: the emoji is replaced by its underscore-joined description
text1 = "Hilarious 😂"
convert_emojis(text1)

'Hilarious face_with_tears_of_joy'

In [49]:
# Convert emoticons first, then emojis, overwriting 'text_rare' each time.
# NOTE(review): 'text_rare' is derived from 'text_punct', where characters other
# than word characters and whitespace were already removed, so there may be
# nothing left to convert at this point - confirm the intended cell order.
df['text_rare'] = df['text_rare'].apply(convert_emoticons)
df['text_rare'] = df['text_rare'].apply(convert_emojis)

Removal of URL’s
Removing URLs from the text. We can use a regular expression for this.


In [50]:
# Function for stripping URLs
def remove_urls(text):
    """Return `text` with every URL removed.

    A URL is anything that starts with 'http://', 'https://' or 'www.' and
    runs up to the next whitespace character; each match is replaced by the
    empty string, so surrounding whitespace is left in place.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)


In [51]:

# Example: the trailing URL is removed while the text before it, including
# the space after the comma, is kept
text = "This is my website, https://www.abc.com"
remove_urls(text)



'This is my website, '

In [52]:

# Strip URLs from every row of 'text_rare' by applying remove_urls() element-wise;
# the column is overwritten with the cleaned text.
df['text_rare'] = df['text_rare'].apply(remove_urls)

Removal of HTML tags
Another common preprocessing technique is removing HTML tags. HTML tags are usually present in scraped data. We can use the Beautiful Soup library for this.

In [58]:
from bs4 import BeautifulSoup
# Function for stripping HTML markup
def html(text):
    """Parse `text` with BeautifulSoup's "lxml" parser and return only its text content."""
    soup = BeautifulSoup(text, "lxml")
    return soup.text
# Example: a small HTML fragment with nested tags and a link; the printed
# result contains only the text between the tags
text = """<div>
<h1> This</h1>
<p> is</p>
<a href="https://www.abc.com/"> ABCD</a>
</div>
"""
print(html(text))



 This
 is
 ABCD




In [59]:
# Strip HTML markup from every row of 'text_rare' by applying html() element-wise;
# the column is overwritten with the extracted text.
df['text_rare'] = df['text_rare'].apply(html)

Stemming and Lemmatization
Lemmatization is the process of converting a word to its base form. The difference between stemming and lemmatization is that lemmatization considers the context and converts the word to its meaningful base form, whereas stemming just removes the last few characters, often leading to incorrect meanings and spelling errors. Here, only lemmatization is performed. We need to provide the POS tag of the word along with the word to the lemmatizer in NLTK. Depending on the POS, the lemmatizer may return different results.

In [57]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
# First letter of a POS tag -> WordNet part of speech (noun, verb, adjective, adverb)
wordnet_map = {
    "N": wordnet.NOUN,
    "V": wordnet.VERB,
    "J": wordnet.ADJ,
    "R": wordnet.ADV,
}


def lemmatize_words(text):
    """Lemmatize every whitespace-separated word of `text` using its POS tag.

    The words are tagged with nltk.pos_tag; the first letter of each tag is
    looked up in `wordnet_map` (falling back to noun when it is not listed)
    and passed to the WordNet lemmatizer together with the word. The lemmas
    are re-joined with single spaces.
    """
    lemmas = []
    for word, tag in nltk.pos_tag(text.split()):
        wordnet_pos = wordnet_map.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(word, wordnet_pos))
    return " ".join(lemmas)


# Lemmatize every row of 'text_rare' and store the result in 'text_lemma'
df["text_lemma"] = df["text_rare"].apply(lemmatize_words)