## This Notebook is made to demonstarte the code for the article Text Cleaning NLP (Patr-2)
To read full article follow this link here

## Removing HTML elements

In [1]:
import re

text = """ <body>
<h1>I had such high hopes for this dress 15 size or (my usual size) to work for me.</h1>
</body>
"""
without_html = re.sub(pattern=r'<.*?>', repl=' ', string=text)
print(f"{without_html}")

#Output
#I had such high hopes for this dress 15 size or (my usual size) to work for me. 

  
 I had such high hopes for this dress 15 size or (my usual size) to work for me. 
 



## Removing & Finding URL


### Finding the URL from the text

In [2]:
import spacy
nlp = spacy.load("en_core_web_sm")
text = 'My email is http://abcgmail.com'
doc = nlp(text)
for token in doc:
    if token.like_url:
        print(token)

#Output
#http://abcgmail.com

http://abcgmail.com


### Removing the URL from the text

In [3]:
text = 'Look at this link http://abcgmail.com for work purpose https://abd.com'
text_sp = text.split()
ans = ' '.join([i for i in text_sp if 'ht' not in i])
ans

#Output
#Look at this link for work purpose

'Look at this link for work purpose'

## Removing & Finding Email id

### Finding if the Email id from the text

In [4]:
import spacy
nlp = spacy.load("en_core_web_sm")
text = 'My email is abc@gmail.com'
doc = nlp(text)
for token in doc:
    if token.like_email:
        print(token)

#Output
#abc@gmail.com

abc@gmail.com


### Removing the Email id from the text

In [5]:
text = 'My email is abc@gmail.com for work purpose'
text_sp = text.split()
ans = ' '.join([i for i in text_sp if '@' not in i])
ans

#Output
#My email is

'My email is for work purpose'

In [6]:
import spacy
nlp = spacy.load("en_core_web_sm")
text = 'My email is abc@gmail.com'
doc = nlp(text)
for token in doc:
    if not token.like_email:
        print(token)

#Output
#My email is

My
email
is


## Removing Stop Words

In [7]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

text = "I had such high hopes for this dress 1-5 size to work for me." 
STOPWORDS = set(stopwords.words('english'))
ans = " ".join([word for word in str(text).split() if word not in STOPWORDS])
ans

#Output
#I high hopes dress 1-5 size work me.

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


'I high hopes dress 1-5 size work me.'

In [8]:
# We can see which words are stop words
print(nlp.Defaults.stop_words)

{'ours', 'becomes', 'with', 'mine', 'everything', 'same', 'still', 'was', 'his', 'moreover', 'myself', 'made', 'who', 'should', 'is', 'are', 'amongst', 'three', 'for', 'where', 'further', 'during', 'every', 'nine', 'anyone', 'between', 'us', 'regarding', 'your', 'any', 'becoming', 'must', 'up', 'eleven', 'n‘t', 'why', 'whether', 'yet', 'such', 'used', 'no', 'afterwards', 'show', 'five', 'may', "'m", 'nobody', 'almost', 'these', '‘re', 'take', 'bottom', 'one', 'meanwhile', 'either', 'under', 'down', 'this', "'ll", 'until', 'namely', 'hers', 'she', 'everywhere', 'each', 'therein', 'whole', 'him', 'very', 'empty', '‘ll', 're', "'re", 'here', 'out', 'noone', 'yours', 'latter', 'yourselves', 'from', 'thereupon', 'something', 'whereupon', 'although', 'get', 'all', 'even', 'however', 'ten', 'others', 'can', 'became', 'make', 'my', 'fifty', 'just', 'third', 'nor', 'itself', 'at', 'since', 'upon', 'none', 'what', 'whenever', 'thus', 'fifteen', 'only', 'whence', 'someone', 'side', 'many', 'put',

In [9]:
STOP_WORDS |= {"1-5"}

print(STOP_WORDS) #checking the word "1-5" is added to the list or not

{'ours', 'becomes', 'with', 'mine', 'everything', 'same', 'still', 'was', 'his', 'moreover', 'myself', 'made', 'who', 'should', 'is', 'are', 'amongst', 'three', 'for', 'where', 'further', 'during', 'every', 'nine', 'anyone', 'between', 'us', 'regarding', 'your', 'any', 'becoming', 'must', 'up', 'eleven', 'n‘t', 'why', 'whether', 'yet', 'such', 'used', 'no', 'afterwards', 'show', 'five', 'may', "'m", 'nobody', 'almost', 'these', '‘re', 'take', 'bottom', 'one', 'meanwhile', 'either', 'under', 'down', 'this', "'ll", 'until', 'namely', 'hers', 'she', 'everywhere', 'each', 'therein', 'whole', 'him', 'very', 'empty', '‘ll', 're', "'re", 'here', 'out', 'noone', 'yours', 'latter', 'yourselves', 'from', 'thereupon', 'something', 'whereupon', 'although', 'get', 'all', 'even', 'however', 'ten', 'others', 'can', 'became', 'make', 'my', 'fifty', 'just', 'third', 'nor', 'itself', 'at', 'since', 'upon', 'none', 'what', 'whenever', 'thus', 'fifteen', 'only', 'whence', 'someone', 'side', 'many', 'put',

In [10]:
text = "I had such high hopes for this dress 1-5 size to work for me." 
ans = " ".join([word for word in str(text).split() if word not in STOP_WORDS])
ans

#Output
#I high hopes dress size work me.

'I high hopes dress size work me.'

## Standardizing and Spell Check

In [11]:
!pip install autocorrect



In [12]:
import itertools
from autocorrect import Speller

text="A farmmer will lovdd this food"
#One letter in a word should not be present more than twice in continuation
text_correction = ''.join(''.join(s)[:2] for _, s in itertools.groupby(text))
print("Normal Text:\n{}".format(text_correction))

spell = Speller(lang='en')
ans = spell(text_correction)
print("After correcting text:\n{}".format(ans))

#Output
#Normal Text:
#A farmmer will lovdd this food
#After correcting text:
#A farmer will loved this food

Normal Text:
A farmmer will lovdd this food
After correcting text:
A farmer will loved this food


## Chat Words Conversion

In [13]:
chat_words_str = """
AFAIK=As Far As I Know
AFK=Away From Keyboard
ASAP=As Soon As Possible
ATK=At The Keyboard
ATM=At The Moment
A3=Anytime, Anywhere, Anyplace"""

In [14]:
chat_words_map_dict = {}
chat_words_list = []
for line in chat_words_str.split("\n"):
    if line != "":
        cw = line.split("=")[0]
        cw_expanded = line.split("=")[1]
        chat_words_list.append(cw)
        chat_words_map_dict[cw] = cw_expanded
chat_words_list = set(chat_words_list)

def chat_words_conversion(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words_list:
            new_text.append(chat_words_map_dict[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

chat_words_conversion("one minute A3")

'one minute Anytime, Anywhere, Anyplace'

## Remove the frequent words

In [15]:
from google.colab import files
uploaded = files.upload()

Saving NLP cleaning part-2.csv to NLP cleaning part-2.csv


In [16]:
import pandas as pd
df =  pd.read_csv('NLP cleaning part-2.csv')
df[:3]

Unnamed: 0,text
0,"Monica and the gang introduce Rachel to the ""r..."
1,Ross finds out his ex-wife is pregnant. Rachel...
2,Monica becomes irritated when everyone likes h...


In [17]:
from collections import Counter
cnt = Counter()
for text in df["text"].values:
    for word in text.split():
        cnt[word] += 1
        
cnt.most_common(10)

[('a', 36),
 ('the', 23),
 ('to', 23),
 ('and', 22),
 ('Ross', 15),
 ('his', 15),
 ('Chandler', 14),
 ('with', 14),
 ('Monica', 13),
 ('Rachel', 12)]

In [18]:
FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])
def remove_freqwords(text):
    """custom function to remove the frequent words"""
    return " ".join([word for word in str(text).split() if word not in FREQWORDS])

df["text_wo_stopfreq"] = df["text"].apply(lambda text: remove_freqwords(text))
df.head()

Unnamed: 0,text,text_wo_stopfreq
0,"Monica and the gang introduce Rachel to the ""r...","gang introduce ""real world"" after she leaves h..."
1,Ross finds out his ex-wife is pregnant. Rachel...,finds out ex-wife is pregnant. returns her eng...
2,Monica becomes irritated when everyone likes h...,becomes irritated when everyone likes her new ...
3,Joey and Chandler take Ross to a hockey game t...,Joey take hockey game take mind off anniversar...
4,"Eager to spend time with Rachel, Ross pretends...","Eager spend time Rachel, pretends washroom is ..."


## Removing the less frequent words

In [19]:
n_rare_words = 10
RAREWORDS = set([w for (w, wc) in cnt.most_common()[:-n_rare_words-1:-1]])
def remove_rarewords(text):
    """custom function to remove the rare words"""
    return " ".join([word for word in str(text).split() if word not in RAREWORDS])

df["text_wo_stopfreqrare"] = df["text_wo_stopfreq"].apply(lambda text: remove_rarewords(text))
df.head()

Unnamed: 0,text,text_wo_stopfreq,text_wo_stopfreqrare
0,"Monica and the gang introduce Rachel to the ""r...","gang introduce ""real world"" after she leaves h...","gang introduce ""real world"" after she leaves h..."
1,Ross finds out his ex-wife is pregnant. Rachel...,finds out ex-wife is pregnant. returns her eng...,finds out ex-wife is pregnant. returns her eng...
2,Monica becomes irritated when everyone likes h...,becomes irritated when everyone likes her new ...,becomes irritated when everyone likes her new ...
3,Joey and Chandler take Ross to a hockey game t...,Joey take hockey game take mind off anniversar...,Joey take hockey game take mind off anniversar...
4,"Eager to spend time with Rachel, Ross pretends...","Eager spend time Rachel, pretends washroom is ...","Eager spend time Rachel, pretends washroom is ..."
