# Text Preprocessing

In [2]:
import pandas as pd

## import dataset

In [5]:
url='https://raw.githubusercontent.com/Ankit152/IMDB-sentiment-analysis/master/IMDB-Dataset.csv'
df = pd.read_csv(url)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Lowercasing string

In [10]:
df['review']=df['review'].str.lower()
df.head(3)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive


# Removing HTML Tags from the text

In [13]:
import re
def removeHTMLTags(text):
    pattern = r'<.*?>'
    return (re.sub(pattern,'',text))

In [14]:
df['review']=df['review'].apply(removeHTMLTags)
df.head(3)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive


## Removing URLs

In [25]:
pattern ='https?:\S+|www\.\S+'
tmp=df[df['review'].str.contains(pattern)]['review']
for st in tmp:
    print(st,end='\n\n')

mario lewis of the competitive enterprise institute has written a definitive 120-page point-by-point, line-by-line refutation of this mendacious film, which should be titled a convenient lie. the website address where his debunking report, which is titled "a skeptic's guide to an inconvenient truth" can be found at is :www.cei.org. a shorter 10-page version can be found at: www.cei.org/pdf/5539.pdf once you read those demolitions, you'll realize that alleged "global warming" is no more real or dangerous than the y2k scare of 1999, which gore also endorsed, as he did the pseudo-scientific film the day after tomorrow, which was based on a book written by alleged ufo abductee whitley strieber. as james "the amazing" randi does to psychics, and philip klass does to ufos, and gerald posner does to jfk conspir-idiocy theories, so does mario lewis does to al gore's movie and the whole "global warming" scam.

following directly from where the story left off in part one, the second half which s

In [26]:
def removeURLs(text):
    pattern='https?:\S+|www\.\S+'
    return re.sub(pattern, '',text)

In [29]:
df['review']=df['review'].apply(removeURLs)

In [30]:
tmp=df[df['review'].str.contains(pattern)]['review']
for st in tmp:
    print(st,end='\n\n')

## Remove Punctuation

In [31]:
import string,time

In [35]:
exclude = string.punctuation
exclude

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [47]:
def remove_punc(text):
  for c in exclude:
     text=text.replace(c,'')
  return text
t1=time.time()
remove_punc('hello, world!++++++,ok bos.wwljf .')
t2=time.time()
print((t2-t1)*50000)

6.735324859619141


this is not efficient for a big dataset. There is another way which is faster than the previous one

In [48]:
def remove_punc1(text):
  return text.translate(str.maketrans('','',exclude))
t1=time.time()
remove_punc1('hello, world!++++++,ok bos.wwljf .')
t2=time.time()
print((t2-t1)*50000)

5.519390106201172


In [54]:
t1=time.time()
df['review'].apply(remove_punc1)
t2=time.time()
print(t2-t1)

1.2471754550933838


In [55]:
t1=time.time()
df['review'].apply(remove_punc)
t2=time.time()
print(t2-t1)

1.8608641624450684


In [56]:
df['review']=df['review'].apply(remove_punc)

# Spelling Correction
beacuse of spelling mistake our model may treat same word as two different word. that may create our model more complex. As a result, the performance may be reduced

In [57]:
from textblob import TextBlob

In [63]:
import textblob
incorrectText="here I am learning English. hier I m lerning English"
textBlb = TextBlob(incorrectText)
textBlb.correct().string

'here I am learning English. her I m leaning English'

## Removing Stop words
**stop words** a,the, are, an,

In [68]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [71]:
from nltk.corpus import stopwords

In [76]:
len(list(stopwords.words("german"))),len(list(stopwords.words("english"))),len(list(stopwords.words("spanish")))

(232, 179, 313)

In [80]:
def remove_stopwords(text):
  stopWords=list(stopwords.words('english'))
  newText=[]
  for word in text.split():
     if word not in stopWords:
        newText.append(word)
  return ' '.join(newText)
print(remove_stopwords("my name is Rana. I do not like to go to work"))


name Rana. I like go work


In [81]:
df['review']=df['review'].apply(remove_stopwords)
df.head(4)

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically theres family little boy jake thinks...,negative


# Handling Emojis
+ remove
+ replace

# using u-code

In [83]:
text = u'This is a smiley face \U0001f602\U0001F680'
print(text) # with emoji

def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

print(deEmojify(text))

This is a smiley face 😂🚀
This is a smiley face 


# using **emoji** module from python to replace emoji with it's meaning

In [85]:
!pip install emoji
import emoji


Collecting emoji
  Downloading emoji-2.8.0-py2.py3-none-any.whl (358 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m358.9/358.9 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.8.0


In [86]:
print("python is :❤️")
print(emoji.demojize("python is :❤️"))

python is :❤️
python is ::red_heart:


## Tokenization
+ this is a processs of breaking the text doc into smaller parts(words, sentence). mostly wods tokenization is used.


## why Tokenization
+ Text Segmentation
+ Feature Extraction
+ Text Cleaning
+ Stop Word Removal
+ Text Normalization
+ Machine Learning Input


# Tokenization using ```split()``` method





In [89]:
"Hello world".split() # word level

['Hello', 'world']

In [88]:
"I learn python. today is Friday".split('.')

['I learn python', ' today is Friday']

## Problems with ```split()``` method

In [90]:
"Hello world!".split() # couldn't remove '!'

['Hello', 'world!']

In [91]:
"What do you learn? I learn python".split('.')# couldn't split

['What do you learn? I learn python']

# Using Regular Expression

In [95]:
re.findall('[\w]+','I am going to kill you!')

['I', 'am', 'going', 'to', 'kill', 'you']

In [99]:
re.compile('[.,?!\']').split("are you sure?no!okay let us talk,chill")

['are you sure', 'no', 'okay let us talk', 'chill']

# Using ``` NLTK``` libray

In [102]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [103]:
from nltk.tokenize import word_tokenize,sent_tokenize

In [104]:
word_tokenize("I am going to kill you!")

['I', 'am', 'going', 'to', 'kill', 'you', '!']

In [106]:
sent_tokenize("are you sure? no! okay let us talk, chill")

['are you sure?', 'no!', 'okay let us talk, chill']

In [108]:
word_tokenize("this costs $10.8")

['this', 'costs', '$', '10.8']

In [109]:
word_tokenize("I have a PhD in A.I")

['I', 'have', 'a', 'PhD', 'in', 'A.I']

## Using ```Spacy``` libray.
this is the best approach

In [111]:
import spacy
nlp=spacy.load("en_core_web_sm")

In [116]:
doc=nlp('I have a Ph.d in A.I from U.S.A! jewelbutex12@gmail.com')
for token in doc:
  print(token)

I
have
a
Ph.d
in
A.I
from
U.S.A
!
jewelbutex12@gmail.com


# Inflection
In grammer, inflection is the modification of a word to express different grammatical categories such as tense, case, voice, aspect, number, gender, mode.

# Stemming
this is the process of reducing inflection in words to their root forms such as mapping a group of words to the same stem even if the stem itself is not a valid word in the language.

# stemming using ```NLTK``` library

In [117]:
from nltk.stem.porter import PorterStemmer

In [118]:
ps= PorterStemmer()
def stem_word(text):
   return ' '.join([ps.stem(word) for word in text.split()])

In [119]:
stem_word('walk walking walked walks')

'walk walk walk walk'

In [120]:
ps.stem("probably") # not a english word. problem of stemming

'probabl'

# Lemmatization
unlike Stemming, reduces the inflected words properly ensuring that the root word belongs to the language. In Lemmatization root word is called **Lemma**. A lemma is the canonical form, dictionary form or citation form of a set of words.

In [123]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
wordnet_lemma=WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [130]:
wordnet_lemma.lemmatize("walked",pos='v')

'walk'

# Some more example of Stemming

In [132]:
example_words = ["program","programming","programer","programs","programmed"]
print("{0:20}{1:20}".format("--word--","--stem--"))
for word in example_words:
    print("{0:20}{1:20}".format(word,ps.stem(word)))


--word--            --stem--            
program             program             
programming         program             
programer           program             
programs            program             
programmed          program             


In [133]:
example_sentence = "Python programmers often tend like programming in python because it's like english. We call people who program in python pythonistas."
# Remove punctuation
example_sentence_no_punct=example_sentence.translate(str.maketrans('','',string.punctuation))
# create tokens
word_tokens= word_tokenize(example_sentence_no_punct)
word_tokens


['Python',
 'programmers',
 'often',
 'tend',
 'like',
 'programming',
 'in',
 'python',
 'because',
 'its',
 'like',
 'english',
 'We',
 'call',
 'people',
 'who',
 'program',
 'in',
 'python',
 'pythonistas']

In [138]:
print("{0:20}{1:20}".format("--word--","--stem--"))
for word in word_tokens:
    print("{0:20}{1:20}".format(word,ps.stem(word)))

--word--            --stem--            
Python              python              
programmers         programm            
often               often               
tend                tend                
like                like                
programming         program             
in                  in                  
python              python              
because             becaus              
its                 it                  
like                like                
english             english             
We                  we                  
call                call                
people              peopl               
who                 who                 
program             program             
in                  in                  
python              python              
pythonistas         pythonista          


# Some more Example of Lemmatization

In [135]:
from nltk.stem import WordNetLemmatizer
wnl=WordNetLemmatizer()


In [137]:
print("{0:20}{1:20}".format("--word--","--stem--"))
for word in example_words:
    print("{0:20}{1:20}".format(word,wnl.lemmatize(word,pos='v')))

--word--            --stem--            
program             program             
programming         program             
programer           programer           
programs            program             
programmed          program             


In [139]:
print("{0:20}{1:20}".format("--word--","--stem--"))
for word in word_tokens:
    print("{0:20}{1:20}".format(word,wnl.lemmatize(word,pos='v')))

--word--            --stem--            
Python              Python              
programmers         programmers         
often               often               
tend                tend                
like                like                
programming         program             
in                  in                  
python              python              
because             because             
its                 its                 
like                like                
english             english             
We                  We                  
call                call                
people              people              
who                 who                 
program             program             
in                  in                  
python              python              
pythonistas         pythonistas         
