In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


## Text Preprocessing

In [2]:
df = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df.shape

(50000, 2)

### Convert review column into Lowercase

In [5]:
df['review'][3].lower()

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.<br /><br />ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

In [6]:
df['review'] = df['review'].str.lower()           # to convert in lower case

In [7]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


### Remove HTML Tags


In [8]:
import re   # regular expression

In [9]:
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'',text)

In [10]:
text= "<div><p>I watched this movie yesterday, and it was <b>amazing</b>!"

In [11]:
remove_html_tags(text)

'I watched this movie yesterday, and it was amazing!'

In [12]:
df['review'] = df['review'].apply(remove_html_tags)

In [13]:
df['review'][3]

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

### Remove URLs

In [14]:
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

In [15]:
text1 = "Check out my notebook https://www.kaggle.com/campusx/notebook12345"
text2 = "Visit our project at http://github.com/user/repo to get the code"
text3 = "Google search here www.google.com for more info"
text4 = "For documentation click https://docs.python.org/3/tutorial to learn Python"

In [16]:
print(remove_url(text1))
print(remove_url(text2))
print(remove_url(text3))
print(remove_url(text4))

Check out my notebook 
Visit our project at  to get the code
Google search here  for more info
For documentation click  to learn Python


### Remove Punctuation

In [17]:
import string,time
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [18]:
exclude = string.punctuation

In [19]:
def remove_punc(text):
    for char in exclude:
        text = text.replace(char, '')
    return text

In [20]:
text = "string. text!, punctuation? /"

In [21]:
remove_punc(text)

'string text punctuation '

In [22]:
start = time.time()
print(remove_punc(text))
time1 = time.time()-start
print(time1)

string text punctuation 
0.00014591217041015625


This method is taking time to run. If dataset will large, this will take more time to run.

In [23]:
def remove_punc1(text):
    return text.translate(str.maketrans('', '', exclude))

### Spelling Correction

In [29]:
from textblob import TextBlob

In [34]:
incorrect_text = "My name is Kahkashan. I m fram Utter Pradash. I jast Graduated from IIT Madrass"

text_blb = TextBlob(incorrect_text)
text_blb.correct().string

'By name is Kahkashan. I m from Utter Pradash. I just Graduated from IIT Madrass'

### Removing Stopwords

In [36]:
from nltk.corpus import stopwords

In [37]:
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [40]:
def remove_stopwords(text):
    new_txt = []
    
    for word in text.split():
        if word in stopwords.words('english'):
            new_txt.append('')
        else:
            new_txt.append(word)
    x = new_txt[:]
    new_txt.clear()
    return " ".join(x)

In [42]:
remove_stopwords("Hello, World! How are you?, I am a serial Killer")

'Hello, World! How  you?, I   serial Killer'

In [43]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [44]:
df['review'] = df['review'].apply(remove_stopwords)

In [45]:
df['review']

0        one    reviewers  mentioned   watching  1 oz e...
1         wonderful little production.  filming techniq...
2         thought    wonderful way  spend time    hot s...
3        basically there's  family   little boy (jake) ...
4        petter mattei's "love   time  money"   visuall...
                               ...                        
49995     thought  movie    right good job.    creative...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997       catholic taught  parochial elementary schoo...
49998     going    disagree   previous comment  side  m...
49999     one expects  star trek movies   high art,   f...
Name: review, Length: 50000, dtype: object

### Handling Emoji 🌸

In [48]:
# To remove emoji
def remove_emoji(text):
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags
        "]+", flags=re.UNICODE)
    
    return emoji_pattern.sub(r'', text)

In [49]:
text = "I love Python 😍🐍🔥"
print(remove_emoji(text))

I love Python 


In [50]:
import emoji

# Replace emojis with their text description
def emoji_to_text(text):
    return emoji.demojize(text)

In [51]:
text = "I love Python 🐍"
converted_text = emoji_to_text(text)
print(converted_text)

I love Python :snake:


In [53]:
text = "Good morning! ☀️🌸🍵"
converted_text = emoji_to_text(text)
print(converted_text)

Good morning! :sun::cherry_blossom::teacup_without_handle:


---

# **Tokenization**

Tokenization is the process of breaking down text into smaller, meaningful units called tokens. These tokens can be:

* Words (word-level tokenization)

* Subwords (subword-level tokenization)

* Characters (character-level tokenization)

* Sentences (sentence-level tokenization)

Challenges:
* Prefix
* Suffix
* Infix
* Exception

### 1. Using Split Function

In [1]:
# word tokenization
sent1 = "I am going to Delhi"
sent1.split()

['I', 'am', 'going', 'to', 'Delhi']

In [4]:
# sentence tokenization
text = "I am going to Delhi. I will stay there for 3 days. Let\'s hope the trip to be great"
text.split('.')

['I am going to Delhi',
 ' I will stay there for 3 days',
 " Let's hope the trip to be great"]

In [5]:
# problem with split function
sent = 'I am going to Delhi!'
sent.split()    

['I', 'am', 'going', 'to', 'Delhi!']

### 2. Regular Expression

In [6]:
import re

sent = 'I am going to Delhi!'
tokens = re.findall("[\w']+", sent)
tokens

['I', 'am', 'going', 'to', 'Delhi']

In [8]:
text = """Lorem Ipsum is simply dummy text of the printing and typesetting industry? 
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, 
when an unknown printer took a galley of type and scrambled it to make a type specimen book."""

sentence = re.compile('[.!?]').split(text)
sentence

['Lorem Ipsum is simply dummy text of the printing and typesetting industry',
 " \nLorem Ipsum has been the industry's standard dummy text ever since the 1500s, \nwhen an unknown printer took a galley of type and scrambled it to make a type specimen book",
 '']

### 3. Using NLTk library

In [9]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [10]:
text = 'I am going to visit Delhi!'
word_tokenize(text)

['I', 'am', 'going', 'to', 'visit', 'Delhi', '!']

In [12]:
text = """Lorem Ipsum is simply dummy text of the printing and typesetting industry? 
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, 
when an unknown printer took a galley of type and scrambled it to make a type specimen book."""

sent_tokenize(text)

['Lorem Ipsum is simply dummy text of the printing and typesetting industry?',
 "Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, \nwhen an unknown printer took a galley of type and scrambled it to make a type specimen book."]

In [13]:
sent_1 = "I have a Ph.D in A.I"
sent_2 = "We're here to help! mail us at kashish34@gmail.com"
sent_3 = 'A 5km ride code $10.50'

In [14]:
word_tokenize(sent1)

['I', 'am', 'going', 'to', 'Delhi']

In [18]:
word_tokenize(sent_2)    # not good

['We',
 "'re",
 'here',
 'to',
 'help',
 '!',
 'mail',
 'us',
 'at',
 'kashish34',
 '@',
 'gmail.com']

In [17]:
word_tokenize(sent_3)

['A', '5km', 'ride', 'code', '$', '10.50']

### 4. SpaCy Library

spaCy is a popular open-source library in Python for Natural Language Processing (NLP). It’s designed to help machines understand and process human language efficiently and accurately

In [19]:
import spacy

In [21]:
nlp = spacy.load('en_core_web_sm')

In [22]:
doc1 = nlp(sent_1)
doc2 = nlp(sent_2)
doc3 = nlp(sent_3)

In [23]:
for token in doc1:
    print(token)

I
have
a
Ph
.
D
in
A.I


In [24]:
for token in doc2:
    print(token)

We
're
here
to
help
!
mail
us
at
kashish34@gmail.com


In [25]:
for token in doc3:
    print(token)

A
5
km
ride
code
$
10.50


#### *  SpaCy library perform better than nltk in some case

## Stemming 

Stemming is a text preprocessing technique in Natural Language Processing (NLP) where words are reduced to their root form (called the stem) by chopping off suffixes or prefixes.

* running → run
* runner  → run
* easily  → easili
* flies   → fli


In [28]:
from nltk.stem.porter import PorterStemmer

In [32]:
ps = PorterStemmer()
def stem_words(text):
    return ' '.join([ps.stem(word) for word in text.split()])

In [33]:
sample = 'walk walks walking walked'
stem_words(sample)

'walk walk walk walk'

In [34]:
text = "Natural language processing involves analyzing, understanding, and generating human languages. Researchers are developing tools for processing texts, analyzing sentences, and identifying relationships between words. Running experiments and testing algorithms helps in improving the accuracy of these systems."
print(text)

Natural language processing involves analyzing, understanding, and generating human languages. Researchers are developing tools for processing texts, analyzing sentences, and identifying relationships between words. Running experiments and testing algorithms helps in improving the accuracy of these systems.


In [35]:
stem_words(text)

'natur languag process involv analyzing, understanding, and gener human languages. research are develop tool for process texts, analyz sentences, and identifi relationship between words. run experi and test algorithm help in improv the accuraci of these systems.'

### Lemmatization

Lemmatization is a text preprocessing technique in Natural Language Processing (NLP) where words are reduced to their base or dictionary form (called a lemma).


Unlike stemming, which just chops off prefixes or suffixes, lemmatization uses linguistic knowledge (vocabulary + grammar rules) to make sure the base form is a valid word.

In [36]:
import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

sentence = "He was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun."
punctuations="?;:!.,"

sentence_words = nltk.word_tokenize(sentence)
for word in sentence_words:
    if word in punctuations:
        sentence_words.remove(word)

sentence_words
print("{0:20}{1:20}".format("Word","Lemma"))
for word in sentence_words:
    print("{0:20}{1:20}".format(word, wordnet_lemmatizer.lemmatize(word)))

Word                Lemma               
He                  He                  
was                 wa                  
running             running             
and                 and                 
eating              eating              
at                  at                  
same                same                
time                time                
He                  He                  
has                 ha                  
bad                 bad                 
habit               habit               
of                  of                  
swimming            swimming            
after               after               
playing             playing             
long                long                
hours               hour                
in                  in                  
the                 the                 
Sun                 Sun                 
