In [1]:
import pandas as pd
import numpy as np 
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
import nltk
import string
import spacy

In [2]:
dataset = pd.read_excel("C:/Users/Admin/Documents/IMDB_Review_2.xlsx")
print(dataset)

                                               review sentiment
0   One of the other reviewers has mentioned that ...  positive
1   A wonderful little production. <br /><br />The...  positive
2   I thought this was a wonderful way to spend ti...  positive
3   Basically there's a family where a little boy ...  negative
4   Petter Mattei's "Love in the Time of Money" is...  positive
5   Probably my all-time favorite movie, a story o...  positive
6   I sure would like to see a resurrection of a u...  positive
7   This show was an amazing, fresh & innovative i...  negative
8   Encouraged by the positive comments about this...  negative
9   If you like original gut wrenching laughter yo...  positive
10  Phil the Alien is one of those quirky films wh...  negative
11  I saw this movie when I was about 12 when it c...  negative
12  So im not a big fan of Boll's work but then ag...  negative
13  The cast played Shakespeare.<br /><br />Shakes...  negative
14  This a fantastic movie of three pris

In [3]:
dataset['review'] = dataset['review'].str.lower()
# print(dataset)

In [4]:
def remove_htmltag(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'',text)

dataset['review'] = dataset['review'].apply(remove_htmltag)
# print(dataset)

In [5]:
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'',text)

dataset['review'] = dataset['review'].apply(remove_url)
# print(dataset)

In [6]:
def remove_punctuation(text):
    return text.translate(str.maketrans('','', string.punctuation))

dataset['review'] = dataset['review'].apply(remove_punctuation)
print(dataset)

                                               review sentiment
0   one of the other reviewers has mentioned that ...  positive
1   a wonderful little production the filming tech...  positive
2   i thought this was a wonderful way to spend ti...  positive
3   basically theres a family where a little boy j...  negative
4   petter matteis love in the time of money is a ...  positive
5   probably my alltime favorite movie a story of ...  positive
6   i sure would like to see a resurrection of a u...  positive
7   this show was an amazing fresh  innovative ide...  negative
8   encouraged by the positive comments about this...  negative
9   if you like original gut wrenching laughter yo...  positive
10  phil the alien is one of those quirky films wh...  negative
11  i saw this movie when i was about 12 when it c...  negative
12  so im not a big fan of bolls work but then aga...  negative
13  the cast played shakespeareshakespeare losti a...  negative
14  this a fantastic movie of three pris

In [7]:
def spellchecker(text):
    new_df = TextBlob(text)
    return new_df.correct().string

dataset['review'] = dataset['review'].apply(spellchecker)
print(dataset)

                                               review sentiment
0   one of the other reviews has mentioned that af...  positive
1   a wonderful little production the filling tech...  positive
2   i thought this was a wonderful way to spend ti...  positive
3   basically there a family where a little boy ja...  negative
4   letter matters love in the time of money is a ...  positive
5   probably my alliee favorite movie a story of h...  positive
6   i sure would like to see a resurrection of a u...  positive
7   this show was an amazing fresh  innovative ide...  negative
8   encouraged by the positive comments about this...  negative
9   if you like original gut wrenching laughter yo...  positive
10  phil the alien is one of those quickly films w...  negative
11  i saw this movie when i was about 12 when it c...  negative
12  so in not a big fan of balls work but then aga...  negative
13  the cast played shakespeareshakespeare lost ap...  negative
14  this a fantastic movie of three pris

In [8]:
# stopwords.words('english')

In [9]:
def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append(" ")
        else:
            new_text.append(word)
    return ' '.join(new_text)

dataset['review'] = dataset['review'].apply(remove_stopwords)
print(dataset)

                                               review sentiment
0   one       reviews   mentioned     watching   1...  positive
1     wonderful little production   filling techni...  positive
2     thought       wonderful way   spend time    ...  positive
3   basically     family     little boy jake think...  negative
4   letter matters love     time   money     usual...  positive
5   probably   alliee favorite movie   story   hel...  positive
6     sure would like   see   resurrection       d...  positive
7     show     amazing fresh innovative idea      ...  negative
8   encouraged     positive comments     film     ...  negative
9       like original gut wrenching laughter     l...  positive
10  phil   alien   one     quickly films     humou...  negative
11    saw   movie         12     came     recall  ...  negative
12          big fan   balls work         many     ...  negative
13    cast played shakespeareshakespeare lost appr...  negative
14      fantastic movie   three prisoner

In [10]:
# def words_token(text):
#     token = spacy.load('en_core_web_sm')
#     return token(text)

# dataset['review'] = dataset['review'].apply(words_token)
# print(dataset)

In [11]:
# def stem_words(text):
#     stemmer = PorterStemmer()
#     return ' '.join([stemmer.stem(word) for word in text.split()])

# dataset['review'] = dataset['review'].apply(stem_words)
# print(dataset)

In [12]:
 def words_token(text):
    token = spacy.load('en_core_web_sm')
    return token(text)

dataset['review'] = dataset['review'].apply(words_token)
print(dataset)

                                               review sentiment
0   (one,       , reviews,   , mentioned,     , wa...  positive
1   (  , wonderful, little, production,   , fillin...  positive
2   (  , thought,       , wonderful, way,   , spen...  positive
3   (basically,     , family,     , little, boy, j...  negative
4   (letter, matters, love,     , time,   , money,...  positive
5   (probably,   , alliee, favorite, movie,   , st...  positive
6   (  , sure, would, like,   , see,   , resurrect...  positive
7   (  , show,     , amazing, fresh, innovative, i...  negative
8   (encouraged,     , positive, comments,     , f...  negative
9   (    , like, original, gut, wrenching, laughte...  positive
10  (phil,   , alien,   , one,     , quickly, film...  negative
11  (  , saw,   , movie,         , 12,     , came,...  negative
12  (        , big, fan,   , balls, work,         ...  negative
13  (  , cast, played, shakespeareshakespeare, los...  negative
14  (    , fantastic, movie,   , three, 

In [14]:
nlp = spacy.load("en_core_web_sm")

def lemma(text):
    doc = nlp(text)
    lemmatized_tokens = [] 
    for token in doc:
        if token.pos_ == "VERB":
            lemmatized_tokens.append(token.lemma_)
        else:
            lemmatized_tokens.append(token.text)
    return " ".join(lemmatized_tokens)

dataset['review'] = dataset['review'].apply(lemma)
print(dataset)
    

                                               review sentiment
0   one         reviews     mention       watch   ...  positive
1       wonderful little production     filling te...  positive
2       think         wonderful way     spend time...  positive
3   basically       family       little boy jake t...  negative
4   letter matters love       time     money      ...  positive
5   probably     alliee favorite movie     story  ...  positive
6       sure would like     see     resurrection  ...  positive
7       show       amazing fresh innovative idea  ...  negative
8   encourage       positive comments       film  ...  negative
9         like original gut wrench laughter       ...  positive
10  phil     alien     one       quickly films    ...  negative
11      see     movie           12       come     ...  negative
12            big fan     balls work           man...  negative
13      cast play shakespeareshakespeare lose appr...  negative
14        fantastic movie     three pris