####  Import Required libraries

In [1]:
import pandas as pd
import numpy as np 
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
import nltk
import string
import spacy

#### Loading a Dataset 

In [2]:
dataset = pd.read_csv("C:/Users/Admin/NLP_Project/Movies_review_dataset/IMDB Dataset.csv")
dataset = dataset.iloc[:50]
print(dataset)

                                               review sentiment
0   One of the other reviewers has mentioned that ...  positive
1   A wonderful little production. <br /><br />The...  positive
2   I thought this was a wonderful way to spend ti...  positive
3   Basically there's a family where a little boy ...  negative
4   Petter Mattei's "Love in the Time of Money" is...  positive
5   Probably my all-time favorite movie, a story o...  positive
6   I sure would like to see a resurrection of a u...  positive
7   This show was an amazing, fresh & innovative i...  negative
8   Encouraged by the positive comments about this...  negative
9   If you like original gut wrenching laughter yo...  positive
10  Phil the Alien is one of those quirky films wh...  negative
11  I saw this movie when I was about 12 when it c...  negative
12  So im not a big fan of Boll's work but then ag...  negative
13  The cast played Shakespeare.<br /><br />Shakes...  negative
14  This a fantastic movie of three pris

#### Convert Uper case into lower case of dataset

In [3]:
dataset['review'] = dataset['review'].str.lower()
# print(dataset)

#### Remove HTML tags from text data ( Review column )

In [4]:
def remove_htmltag(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'',text)

dataset['review'] = dataset['review'].apply(remove_htmltag)
# print(dataset)

#### Remove URLS or URLS tags from text data ( Review column )

In [5]:
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'',text)

dataset['review'] = dataset['review'].apply(remove_url)
# print(dataset)

#### Remove Puntuation from text data ( Review column ) 

In [6]:
def remove_punctuation(text):
    return text.translate(str.maketrans('','', string.punctuation))

dataset['review'] = dataset['review'].apply(remove_punctuation)
print(dataset)

                                               review sentiment
0   one of the other reviewers has mentioned that ...  positive
1   a wonderful little production the filming tech...  positive
2   i thought this was a wonderful way to spend ti...  positive
3   basically theres a family where a little boy j...  negative
4   petter matteis love in the time of money is a ...  positive
5   probably my alltime favorite movie a story of ...  positive
6   i sure would like to see a resurrection of a u...  positive
7   this show was an amazing fresh  innovative ide...  negative
8   encouraged by the positive comments about this...  negative
9   if you like original gut wrenching laughter yo...  positive
10  phil the alien is one of those quirky films wh...  negative
11  i saw this movie when i was about 12 when it c...  negative
12  so im not a big fan of bolls work but then aga...  negative
13  the cast played shakespeareshakespeare losti a...  negative
14  this a fantastic movie of three pris

In [7]:
def spellchecker(text):
    new_df = TextBlob(text)
    return new_df.correct().string

dataset['review'] = dataset['review'].apply(spellchecker)
print(dataset)

                                               review sentiment
0   one of the other reviews has mentioned that af...  positive
1   a wonderful little production the filling tech...  positive
2   i thought this was a wonderful way to spend ti...  positive
3   basically there a family where a little boy ja...  negative
4   letter matters love in the time of money is a ...  positive
5   probably my alliee favorite movie a story of h...  positive
6   i sure would like to see a resurrection of a u...  positive
7   this show was an amazing fresh  innovative ide...  negative
8   encouraged by the positive comments about this...  negative
9   if you like original gut wrenching laughter yo...  positive
10  phil the alien is one of those quickly films w...  negative
11  i saw this movie when i was about 12 when it c...  negative
12  so in not a big fan of balls work but then aga...  negative
13  the cast played shakespeareshakespeare lost ap...  negative
14  this a fantastic movie of three pris

In [8]:
# stopwords.words('english')

In [9]:
def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append(" ")
        else:
            new_text.append(word)
    return ' '.join(new_text)

dataset['review'] = dataset['review'].apply(remove_stopwords)
print(dataset)

                                               review sentiment
0   one       reviews   mentioned     watching   1...  positive
1     wonderful little production   filling techni...  positive
2     thought       wonderful way   spend time    ...  positive
3   basically     family     little boy jake think...  negative
4   letter matters love     time   money     usual...  positive
5   probably   alliee favorite movie   story   hel...  positive
6     sure would like   see   resurrection       d...  positive
7     show     amazing fresh innovative idea      ...  negative
8   encouraged     positive comments     film     ...  negative
9       like original gut wrenching laughter     l...  positive
10  phil   alien   one     quickly films     humou...  negative
11    saw   movie         12     came     recall  ...  negative
12          big fan   balls work         many     ...  negative
13    cast played shakespeareshakespeare lost appr...  negative
14      fantastic movie   three prisoner

In [10]:
# def words_token(text):
#     token = spacy.load('en_core_web_sm')
#     return token(text)

# dataset['review'] = dataset['review'].apply(words_token)
# print(dataset)

In [11]:
def stem_words(text):
    stemmer = PorterStemmer()
    return ' '.join([stemmer.stem(word) for word in text.split()])

dataset['review'] = dataset['review'].apply(stem_words)
print(dataset)

                                               review sentiment
0   one review mention watch 1 oz episod hook righ...  positive
1   wonder littl product fill techniqu assum oldti...  positive
2   thought wonder way spend time hot summer weeke...  positive
3   basic famili littl boy jake think combin close...  negative
4   letter matter love time money usual stun film ...  positive
5   probabl allie favorit movi stori helpless sacr...  positive
6   sure would like see resurrect date seahunt ser...  positive
7   show amaz fresh innov idea first tire first 7 ...  negative
8   encourag posit comment film look forward watch...  negative
9   like origin gut wrench laughter like movi youn...  positive
10  phil alien one quickli film humour base around...  negative
11  saw movi 12 came recal cari scene big bird eat...  negative
12  big fan ball work mani enjoy movi postal mayb ...  negative
13  cast play shakespeareshakespear lost appreci t...  negative
14  fantast movi three prison becom famo

In [12]:
def words_token(text):
    token = spacy.load('en_core_web_sm')
    return token(text)

dataset['review'] = dataset['review'].apply(words_token)
print(dataset)

                                               review sentiment
0   (one, review, mention, watch, 1, oz, episod, h...  positive
1   (wonder, littl, product, fill, techniqu, assum...  positive
2   (thought, wonder, way, spend, time, hot, summe...  positive
3   (basic, famili, littl, boy, jake, think, combi...  negative
4   (letter, matter, love, time, money, usual, stu...  positive
5   (probabl, allie, favorit, movi, stori, helples...  positive
6   (sure, would, like, see, resurrect, date, seah...  positive
7   (show, amaz, fresh, innov, idea, first, tire, ...  negative
8   (encourag, posit, comment, film, look, forward...  negative
9   (like, origin, gut, wrench, laughter, like, mo...  positive
10  (phil, alien, one, quickli, film, humour, base...  negative
11  (saw, movi, 12, came, recal, cari, scene, big,...  negative
12  (big, fan, ball, work, mani, enjoy, movi, post...  negative
13  (cast, play, shakespeareshakespear, lost, appr...  negative
14  (fantast, movi, three, prison, becom

In [13]:
nlp = spacy.load("en_core_web_sm")

def lemma(text):
    doc = nlp(text)
    lemmatized_tokens = [] 
    for token in doc:
        if token.pos_ == "VERB":
            lemmatized_tokens.append(token.lemma_)
        else:
            lemmatized_tokens.append(token.text)
    return " ".join(lemmatized_tokens)

dataset['review'] = dataset['review'].apply(lemma)
print(dataset)
    

                                               review sentiment
0   one review mention watch 1 oz episod hook righ...  positive
1   wonder littl product fill techniqu assum oldti...  positive
2   think wonder way spend time hot summer weekend...  positive
3   basic famili littl boy jake think combin close...  negative
4   letter matter love time money usual stun film ...  positive
5   probabl allie favorit movi stori helpless sacr...  positive
6   sure would like see resurrect date seahunt ser...  positive
7   show amaz fresh innov idea first tire first 7 ...  negative
8   encourag posit comment film look forward watch...  negative
9   like origin gut wrench laughter like movi youn...  positive
10  phil alien one quickli film humour base around...  negative
11  see movi 12 come recal cari scene big bird eat...  negative
12  big fan ball work mani enjoy movi postal mayb ...  negative
13  cast play shakespeareshakespear lose appreci t...  negative
14  fantast movi three prison becom famo

In [14]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()

In [15]:
enc = enc.fit_transform(dataset[['review']]).toarray()
print(enc)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]]


In [16]:
# from sklearn.feature_extraction.text import CountVectorizer
# cv = CountVectorizer(ngram_range=(1,3))

In [17]:
# new = cv.fit_transform(dataset['review'])
# print(new)

In [18]:
# print(cv.vocabulary_)

In [19]:
# print(new[0:].toarray())

In [20]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# tfidf = TfidfVectorizer()
# tfidf.fit_transform(dataset['review']).toarray()

In [21]:
# print(tfidf.idf_)
# print(tfidf.get_feature_names_out())