In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords
from nltk.tokenize import word_tokenize,sent_tokenize
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.metrics import classification_report,accuracy_score
import re
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/devjoao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
df = pd.read_csv('./data/IMDB-Review.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
df.shape

(50000, 2)

In [4]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,negative
freq,5,25000


In [5]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [6]:
df['sentiment'].value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

In [7]:
#Selecting Sample
X = df['review'][:2000]
y = df['sentiment'][:2000]

In [8]:
# Splitting the sample into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.333, random_state = 0)

In [9]:
X_train.shape

(1334,)

In [10]:
#Tokenization
tk = ToktokTokenizer()

#English stopwords
english_stops = set(stopwords.words('english'))

In [13]:
#Text Cleaning
def strip_html(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

def remove_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

def clean_character(text):
    pattern = r'[^a-zA-z0-9\s]'
    return re.sub(pattern,'',text)

def clean_content(text):
    text = strip_html(text)
    text = remove_square_brackets(text)
    text = clean_character(text)
    return text

X_train = X_train.apply(clean_content)
X_test = X_test.apply(clean_content)
X_train

717     I want very much to believe that the above quo...
1649    This is an interesting movie I think its very ...
1079    I have to say the first I watched this film wa...
1983    I saw this with few expectations and absolutel...
542     The first Cube movie was an art movie It set u...
                              ...                        
835     The story has been told before A deadly diseas...
1216    Mickey Rourke hunts Diane Lane in Elmore Leona...
1653    Yeah that about sums it up This movie was horr...
559     So I rented this from Netflix because somebody...
684     The perfect murder is foiled when a wifeplayed...
Name: review, Length: 1334, dtype: object

In [14]:
#Stemming
def stemmer(text):
    port = PorterStemmer()
    text = ' '.join([port.stem(word) for word in text.split()])
    return text

#Apply function on review column
X_train = X_train.apply(stemmer)
X_test = X_test.apply(stemmer)
X_train

717     I want veri much to believ that the abov quot ...
1649    thi is an interest movi I think it veri humor ...
1079    I have to say the first I watch thi film wa ab...
1983    I saw thi with few expect and absolut love it ...
542     the first cube movi wa an art movi It set up a...
                              ...                        
835     the stori ha been told befor A deadli diseas i...
1216    mickey rourk hunt dian lane in elmor leonard k...
1653    yeah that about sum it up thi movi wa horrifi ...
559     So I rent thi from netflix becaus somebodi gav...
684     the perfect murder is foil when a wifeplay by ...
Name: review, Length: 1334, dtype: object

In [15]:
#Stopwords with Gensim lib
def remove_with_gensim(text):
    return remove_stopwords(text)

X_train = X_train.apply(remove_with_gensim)
X_test = X_test.apply(remove_with_gensim)

X_train

717     I want veri believ abov quot specif english su...
1649    thi movi I think veri humor humor veri black f...
1079    I I watch thi film wa 6 year ago I actual enjo...
1983    I saw thi expect absolut love bend like beckha...
542     cube movi wa art movi It set world major arche...
                              ...                        
835     stori ha told befor A deadli diseas spread ext...
1216    mickey rourk hunt dian lane elmor leonard kill...
1653    yeah sum thi movi wa horrifi minut I want goug...
559     So I rent thi netflix becaus somebodi gave rog...
684     perfect murder foil wifeplay mari ellen traino...
Name: review, Length: 1334, dtype: object

In [16]:
#Tfidf vectorizer
tf=TfidfVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(1,3))

#transformed train reviews
train_reviews=tf.fit_transform(X_train)

#transformed test reviews
test_reviews=tf.transform(X_test)

print('Tfidf_train:',train_reviews.shape)
print('Tfidf_test:',test_reviews.shape)

Tfidf_train: (1334, 260479)
Tfidf_test: (666, 260479)


In [17]:
#Logistc Regressor
regressor = LogisticRegression(penalty='l2',max_iter=400,C=1,random_state=42)

#Fitting the model for TF
regressor_bow = regressor.fit(train_reviews, y_train)
print(regressor_bow)
#Fitting the model for tfidf features
lr_tfidf = regressor.fit(train_reviews,y_train)
print(lr_tfidf)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=400, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=400, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)




In [18]:
#Predicting the model for TF
predict = regressor.predict(test_reviews)
print(predict)

['positive' 'positive' 'positive' 'positive' 'positive' 'positive'
 'positive' 'positive' 'positive' 'positive' 'positive' 'negative'
 'positive' 'positive' 'negative' 'negative' 'negative' 'positive'
 'positive' 'positive' 'positive' 'positive' 'negative' 'positive'
 'negative' 'positive' 'positive' 'positive' 'negative' 'positive'
 'positive' 'positive' 'positive' 'positive' 'positive' 'positive'
 'positive' 'positive' 'positive' 'positive' 'positive' 'positive'
 'positive' 'positive' 'positive' 'positive' 'negative' 'positive'
 'positive' 'positive' 'negative' 'positive' 'positive' 'positive'
 'positive' 'positive' 'positive' 'positive' 'positive' 'positive'
 'positive' 'positive' 'positive' 'negative' 'positive' 'negative'
 'positive' 'positive' 'negative' 'positive' 'negative' 'negative'
 'positive' 'positive' 'positive' 'negative' 'negative' 'positive'
 'positive' 'positive' 'positive' 'negative' 'positive' 'positive'
 'positive' 'negative' 'negative' 'positive' 'positive' 'posit

In [19]:
#Accuracy score for TF
score = accuracy_score(y_test,predict)
print("lr_tfidf_score :",score)

lr_tfidf_score : 0.6216216216216216


In [20]:
#Summary Report
report = classification_report(y_test, predict, target_names=['Positive','Negative'])
print(report)

              precision    recall  f1-score   support

    Positive       0.79      0.35      0.48       339
    Negative       0.57      0.91      0.70       327

   micro avg       0.62      0.62      0.62       666
   macro avg       0.68      0.63      0.59       666
weighted avg       0.68      0.62      0.59       666

