In [1]:
import numpy as np
import pandas as pd

In [2]:
temp_df = pd.read_csv('IMDB Dataset.csv')

In [3]:
df = temp_df.iloc[:1000]

In [4]:
df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


## Taking the basic underStanding of Data

In [6]:
df['sentiment'].value_counts()

sentiment
positive    501
negative    499
Name: count, dtype: int64

In [9]:
import seaborn as sns

In [None]:
sns.distplot(df['sentiment'])

In [7]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [8]:
df.duplicated().sum()

0

## Initiate the Preprocessing

In [10]:
# Lowercasing
# remove tags
# removing stopwords

In [12]:
df['review'] = df['review'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review'] = df['review'].apply(lambda x:x.lower())


In [14]:
import re

In [20]:
def remove_text(text):
    cleaned_text = re.sub(re.compile('<.*?>'), '', text)
    return cleaned_text

In [22]:
df['review'] = df['review'].apply(remove_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review'] = df['review'].apply(remove_text)


In [23]:
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
995,nothing is sacred. just ask ernie fosselius. t...,positive
996,i hated it. i hate self-aware pretentious inan...,negative
997,i usually try to be professional and construct...,negative
998,if you like me is going to see this in a film ...,negative


In [24]:
from nltk.corpus import stopwords

In [26]:
sw_list = stopwords.words('English')

In [28]:
df['review'] = df['review'].apply(lambda x : [item for item in x.split() if item not in sw_list]).apply(lambda x :" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review'] = df['review'].apply(lambda x : [item for item in x.split() if item not in sw_list]).apply(lambda x :" ".join(x))


In [29]:
df['review']

0      one reviewers mentioned watching 1 oz episode ...
1      wonderful little production. filming technique...
2      thought wonderful way spend time hot summer we...
3      basically there's family little boy (jake) thi...
4      petter mattei's "love time money" visually stu...
                             ...                        
995    nothing sacred. ask ernie fosselius. days, eve...
996    hated it. hate self-aware pretentious inanity ...
997    usually try professional constructive criticiz...
998    like going see film history class something li...
999    like zoology textbook, given depiction animals...
Name: review, Length: 1000, dtype: object

## Now the review datas are preprocessed now we need to move towards for Text vectorizations

In [31]:
X = df.drop('sentiment',axis=1)
y = df['sentiment']

In [32]:
X

Unnamed: 0,review
0,one reviewers mentioned watching 1 oz episode ...
1,wonderful little production. filming technique...
2,thought wonderful way spend time hot summer we...
3,basically there's family little boy (jake) thi...
4,"petter mattei's ""love time money"" visually stu..."
...,...
995,"nothing sacred. ask ernie fosselius. days, eve..."
996,hated it. hate self-aware pretentious inanity ...
997,usually try professional constructive criticiz...
998,like going see film history class something li...


In [33]:
y

0      positive
1      positive
2      positive
3      negative
4      positive
         ...   
995    positive
996    negative
997    negative
998    negative
999    negative
Name: sentiment, Length: 1000, dtype: object

In [34]:
from sklearn.preprocessing import LabelEncoder

In [35]:
enc = LabelEncoder()

In [36]:
y = enc.fit_transform(y)

In [38]:
y.shape

(1000,)

In [39]:
from sklearn.model_selection import train_test_split

In [40]:
X_train, x_test, Y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=20)

In [41]:
from sklearn.feature_extraction.text import CountVectorizer

In [42]:
cv = CountVectorizer()

In [43]:
transformed_X_train = cv.fit_transform(X_train['review']).toarray()

In [45]:
transformed_x_test = cv.transform(x_test['review']).toarray()

In [46]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

In [47]:
gnb.fit(transformed_X_train,Y_train)

In [49]:
y_pred = gnb.predict(transformed_x_test)

from sklearn.metrics import accuracy_score,confusion_matrix
accuracy_score(y_test,y_pred)

0.61

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

rf.fit(transformed_X_train,Y_train)
y_pred = rf.predict(transformed_x_test)
accuracy_score(y_test,y_pred)