In [1]:
import numpy as np
import pandas as pd

In [2]:
temp_df = pd.read_csv('IMDB Dataset.csv')

In [3]:
df = temp_df.iloc[:10000]

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
df.shape

(10000, 2)

In [6]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [7]:
df.duplicated().sum()

17

In [8]:
df.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [9]:
df.duplicated().sum()

0

In [10]:
import re
def remove_tags(raw_text):
    cleaned_text = re.sub(re.compile('<.*?>'), '', raw_text)
    return cleaned_text

In [11]:
df['review'] = df['review'].apply(remove_tags)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [12]:
df['review'] = df['review'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [13]:
from nltk.corpus import stopwords

sw_list = stopwords.words('english')

df['review'] = df['review'].apply(lambda x: [item for item in x.split() if item not in sw_list]).apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [14]:
df['review']

0       one reviewers mentioned watching 1 oz episode ...
1       wonderful little production. filming technique...
2       thought wonderful way spend time hot summer we...
3       basically there's family little boy (jake) thi...
4       petter mattei's "love time money" visually stu...
                              ...                        
9995    fun, entertaining movie wwii german spy (julie...
9996    give break. anyone say "good hockey movie"? kn...
9997    movie bad movie. watching endless series bad h...
9998    movie probably made entertain middle school, e...
9999    smashing film film-making. shows intense stran...
Name: review, Length: 9983, dtype: object

In [15]:
import gensim

In [16]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [17]:
story = []
for doc in df['review']:
    raw_sent = sent_tokenize(doc)
    for sent in raw_sent:
        story.append(simple_preprocess(sent))

In [19]:
story[0]

['one', 'reviewers', 'mentioned', 'watching', 'oz', 'episode', 'hooked']

In [20]:
model = gensim.models.Word2Vec(window=10, min_count=2)

In [21]:
model.build_vocab(story)

In [22]:
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(5876696, 6212140)

In [23]:
len(model.wv.index_to_key)

31845

In [24]:
def document_vector(doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc.split() if word in model.wv.index_to_key]
    return np.mean(model.wv[doc], axis=0)

In [25]:
document_vector(df['review'].values[0])

array([-0.18494679,  0.43699002,  0.16048932,  0.24631253, -0.1422833 ,
       -0.6068399 ,  0.15761475,  0.906625  , -0.35256127, -0.2788689 ,
       -0.26252237, -0.5343362 ,  0.07811375,  0.11002094,  0.1740068 ,
       -0.13420074,  0.04474302, -0.32989848, -0.07852121, -0.6450878 ,
        0.06487583,  0.22969228,  0.15065111, -0.27424446, -0.32537818,
        0.03500761, -0.37256065,  0.0082764 , -0.35290253,  0.03250956,
        0.36262596,  0.02741224,  0.20604213, -0.29743782, -0.15854165,
        0.42732537,  0.04179445, -0.3938981 , -0.27436242, -0.7618269 ,
        0.10224623, -0.2800447 ,  0.05052528, -0.10727569,  0.5051041 ,
       -0.14537843, -0.26892328, -0.07993755,  0.08967319,  0.3761036 ,
        0.07436372, -0.37045392, -0.40847817, -0.08246459, -0.1072536 ,
        0.23057765,  0.23668619,  0.05327034, -0.28624958,  0.13105156,
        0.08968044,  0.04974905,  0.03300546, -0.07305313, -0.43777263,
        0.29062918,  0.02727411,  0.08017921, -0.31605732,  0.31

In [26]:
from tqdm import tqdm

In [27]:
X = []
for doc in tqdm(df['review'].values):
    X.append(document_vector(doc))

100%|██████████████████████████████████████████████████████████████████████████████| 9983/9983 [20:12<00:00,  8.23it/s]


In [30]:
X[0].shape

(100,)

In [32]:
X = np.array(X)

In [33]:
X[0]

array([-0.18494679,  0.43699002,  0.16048932,  0.24631253, -0.1422833 ,
       -0.6068399 ,  0.15761475,  0.906625  , -0.35256127, -0.2788689 ,
       -0.26252237, -0.5343362 ,  0.07811375,  0.11002094,  0.1740068 ,
       -0.13420074,  0.04474302, -0.32989848, -0.07852121, -0.6450878 ,
        0.06487583,  0.22969228,  0.15065111, -0.27424446, -0.32537818,
        0.03500761, -0.37256065,  0.0082764 , -0.35290253,  0.03250956,
        0.36262596,  0.02741224,  0.20604213, -0.29743782, -0.15854165,
        0.42732537,  0.04179445, -0.3938981 , -0.27436242, -0.7618269 ,
        0.10224623, -0.2800447 ,  0.05052528, -0.10727569,  0.5051041 ,
       -0.14537843, -0.26892328, -0.07993755,  0.08967319,  0.3761036 ,
        0.07436372, -0.37045392, -0.40847817, -0.08246459, -0.1072536 ,
        0.23057765,  0.23668619,  0.05327034, -0.28624958,  0.13105156,
        0.08968044,  0.04974905,  0.03300546, -0.07305313, -0.43777263,
        0.29062918,  0.02727411,  0.08017921, -0.31605732,  0.31

In [34]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

y = encoder.fit_transform(df['sentiment'])

In [35]:
y

array([1, 1, 1, ..., 0, 0, 1])

In [36]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [39]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.7651477215823735
              precision    recall  f1-score   support

           0       0.75      0.76      0.76       952
           1       0.78      0.77      0.77      1045

    accuracy                           0.77      1997
   macro avg       0.76      0.76      0.76      1997
weighted avg       0.77      0.77      0.77      1997

