### 1.Load Dataset

In [194]:
import pandas as pd

In [195]:
df = pd.read_csv('IMDB Dataset.csv')

In [196]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [197]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### 2.Text Preprocessing

In [198]:
# convert positive to 1 and negative to 0
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()
df_num = ordinal_encoder.fit_transform(df[['sentiment']])
df['num_sentiment'] = df_num

In [199]:
df.drop(['sentiment'], axis=1, inplace=True)

In [200]:
df.head()

Unnamed: 0,review,num_sentiment
0,One of the other reviewers has mentioned that ...,1.0
1,A wonderful little production. <br /><br />The...,1.0
2,I thought this was a wonderful way to spend ti...,1.0
3,Basically there's a family where a little boy ...,0.0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1.0


In [201]:
# Remove html coding
df['review'] = df['review'].str.replace('<.*?>','')

In [202]:
# Remove all puncuation and symbols
df['review'] = df['review'].str.replace('[^\w\s]','')

In [203]:
# Make everything lower case
df['review'] = df['review'].str.lower()

#### 2.1 Data without Removing Stopwords

In [204]:
df_wsw = df.copy()
df_wosw = df.copy()

In [205]:
# Tokenize
import nltk
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

In [206]:
def lemmatize_text(text):
    
    return ' '.join([lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)])

In [207]:
df_wosw['lemma_review'] = df_wosw.review.apply(lemmatize_text)

In [208]:
df_wosw.head()

Unnamed: 0,review,num_sentiment,lemma_review
0,one of the other reviewers has mentioned that ...,1.0,one of the other reviewer ha mentioned that af...
1,a wonderful little production the filming tech...,1.0,a wonderful little production the filming tech...
2,i thought this was a wonderful way to spend ti...,1.0,i thought this wa a wonderful way to spend tim...
3,basically theres a family where a little boy j...,0.0,basically there a family where a little boy ja...
4,petter matteis love in the time of money is a ...,1.0,petter matteis love in the time of money is a ...


#### 2.2 Data with Removing Stopwords

In [209]:
# Remove stop words
import spacy

sp = spacy.load('en_core_web_sm')
all_stopwords = sp.Defaults.stop_words
# # After seeing the word counts, update stop words
sp.Defaults.stop_words |= {'movie', 'film', 'like'}

In [210]:
df_wsw['review'] = df_wsw['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (all_stopwords)]))

In [211]:
df_wsw['lemma_review'] = df_wsw.review.apply(lemmatize_text)

In [212]:
df_wsw.head()

Unnamed: 0,review,num_sentiment,lemma_review
0,reviewers mentioned watching 1 oz episode youl...,1.0,reviewer mentioned watching 1 oz episode youll...
1,wonderful little production filming technique ...,1.0,wonderful little production filming technique ...
2,thought wonderful way spend time hot summer we...,1.0,thought wonderful way spend time hot summer we...
3,basically theres family little boy jake thinks...,0.0,basically there family little boy jake think t...
4,petter matteis love time money visually stunni...,1.0,petter matteis love time money visually stunni...


### 3.Split Dataset

In [213]:
# keep 50% for the training set and 25% both for the validation and the test set.
from sklearn.model_selection import train_test_split

features = df_wsw.drop(['num_sentiment', 'review'], axis=1)
label = df_wsw['num_sentiment']

X_train, X_test, y_train, y_test = train_test_split(features, label, test_size = 0.50, random_state=42)

X_val, X_test, y_val, y_test = train_test_split(X_test,y_test,test_size = 0.5, random_state=42)


print("Data distribution:\n- Train: {} \n- Validation: {} \n- Test: {}".format(len(X_train),len(X_val),len(X_test)))

Data distribution:
- Train: 25000 
- Validation: 12500 
- Test: 12500


### 4.Create Features using TF-IDF

In [214]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [215]:
#Tfidf vectorizer
tv=TfidfVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(1,3))

#transformed train reviews
tv_train_reviews=tv.fit_transform(X_train['lemma_review'])

#transformed val,test reviews
tv_val_reviews=tv.transform(X_val['lemma_review'])
tv_test_reviews=tv.transform(X_test['lemma_review'])

print('Tfidf_train:',tv_train_reviews.shape)
print('Tfidf_test:',tv_val_reviews.shape)
print('Tfidf_test:',tv_test_reviews.shape)

Tfidf_train: (25000, 3869140)
Tfidf_test: (12500, 3869140)
Tfidf_test: (12500, 3869140)
