### 1.Load Dataset

In [1]:
import pandas as pd

In [60]:
df = pd.read_csv('IMDB Dataset.csv')

### 2.Text Preprocessing

In [61]:
# convert positive to 1 and negative to 0
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()
df_num = ordinal_encoder.fit_transform(df[['sentiment']])
df['num_sentiment'] = df_num

In [62]:
df.drop(['sentiment'], axis=1, inplace=True)

In [63]:
# Remove html coding
df['review'] = df['review'].str.replace('<.*?>','')

In [64]:
# Remove all puncuation and symbols
df['review'] = df['review'].str.replace('[^\w\s]','')

In [65]:
# Make everything lower case
df['review'] = df['review'].str.lower()

#### 2.1 Data with Stopwords

In [66]:
df_wsw = df.copy()
df_wosw = df.copy()

In [67]:
#Tokenize
import nltk

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

In [68]:
def lemmatize_text(text):
    
    return ' '.join([lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)])

In [69]:
df_wsw['lemma_review'] = df_wsw.review.apply(lemmatize_text)

In [70]:
#Save as csv file
df_wsw.to_csv('df_wsw.csv', index=False)

In [2]:
#Load dataset
df_wsw = pd.read_csv('df_wsw.csv')

In [3]:
df_wsw.head()

Unnamed: 0,review,num_sentiment,lemma_review
0,one of the other reviewers has mentioned that ...,1.0,one of the other reviewer ha mentioned that af...
1,a wonderful little production the filming tech...,1.0,a wonderful little production the filming tech...
2,i thought this was a wonderful way to spend ti...,1.0,i thought this wa a wonderful way to spend tim...
3,basically theres a family where a little boy j...,0.0,basically there a family where a little boy ja...
4,petter matteis love in the time of money is a ...,1.0,petter matteis love in the time of money is a ...


#### 2.2 Data without Stopwords

In [75]:
#Remove stop words
import spacy

sp = spacy.load('en_core_web_sm')
all_stopwords = sp.Defaults.stop_words
## After seeing the word counts, update stop words
sp.Defaults.stop_words |= {'movie', 'film', 'like'}

In [76]:
df_wosw['review'] = df_wosw['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (all_stopwords)]))

In [77]:
df_wosw['lemma_review'] = df_wosw.review.apply(lemmatize_text)

In [78]:
#Save as csv file
df_wosw.to_csv('df_wosw.csv', index=False)

In [4]:
#Load dataset
df_wosw = pd.read_csv('df_wosw.csv')

In [5]:
df_wosw.head()

Unnamed: 0,review,num_sentiment,lemma_review
0,reviewers mentioned watching 1 oz episode youl...,1.0,reviewer mentioned watching 1 oz episode youll...
1,wonderful little production filming technique ...,1.0,wonderful little production filming technique ...
2,thought wonderful way spend time hot summer we...,1.0,thought wonderful way spend time hot summer we...
3,basically theres family little boy jake thinks...,0.0,basically there family little boy jake think t...
4,petter matteis love time money visually stunni...,1.0,petter matteis love time money visually stunni...


### 2.Build Model with dataset with stopwords using Sentiment Score

#### 2.1 Create Sentiment Score

In [81]:
#Create a function to get subjectivity and polarity
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def Subjectivity(text):
    
    return TextBlob(text).sentiment.subjectivity

def Polarity(text):
    
    return TextBlob(text).sentiment.polarity

In [82]:
#Create two columns 'Subjectivity' and 'Polarity'
df_wsw['Subjectivity'] = df_wsw['review'].apply(Subjectivity)
df_wsw['Polarity'] = df_wsw['review'].apply(Polarity)

In [85]:
#Create a function to get sentiment scores
def SIA(text):
    
    sia = SentimentIntensityAnalyzer()
    sentiment = sia.polarity_scores(text)
    
    return sentiment

In [87]:
#Get the sentiment scores for each review
compound = []
neg = []
pos = []
neu = []
Sentiment = 0

for i in range(0, len(df_wsw['review'])):
    
    Sentiment = SIA(df_wsw['review'][i])

    compound.append(Sentiment['compound']) 
    neg.append(Sentiment['neg'])
    pos.append(Sentiment['pos'])
    neu.append(Sentiment['neu'])

In [88]:
#Store the sentiment scores in the merge data set
df_wsw['Compound'] = compound
df_wsw['Negative'] = neg
df_wsw['Neutral'] = neu
df_wsw['Positive'] = pos

In [90]:
#Save as csv file
df_wsw.to_csv('IMDB Dataset with Sentiment Score and stopwords.csv', index=False)

In [91]:
#Load dataset with sentiment score
df_wsw_sentiment = pd.read_csv('IMDB Dataset with Sentiment Score and stopwords.csv')

In [92]:
df_wsw_sentiment.head()

Unnamed: 0,review,num_sentiment,lemma_review,Subjectivity,Polarity,Compound,Negative,Neutral,Positive
0,one of the other reviewers has mentioned that ...,1.0,one of the other reviewer ha mentioned that af...,0.490369,0.023433,-0.9916,0.182,0.752,0.066
1,a wonderful little production the filming tech...,1.0,a wonderful little production the filming tech...,0.559343,0.11149,0.9657,0.053,0.766,0.18
2,i thought this was a wonderful way to spend ti...,1.0,i thought this wa a wonderful way to spend tim...,0.640769,0.346324,0.9579,0.116,0.675,0.21
3,basically theres a family where a little boy j...,0.0,basically there a family where a little boy ja...,0.454167,-0.060937,-0.9117,0.129,0.808,0.063
4,petter matteis love in the time of money is a ...,1.0,petter matteis love in the time of money is a ...,0.452916,0.217952,0.9744,0.052,0.8,0.148


#### 2.2 Split Dataset

In [124]:
from sklearn.model_selection import train_test_split

features = df_wsw_sentiment.drop(['num_sentiment', 'review', 'lemma_review'], axis=1)
label = df_wsw_sentiment['num_sentiment']

X_train, X_test, y_train, y_test = train_test_split(features, label, test_size = 0.50, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test,y_test,test_size = 0.5, random_state=42)


print("Data distribution:\n- Train: {} \n- Validation: {} \n- Test: {}".format(len(X_train),len(X_val),len(X_test)))

Data distribution:
- Train: 25000 
- Validation: 12500 
- Test: 12500


#### 2.3 Build Model

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [95]:
# Build the base model
rf = RandomForestClassifier(random_state=42)
scores = cross_val_score(rf, X_train, y_train.values.ravel(), cv=5)

In [96]:
print(scores)
print(scores.mean())

[0.772  0.767  0.7704 0.7688 0.7658]


0.7688

In [6]:
# define a function to print results
def print_results(results):
    
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    means = results.cv_results_['mean_test_score']
    
    for mean, params in zip(means, results.cv_results_['params']):
        
        print('{} for {}'.format(round(mean, 5), params))

In [7]:
# GridSearchCV to choose the best parameters        
from sklearn.model_selection import GridSearchCV

def gridsearch(x_train, y_train):
    
    rf = RandomForestClassifier(random_state=42)
    parameters = {
        
    'n_estimators': [5, 50, 100, 500, 1000, 3000],
    'max_depth': [2, 10, 50, None]
        
    }
    
    cv = GridSearchCV(rf, parameters)
    cv.fit(x_train, y_train.values.ravel())
    
    return cv

In [99]:
print_results(gridsearch(X_train, y_train))

BEST PARAMS: {'max_depth': 10, 'n_estimators': 3000}

0.74044 for {'max_depth': 2, 'n_estimators': 5}
0.76888 for {'max_depth': 2, 'n_estimators': 50}
0.77008 for {'max_depth': 2, 'n_estimators': 100}
0.77016 for {'max_depth': 2, 'n_estimators': 500}
0.76984 for {'max_depth': 2, 'n_estimators': 1000}
0.76972 for {'max_depth': 2, 'n_estimators': 3000}
0.76584 for {'max_depth': 10, 'n_estimators': 5}
0.77664 for {'max_depth': 10, 'n_estimators': 50}
0.77696 for {'max_depth': 10, 'n_estimators': 100}
0.7766 for {'max_depth': 10, 'n_estimators': 500}
0.77712 for {'max_depth': 10, 'n_estimators': 1000}
0.77732 for {'max_depth': 10, 'n_estimators': 3000}
0.74028 for {'max_depth': 50, 'n_estimators': 5}
0.76792 for {'max_depth': 50, 'n_estimators': 50}
0.7688 for {'max_depth': 50, 'n_estimators': 100}
0.77116 for {'max_depth': 50, 'n_estimators': 500}
0.77248 for {'max_depth': 50, 'n_estimators': 1000}
0.77188 for {'max_depth': 50, 'n_estimators': 3000}
0.74028 for {'max_depth': None, 'n_esti

In [125]:
# Use validation set to select the best parameters
rf1 = RandomForestClassifier(n_estimators=3000, max_depth=10)
rf1.fit(X_train, y_train.values.ravel())

rf2 = RandomForestClassifier(n_estimators=1000, max_depth=50)
rf2.fit(X_train, y_train.values.ravel())

rf3 = RandomForestClassifier(n_estimators=500, max_depth=2)
rf3.fit(X_train, y_train.values.ravel())

rf4 = RandomForestClassifier(n_estimators=1000, max_depth=None)
rf4.fit(X_train, y_train.values.ravel())

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [8]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

def performance(rf1, rf2, rf3, rf4, X_val):
    
    for mdl in [rf1,rf2,rf3, rf4]:
        
        y_pred = mdl.predict(X_val)
        
        accuracy = round(accuracy_score(y_val,y_pred), 3)
        precision = round(precision_score(y_val,y_pred), 3)
        recall = round(recall_score(y_val,y_pred), 3)
        f1 = round(f1_score(y_val,y_pred), 3)
    
        print('MAX DEPTH: {}, # OF EST: {} -- Accuracy: {}, Precision: {}, Recall: {}, F1-Score: {}'.format(mdl.max_depth,
                                                                                                            mdl.n_estimators,
                                                                                                            accuracy,
                                                                                                            precision,
                                                                                                            recall,
                                                                                                            f1))

In [127]:
performance(rf1, rf2, rf3, rf4, X_val)

MAX DEPTH: 10, # OF EST: 3000 -- Accuracy: 0.77, Precision: 0.767, Recall: 0.768, F1-Score: 0.768
MAX DEPTH: 50, # OF EST: 1000 -- Accuracy: 0.762, Precision: 0.758, Recall: 0.76, F1-Score: 0.759
MAX DEPTH: 2, # OF EST: 500 -- Accuracy: 0.767, Precision: 0.77, Recall: 0.751, F1-Score: 0.76
MAX DEPTH: None, # OF EST: 1000 -- Accuracy: 0.762, Precision: 0.759, Recall: 0.759, F1-Score: 0.759


In [17]:
def final_performance(rf, X_test):
    
    y_pred = rf.predict(X_test)
    
    accuracy = round(accuracy_score(y_test,y_pred), 3)
    precision = round(precision_score(y_test,y_pred), 3)
    recall = round(recall_score(y_test,y_pred), 3)
    f1 = round(f1_score(y_test,y_pred), 3)
    
    print('MAX DEPTH: {}, # OF EST: {} -- Accuracy: {}, Precison: {}, Recall: {}, F1-Score: {}'.format(rf.max_depth,
                                                                                                       rf.n_estimators,
                                                                                                       accuracy,
                                                                                                       precision,
                                                                                                       recall,
                                                                                                       f1))

In [129]:
final_performance(rf1, X_test)

MAX DEPTH: 10, # OF EST: 3000 -- Accuracy: 0.774, Precison: 0.778, Recall: 0.778, F1-Score: 0.778


### 3.Build Model with dataset without stopwords using Sentiment Score

#### 3.1 Create Sentiment Score

In [109]:
#Create two columns 'Subjectivity' and 'Polarity'
df_wosw['Subjectivity'] = df_wosw['review'].apply(Subjectivity)
df_wosw['Polarity'] = df_wosw['review'].apply(Polarity)

In [110]:
#Get the sentiment scores for each review
compound = []
neg = []
pos = []
neu = []
Sentiment = 0

for i in range(0, len(df_wosw['review'])):
    
    Sentiment = SIA(df_wosw['review'][i])

    compound.append(Sentiment['compound']) 
    neg.append(Sentiment['neg'])
    pos.append(Sentiment['pos'])
    neu.append(Sentiment['neu'])

In [111]:
#Store the sentiment scores in the merge data set
df_wosw['Compound'] = compound
df_wosw['Negative'] = neg
df_wosw['Neutral'] = neu
df_wosw['Positive'] = pos

In [112]:
#Save as csv file
df_wosw.to_csv('IMDB Dataset with Sentiment Score and without stopwords.csv', index=False)

In [113]:
#Load dataset with sentiment score
df_wosw_sentiment = pd.read_csv('IMDB Dataset with Sentiment Score and without stopwords.csv')

In [114]:
df_wosw_sentiment.head()

Unnamed: 0,review,num_sentiment,lemma_review,Subjectivity,Polarity,Compound,Negative,Neutral,Positive
0,reviewers mentioned watching 1 oz episode youl...,1.0,reviewer mentioned watching 1 oz episode youll...,0.522282,0.025685,-0.9948,0.347,0.568,0.085
1,wonderful little production filming technique ...,1.0,wonderful little production filming technique ...,0.592222,0.122778,0.9153,0.091,0.66,0.249
2,thought wonderful way spend time hot summer we...,1.0,thought wonderful way spend time hot summer we...,0.692381,0.349048,0.9666,0.143,0.508,0.348
3,basically theres family little boy jake thinks...,0.0,basically there family little boy jake think t...,0.471429,-0.105357,-0.945,0.271,0.663,0.066
4,petter matteis love time money visually stunni...,1.0,petter matteis love time money visually stunni...,0.442848,0.228697,0.9871,0.034,0.675,0.291


#### 3.2 Split Dataset

In [130]:
features = df_wosw_sentiment.drop(['num_sentiment', 'review', 'lemma_review'], axis=1)
label = df_wosw_sentiment['num_sentiment']

X_train, X_test, y_train, y_test = train_test_split(features, label, test_size = 0.50, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test,y_test,test_size = 0.5, random_state=42)

print("Data distribution:\n- Train: {} \n- Validation: {} \n- Test: {}".format(len(X_train),len(X_val),len(X_test)))

Data distribution:
- Train: 25000 
- Validation: 12500 
- Test: 12500


#### 3.3 Build Model

In [131]:
rf = RandomForestClassifier(random_state=42)
scores = cross_val_score(rf, X_train, y_train.values.ravel(), cv=5)

print(scores)
print(scores.mean())

[0.7512 0.7576 0.756  0.7514 0.7496]
0.75316


In [118]:
print_results(gridsearch(X_train, y_train))

BEST PARAMS: {'max_depth': 10, 'n_estimators': 1000}

0.7302 for {'max_depth': 2, 'n_estimators': 5}
0.75812 for {'max_depth': 2, 'n_estimators': 50}
0.75988 for {'max_depth': 2, 'n_estimators': 100}
0.75944 for {'max_depth': 2, 'n_estimators': 500}
0.75876 for {'max_depth': 2, 'n_estimators': 1000}
0.75944 for {'max_depth': 2, 'n_estimators': 3000}
0.75532 for {'max_depth': 10, 'n_estimators': 5}
0.761 for {'max_depth': 10, 'n_estimators': 50}
0.76156 for {'max_depth': 10, 'n_estimators': 100}
0.76204 for {'max_depth': 10, 'n_estimators': 500}
0.76208 for {'max_depth': 10, 'n_estimators': 1000}
0.762 for {'max_depth': 10, 'n_estimators': 3000}
0.72268 for {'max_depth': 50, 'n_estimators': 5}
0.7508 for {'max_depth': 50, 'n_estimators': 50}
0.75316 for {'max_depth': 50, 'n_estimators': 100}
0.75508 for {'max_depth': 50, 'n_estimators': 500}
0.75568 for {'max_depth': 50, 'n_estimators': 1000}
0.75552 for {'max_depth': 50, 'n_estimators': 3000}
0.72268 for {'max_depth': None, 'n_estimato

In [119]:
# Use validation set to select the best parameters
rf1 = RandomForestClassifier(n_estimators=1000, max_depth=10)
rf1.fit(X_train, y_train.values.ravel())

rf2 = RandomForestClassifier(n_estimators=1000, max_depth=50)
rf2.fit(X_train, y_train.values.ravel())

rf3 = RandomForestClassifier(n_estimators=100, max_depth=2)
rf3.fit(X_train, y_train.values.ravel())

rf4 = RandomForestClassifier(n_estimators=1000, max_depth=None)
rf4.fit(X_train, y_train.values.ravel())

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [120]:
performance(rf1, rf2, rf3, rf4, X_val)

MAX DEPTH: 10, # OF EST: 1000 -- Accuracy: 0.758, Precision: 0.748, Recall: 0.767, F1-Score: 0.758
MAX DEPTH: 50, # OF EST: 1000 -- Accuracy: 0.754, Precision: 0.747, Recall: 0.757, F1-Score: 0.752
MAX DEPTH: 2, # OF EST: 100 -- Accuracy: 0.753, Precision: 0.747, Recall: 0.754, F1-Score: 0.751
MAX DEPTH: None, # OF EST: 1000 -- Accuracy: 0.752, Precision: 0.745, Recall: 0.755, F1-Score: 0.75


In [123]:
final_performance(rf1, X_test)

MAX DEPTH: 10, # OF EST: 1000 -- Accuracy: 0.769, Precison: 0.767, Recall: 0.782, F1-Score: 0.775


### 4.Build Model with dataset with stopwords using TF-IDF

#### 4.1 Split Dataset which has stopwords

In [12]:
# keep 50% for the training set and 25% both for the validation and the test set.
from sklearn.model_selection import train_test_split

df_wsw = pd.read_csv('df_wsw.csv')

features = df_wsw.drop(['num_sentiment', 'review'], axis=1)
label = df_wsw['num_sentiment']

X_train, X_test, y_train, y_test = train_test_split(features, label, test_size = 0.70, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = 0.50, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test,y_test,test_size = 0.5, random_state=42)

print("Data distribution:\n- Train: {} \n- Validation: {} \n- Test: {}".format(len(X_train),len(X_val),len(X_test)))

Data distribution:
- Train: 7500 
- Validation: 3750 
- Test: 3750


#### 4.2 Create Features using TF-IDF

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

#Tfidf vectorizer
tv = TfidfVectorizer()

#transformed train, val, test reviews
tv_train=tv.fit_transform(X_train['lemma_review'])
tv_val=tv.transform(X_val['lemma_review'])
tv_test=tv.transform(X_test['lemma_review'])

print('Tfidf_train:', tv_train.shape)
print('Tfidf_test:', tv_val.shape)
print('Tfidf_test:', tv_test.shape)

Tfidf_train: (7500, 63948)
Tfidf_test: (3750, 63948)
Tfidf_test: (3750, 63948)


#### 4.3 Build Model

In [152]:
# Base Model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

rf = RandomForestClassifier(random_state=42)
scores = cross_val_score(rf,tv_train,y_train.values.ravel(),cv=5)

print(scores)
print(scores.mean())

[0.81933333 0.83733333 0.82133333 0.80866667 0.832     ]
0.8237333333333334


In [153]:
print_results(gridsearch(tv_train, y_train))

BEST PARAMS: {'max_depth': None, 'n_estimators': 1000}

0.57813 for {'max_depth': 2, 'n_estimators': 5}
0.71107 for {'max_depth': 2, 'n_estimators': 50}
0.74933 for {'max_depth': 2, 'n_estimators': 100}
0.79467 for {'max_depth': 2, 'n_estimators': 500}
0.8016 for {'max_depth': 2, 'n_estimators': 1000}
0.8052 for {'max_depth': 2, 'n_estimators': 3000}
0.66747 for {'max_depth': 10, 'n_estimators': 5}
0.79093 for {'max_depth': 10, 'n_estimators': 50}
0.806 for {'max_depth': 10, 'n_estimators': 100}
0.83253 for {'max_depth': 10, 'n_estimators': 500}
0.83227 for {'max_depth': 10, 'n_estimators': 1000}
0.8348 for {'max_depth': 10, 'n_estimators': 3000}
0.67947 for {'max_depth': 50, 'n_estimators': 5}
0.80627 for {'max_depth': 50, 'n_estimators': 50}
0.82853 for {'max_depth': 50, 'n_estimators': 100}
0.8396 for {'max_depth': 50, 'n_estimators': 500}
0.8436 for {'max_depth': 50, 'n_estimators': 1000}
0.84373 for {'max_depth': 50, 'n_estimators': 3000}
0.6724 for {'max_depth': None, 'n_estimato

In [14]:
# Use validation set to select the best parameters
rf1 = RandomForestClassifier(n_estimators=1000, max_depth=None)
rf1.fit(tv_train, y_train.values.ravel())

rf2 = RandomForestClassifier(n_estimators=500, max_depth=50)
rf2.fit(tv_train, y_train.values.ravel())

rf3 = RandomForestClassifier(n_estimators=500, max_depth=10)
rf3.fit(tv_train, y_train.values.ravel())

rf4 = RandomForestClassifier(n_estimators=500, max_depth=None)
rf4.fit(tv_train, y_train.values.ravel())

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [15]:
performance(rf1, rf2, rf3, rf4, tv_val)

MAX DEPTH: None, # OF EST: 1000 -- Accuracy: 0.826, Precision: 0.834, Recall: 0.822, F1-Score: 0.828
MAX DEPTH: 50, # OF EST: 500 -- Accuracy: 0.827, Precision: 0.835, Recall: 0.825, F1-Score: 0.83
MAX DEPTH: 10, # OF EST: 500 -- Accuracy: 0.821, Precision: 0.806, Recall: 0.855, F1-Score: 0.83
MAX DEPTH: None, # OF EST: 500 -- Accuracy: 0.82, Precision: 0.826, Recall: 0.819, F1-Score: 0.823


In [18]:
final_performance(rf2, tv_test)

MAX DEPTH: 50, # OF EST: 500 -- Accuracy: 0.848, Precison: 0.848, Recall: 0.84, F1-Score: 0.844


### 5.Build Model with dataset without stopwords using TF-IDF

#### 5.1 Split Dataset which do not have stopwords

In [19]:
from sklearn.model_selection import train_test_split

df_wosw = pd.read_csv('df_wosw.csv')

features = df_wosw.drop(['num_sentiment', 'review'], axis=1)
label = df_wosw['num_sentiment']

X_train, X_test, y_train, y_test = train_test_split(features, label, test_size = 0.70, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = 0.50, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test,y_test,test_size = 0.5, random_state=42)

print("Data distribution:\n- Train: {} \n- Validation: {} \n- Test: {}".format(len(X_train),len(X_val),len(X_test)))

Data distribution:
- Train: 7500 
- Validation: 3750 
- Test: 3750


#### 5.2 Create Features using TF-IDF

In [20]:
tv = TfidfVectorizer()

tv_train=tv.fit_transform(X_train['lemma_review'])
tv_val=tv.transform(X_val['lemma_review'])
tv_test=tv.transform(X_test['lemma_review'])

print('Tfidf_train:',tv_train.shape)
print('Tfidf_test:',tv_val.shape)
print('Tfidf_test:',tv_test.shape)

Tfidf_train: (7500, 63728)
Tfidf_test: (3750, 63728)
Tfidf_test: (3750, 63728)


#### 5.3 Build Model

In [21]:
# Base Model
rf = RandomForestClassifier(random_state=42)
scores = cross_val_score(rf,tv_train,y_train.values.ravel(),cv=5)

print(scores)
print(scores.mean())

[0.834      0.85266667 0.84866667 0.81066667 0.84      ]
0.8371999999999999


In [22]:
print_results(gridsearch(tv_train, y_train))

BEST PARAMS: {'max_depth': 50, 'n_estimators': 3000}

0.5864 for {'max_depth': 2, 'n_estimators': 5}
0.72227 for {'max_depth': 2, 'n_estimators': 50}
0.75947 for {'max_depth': 2, 'n_estimators': 100}
0.792 for {'max_depth': 2, 'n_estimators': 500}
0.8032 for {'max_depth': 2, 'n_estimators': 1000}
0.80813 for {'max_depth': 2, 'n_estimators': 3000}
0.66947 for {'max_depth': 10, 'n_estimators': 5}
0.79853 for {'max_depth': 10, 'n_estimators': 50}
0.81747 for {'max_depth': 10, 'n_estimators': 100}
0.836 for {'max_depth': 10, 'n_estimators': 500}
0.83853 for {'max_depth': 10, 'n_estimators': 1000}
0.84213 for {'max_depth': 10, 'n_estimators': 3000}
0.7116 for {'max_depth': 50, 'n_estimators': 5}
0.822 for {'max_depth': 50, 'n_estimators': 50}
0.83733 for {'max_depth': 50, 'n_estimators': 100}
0.85067 for {'max_depth': 50, 'n_estimators': 500}
0.85293 for {'max_depth': 50, 'n_estimators': 1000}
0.85747 for {'max_depth': 50, 'n_estimators': 3000}
0.71013 for {'max_depth': None, 'n_estimators'

In [23]:
# Use validation set to select the best parameters
rf1 = RandomForestClassifier(n_estimators=3000, max_depth=50)
rf1.fit(tv_train, y_train.values.ravel())

rf2 = RandomForestClassifier(n_estimators=3000, max_depth=None)
rf2.fit(tv_train, y_train.values.ravel())

rf3 = RandomForestClassifier(n_estimators=3000, max_depth=10)
rf3.fit(tv_train, y_train.values.ravel())

rf4 = RandomForestClassifier(n_estimators=1000, max_depth=50)
rf4.fit(tv_train, y_train.values.ravel())

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=50, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [24]:
performance(rf1, rf2, rf3, rf4, tv_val)

MAX DEPTH: 50, # OF EST: 3000 -- Accuracy: 0.833, Precision: 0.839, Recall: 0.834, F1-Score: 0.836
MAX DEPTH: None, # OF EST: 3000 -- Accuracy: 0.833, Precision: 0.841, Recall: 0.829, F1-Score: 0.835
MAX DEPTH: 10, # OF EST: 3000 -- Accuracy: 0.826, Precision: 0.802, Recall: 0.874, F1-Score: 0.837
MAX DEPTH: 50, # OF EST: 1000 -- Accuracy: 0.828, Precision: 0.833, Recall: 0.829, F1-Score: 0.831


In [33]:
final_performance(rf2, tv_test)

MAX DEPTH: None, # OF EST: 3000 -- Accuracy: 0.852, Precison: 0.849, Recall: 0.849, F1-Score: 0.849
