#TFIDF + Classical Model

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#Importing modules
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
#Reading the train dataset
df_train_tfi = pd.read_csv('/content/drive/MyDrive/IMDB_NLP/Train_reviews.csv',usecols = ['Review','Review_label'])
#Reading the test dataset
df_test_tfi = pd.read_csv('/content/drive/MyDrive/IMDB_NLP/Test_reviews.csv',usecols = ['Review','Review_label'])

In [4]:
#Tokenizing the data
df_train_tfi['Review'] = df_train_tfi['Review'].apply(lambda x:word_tokenize(x))
df_test_tfi['Review'] = df_test_tfi['Review'].apply(lambda x:word_tokenize(x))

In [5]:
#Let's have a look!
df_train_tfi['Review']

0        [This, absolutely, terrible, movie, Dont, lure...
1        [I, known, fall, asleep, films, usually, due, ...
2        [Mann, photographs, Alberta, Rocky, Mountains,...
3        [This, kind, film, snowy, Sunday, afternoon, r...
4        [As, others, mentioned, women, go, nude, film,...
                               ...                        
24995    [I, severe, problem, show, several, actually, ...
24996    [The, year, 1964, Ernesto, Che, Guevara, Cuban...
24997    [Okay, So, I, got, back, Before, I, start, rev...
24998    [When, I, saw, trailer, TV, I, surprised, In, ...
24999    [First, Riget, wonderful, Good, comedy, myster...
Name: Review, Length: 25000, dtype: object

In [6]:
df_test_tfi['Review']

0        [There, films, make, careers, For, George, Rom...
1        [A, blackly, comic, tale, downtrodden, priest,...
2        [Scary, Movie, 14, Epic, Movie, Date, Movie, M...
3        [Poor, Shirley, MacLaine, tries, hard, lend, g...
4        [As, former, Erasmus, student, I, enjoyed, fil...
                               ...                        
24995    [Feeling, Minnesota, really, road, movie, that...
24996    [This, without, doubt, one, favourite, horror,...
24997    [Most, predicable, movie, Ive, ever, seenextre...
24998    [Its, exactly, I, expected, Relaxing, humorous...
24999    [They, dont, make, cartoons, like, used, This,...
Name: Review, Length: 25000, dtype: object

In [7]:
#Lets convert it to Lowercase!
df_train_tfi['Review'] = df_train_tfi['Review'].apply(lambda x: [i.lower() for i in x])
df_test_tfi['Review'] = df_test_tfi['Review'].apply(lambda x: [i.lower() for i in x])

In [8]:
#Lets have a look!
df_train_tfi['Review']

0        [this, absolutely, terrible, movie, dont, lure...
1        [i, known, fall, asleep, films, usually, due, ...
2        [mann, photographs, alberta, rocky, mountains,...
3        [this, kind, film, snowy, sunday, afternoon, r...
4        [as, others, mentioned, women, go, nude, film,...
                               ...                        
24995    [i, severe, problem, show, several, actually, ...
24996    [the, year, 1964, ernesto, che, guevara, cuban...
24997    [okay, so, i, got, back, before, i, start, rev...
24998    [when, i, saw, trailer, tv, i, surprised, in, ...
24999    [first, riget, wonderful, good, comedy, myster...
Name: Review, Length: 25000, dtype: object

In [9]:
df_test_tfi['Review']

0        [there, films, make, careers, for, george, rom...
1        [a, blackly, comic, tale, downtrodden, priest,...
2        [scary, movie, 14, epic, movie, date, movie, m...
3        [poor, shirley, maclaine, tries, hard, lend, g...
4        [as, former, erasmus, student, i, enjoyed, fil...
                               ...                        
24995    [feeling, minnesota, really, road, movie, that...
24996    [this, without, doubt, one, favourite, horror,...
24997    [most, predicable, movie, ive, ever, seenextre...
24998    [its, exactly, i, expected, relaxing, humorous...
24999    [they, dont, make, cartoons, like, used, this,...
Name: Review, Length: 25000, dtype: object

In [10]:
#Stemming 
porter = PorterStemmer()
df_train_tfi['Review'] = df_train_tfi['Review'].apply(lambda x:[porter.stem(i) for i in x])
df_test_tfi['Review'] = df_test_tfi['Review'].apply(lambda x:[porter.stem(i) for i in x])

In [11]:
#Changing list to string
df_train_tfi['Review'] = df_train_tfi['Review'].apply(lambda x:" ".join(x))
df_test_tfi['Review'] = df_test_tfi['Review'].apply(lambda x:" ".join(x))

In [12]:
#Lets have a look!
df_train_tfi['Review']

0        thi absolut terribl movi dont lure christoph w...
1        i known fall asleep film usual due combin thin...
2        mann photograph alberta rocki mountain superb ...
3        thi kind film snowi sunday afternoon rest worl...
4        as other mention women go nude film mostli abs...
                               ...                        
24995    i sever problem show sever actual a simpl list...
24996    the year 1964 ernesto che guevara cuban citize...
24997    okay so i got back befor i start review let te...
24998    when i saw trailer tv i surpris in may 2008 i ...
24999    first riget wonder good comedi mysteri thrille...
Name: Review, Length: 25000, dtype: object

In [13]:
df_test_tfi['Review']

0        there film make career for georg romero night ...
1        a blackli comic tale downtrodden priest nazari...
2        scari movi 14 epic movi date movi meet spartan...
3        poor shirley maclain tri hard lend gravita maw...
4        as former erasmu student i enjoy film much it ...
                               ...                        
24995    feel minnesota realli road movi that still bes...
24996    thi without doubt one favourit horror film eve...
24997    most predic movi ive ever seenextrem bore i fe...
24998    it exactli i expect relax humor entertain the ...
24999    they dont make cartoon like use thi one wit gr...
Name: Review, Length: 25000, dtype: object

TFIDF Vectorisation

TFIDF - Term Frequency- Inverse Document Frequency

In [14]:
#Using Tfidf Vectoriser
tfidf_vec = TfidfVectorizer(max_features = 1000,ngram_range = (1,2))
review_vec = tfidf_vec.fit_transform(df_train_tfi['Review']).toarray()

In [None]:
tfidf_vec_test = TfidfVectorizer(max_features = 1000,ngram_range = (1,2))
review_vec_test = tfidf_vec_test.fit_transform(df_test_tfi['Review']).toarray()

In [None]:
#Training and test data
X_train = review_vec
y_train = df_train_tfi['Review_label']
X_test = review_vec_test
y_test = df_test_tfi['Review_label']

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
import numpy as np

Logistic Regression

In [None]:
model_logistic=LogisticRegression()

In [19]:
solvers = [ 'lbfgs', 'liblinear']
penalty = ['l1','l2']     #Creating regularisation penalty space
C_values = [10,5, 1.0, 0.1, 0.01,0.02]    #Creating regularisation hyperparameter space
hyper_parameters = dict(solver = solvers, penalty = penalty, C = C_values  )       #Creating hyperparameters space for Grid Search CV

In [20]:

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=4, random_state=1)
grid_searchcv = GridSearchCV(estimator=model_logistic, param_grid=hyper_parameters, n_jobs=-1, cv=cv, scoring='accuracy')
grid_model_logistic = grid_searchcv.fit(X_train, y_train)
print("Best parameters =", grid_model_logistic.best_params_)

120 fits failed out of a total of 480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
120 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

 0.85764 0.85796 0.85797     nan 0.82

Best parameters = {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}


In [21]:
model_logistic_best = LogisticRegression(C = 1, penalty = 'l2', solver = 'liblinear').fit(X_train, y_train)

In [22]:
model_logistic_best.score(X_test,y_test)

0.5774

References:

https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/

https://www.kaggle.com/code/sohamdas27/imdb-movie-review-eda-sentiment-analysis

https://youtu.be/*6C0sLtw5ctc*