In [87]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import numpy as np
import string
import re

In [88]:
imdb = pd.read_csv('IMDB Dataset.csv')[:5000]

*** ***

# Cleaning

In [216]:
def remove_br(string):
    raw_string = r"{}".format(string)
    new_str = re.sub("<br />", '', raw_string)
    pattern = r'\\'
    new_str = re.sub(pattern, '', new_str)
    new_str = new_str.lower()
    return new_str

In [217]:
no_br_reviews = [remove_br(review) for review in imdb['review']]

In [218]:
no_br_reviews[1]

'a wonderful little production. the filming technique is very unassuming- very old-time-bbc fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. the actors are extremely well chosen- michael sheen not only "has got all the polari" but he has all the voices down pat too! you can truly see the seamless editing guided by the references to williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. a masterful production about one of the great master\'s of comedy and his life. the realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. it plays on our knowledge and our senses, particularly with the scenes concerning orton and halliwell and the sets (particularly of their flat with halliwell\'s murals decorating every surface) are terribly well done.'

In [120]:
imdb['no_br_strings'] = no_br_reviews

In [121]:
imdb.head()

Unnamed: 0,review,sentiment,no_br_strings
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production. the filming tec...
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,negative,basically there's a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"petter mattei's ""love in the time of money"" is..."


*** ***

# Td-Idf

*Bulding the matrix with the Td-Idf scores, the features of the machine learning models*

In [162]:
vect = TfidfVectorizer(stop_words = ENGLISH_STOP_WORDS, max_features=5000)

In [163]:
X = vect.fit_transform(imdb['no_br_strings'], imdb['sentiment'])
y = imdb['sentiment']

In [164]:
df = pd.DataFrame(X.toarray(), columns=vect.get_feature_names())
df.head()

Unnamed: 0,000,10,100,11,12,13,13th,14,15,16,...,yesterday,york,young,younger,youth,zero,zoey,zombie,zombies,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.082603,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.062443,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.223644,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.102777,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


*** ***

# Logistic Regression

In [173]:
#Passing the dense matrix to the model

X = df.copy() 

In [174]:
#Splitting the data

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [219]:
#Training

log_reg = LogisticRegression(max_iter=300)
log_reg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=300,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [220]:
#Tuning 

from sklearn.model_selection import GridSearchCV
parameters = {
            'penalty':['l2'],
            'C':[1,10,100],
            'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag'],
        }
GS = GridSearchCV(log_reg, parameters,cv=10,verbose=1)
GS.fit(X,y)

Fitting 10 folds for each of 12 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed: 19.5min finished


GridSearchCV(cv=10, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=300, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1, 10, 100], 'penalty': ['l2'],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [221]:
print('Best parameters:', GS.best_params_)
print('Best score:', GS.best_score_)

Best parameters: {'C': 1, 'penalty': 'l2', 'solver': 'newton-cg'}
Best score: 0.8598000000000001


*** ***

In [208]:
#Training with best parameters
log_reg = LogisticRegression(**GS.best_params_)
log_reg.fit(X_train, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [209]:
#Prediction and score

y_pred = log_reg.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print('Accuracy Score =', acc)

Accuracy Score = 0.864


*** ***

# Recommender

*The idea is to build a 'review recommender' that returns reviews similars to the input review. The tool can be used for clustering users, movie recommendations, topic modeling, etc*

In [48]:
imdb.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,neutral
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [177]:
#Building the cosine simmilarity matrix

cosine_similarities = linear_kernel(X, X)

In [202]:
#List of similiarities scores (Review 3)

sim_scores = list(enumerate(cosine_similarities[3]))
sim_scores[:20]

[(0, 0.013257008608555986),
 (1, 0.01356644478406045),
 (2, 0.0022480557888732437),
 (3, 1.0000000000000004),
 (4, 0.013769226758691194),
 (5, 0.01901123628297709),
 (6, 0.009093698114802174),
 (7, 0.006479039585715199),
 (8, 0.018246089087183445),
 (9, 0.02456206912382156),
 (10, 0.03683358266993418),
 (11, 0.013504952975059943),
 (12, 0.02762853336157901),
 (13, 0.029914162899998053),
 (14, 0.024581674585655706),
 (15, 0.027527806646368648),
 (16, 0.022441955267229603),
 (17, 0.0326790449811334),
 (18, 0.02314954832027493),
 (19, 0.0212305404865984)]

In [180]:
#The 10 highest similarities

sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
sim_scores = sim_scores[:11]
sim_scores

[(3, 1.0000000000000002),
 (4635, 0.3420181053856195),
 (1476, 0.3290418558155839),
 (418, 0.3243037345590046),
 (1137, 0.32318123698806706),
 (259, 0.3219048514001504),
 (4525, 0.3057911870240598),
 (787, 0.3045910083090193),
 (126, 0.30279953018473915),
 (3890, 0.30004357634490153),
 (1664, 0.29954172010369284)]

In [181]:
#Getting the indices of the most similar reviews

top_ten_idx = [i[0] for i in sim_scores][1:]
top_ten_idx

[4635, 1476, 418, 1137, 259, 4525, 787, 126, 3890, 1664]

In [182]:
# Base review - The input
imdb.iloc[3]['review']

"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them."

<br>

*The recommendations. We can see that the reviews are similar, related to "zombie movies".*

In [183]:
#Reccomendation #1

print(imdb.iloc[4635]['review'])

Anyone who has experienced the terrors of divorce will empathize with this indie film's protagonist, a scared little boy who believes a zombie is hiding in his closet. Is Jake (a mesmerizing Anthony DeMarco) simply "transferring" the trauma of two bickering parents to an understandable image? Or could the creature be real? Writer/director Shelli Ryan neatly balances both possibilities and keeps the audience guessing. Her choice of using one setting - a suburban house - adds to the feeling of desperation and claustrophobia.<br /><br />Brooke Bloom and Peter Sean Bridgers are highly convincing as the angry, but loving parents. However it is the creepy minor characters, Mrs. Bender(Barbara Gruen), an unhinged babysitter and Sam Stone (Ben Bode), a sleazy Real estate agent that linger in the mind. Jake's Closet is a darkly inspired portrait of childhood as a special kind of Hell.


In [184]:
#Recommendatio #2

imdb.iloc[1476]['review']

'Last weekend I bought this \'zombie movie\' from the bargain bin and watched it with some friends thinking it was going to be a budget version of "Land of the Dead".<br /><br />Boy, was I wrong. <br /><br />It seems as if they spent a good portion of their budget on the cover-art, which is very misleading to fans of the zombie genre.<br /><br />We watched up to the point where the zombie chicks come alive and get in the car with some yuppie who is out in the middle of nowhere talking business on a cell-phone. They actually speak to the guy before one of the girls kills him; but once they started driving the car, I couldn\'t suspend my disbelief anymore.<br /><br />Some people actually consider this a "so bad, it\'s good" movie, they are liars. I didn\'t finish the movie, but one of the other reviews mention that they actually somehow become police officers at the end of the movie, which makes me glad to not have watched it all the way through.<br /><br />This is even worse than "Zombi

In [185]:
#Recommendation #3

imdb.iloc[418]['review']

'Okay, I\'ve watched this movie twice now, I have researched it heavily on the net, I have asked several people on there opinions. I have even gone to the length of reading the original Sheridan Lafanu Classic \'Carmilla\', a book that this movie is supposed to be based on. I feel that the best way to review this movie is to describe a game to play whilst watching it. As the plot of the movie doesn\'t seem to make any sense at all, here is the plot of the book.<br /><br />Laura lives in a castle in Syberia with her Father, Mr De Lafontaine. They carry on with their lives blissfully and peacefully. One day they get a letter from the \'General\' a man who has made it his mission in life to avenge his daughters death. He makes claims of supernatural powers being at work, and explains that he will visit them soon. Meanwhile, a chance encounter with a strange woman results in the Lafontaines looking after her Daughter, Carmilla, for several months. Soon Laura starts to be overwhelmed by str