In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score, accuracy_score
import pickle

In [None]:
# Download the NLTK stop-word corpus that stopwords.words('english') reads later in the notebook.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Load the tab-separated review file; the column names are supplied here via `names`.
# 'Reviews' holds the 0/1 sentiment label and 'Comments' holds the review text.
data = pd.read_csv("datasets/reviews.txt", sep="\t", names=['Reviews', 'Comments'])

In [None]:
# Preview the last rows to check that labels and text landed in the expected columns.
data.tail()

Unnamed: 0,Reviews,Comments
6913,0,Brokeback Mountain was boring.
6914,0,So Brokeback Mountain was really depressing.
6915,0,"As I sit here, watching the MTV Movie Awards, ..."
6916,0,Ok brokeback mountain is such a horrible movie.
6917,0,"Oh, and Brokeback Mountain was a terrible movie."


In [None]:
# English stop words as a set; handed to the TF-IDF vectorizer when it is constructed.
stopset = set(stopwords.words('english'))

In [None]:
from nltk.stem.porter import PorterStemmer
# Single shared Porter stemmer instance, reused by stem() for every token.
ps = PorterStemmer()

def stem(text):
    """Return `text` with every whitespace-separated token replaced by its Porter stem.

    Tokens come from ``text.split()``; the stemmed tokens are joined back
    together with single spaces.
    """
    stemmed_tokens = [ps.stem(token) for token in text.split()]
    return " ".join(stemmed_tokens)

In [None]:
# Stem every comment so the vectorizer is fitted on stemmed tokens.
# NOTE: this overwrites the raw text in place; re-running the cell stems the
# already-stemmed text again.
data['Comments'] = data['Comments'].apply(stem)

In [None]:
# Model input: the review text column.
X = data['Comments']

In [None]:
# Target labels as a NumPy array (1 and 0 values, per the displayed output).
Y = data['Reviews'].values
Y

array([1, 1, 1, ..., 0, 0, 0])

In [None]:
# TF-IDF features over lowercased, ASCII-folded text with English stop words removed.
# stop_words is passed as a list: scikit-learn documents this parameter as
# 'english', a list, or None, and recent releases reject a set during parameter
# validation. sorted() keeps the list order deterministic across runs.
# NOTE(review): the comments are stemmed before vectorizing, so stop words whose
# stem differs from the listed form may slip through -- confirm whether the stop
# list should be stemmed as well.
vectorizer = TfidfVectorizer(use_idf=True, lowercase=True, strip_accents='ascii', stop_words=sorted(stopset))

In [None]:
# Hold out 20% of the rows for evaluation; random_state is fixed so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

In [None]:
# Fit the vectorizer on the training text only, then reuse the fitted
# vocabulary/weights to transform the held-out text.
# NOTE: X_train / X_test are rebound from text to TF-IDF matrices here, so
# re-running this cell on its own would feed matrices, not text, to the vectorizer.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)
# Persist the fitted vectorizer. The context manager closes the file handle
# (the original left it open), so the pickle is fully flushed to disk.
with open('transform.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [None]:
# Multinomial naive Bayes classifier fitted on the TF-IDF training matrix and its labels.
clf = naive_bayes.MultinomialNB()
clf.fit(X_train,Y_train)

MultinomialNB()

In [None]:
# Accuracy on the held-out test split, expressed as a percentage.
accuracy_score(Y_test, clf.predict(X_test))*100

97.76011560693641

In [None]:
# Persist the trained classifier. The context manager closes the file handle
# (the original left it open), so the pickle is fully flushed to disk.
filename = 'model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)

In [None]:
def GorB(str):
    """Classify a single comment as 'Good Comment' or 'Bad Comment'.

    The text is passed through ``stem`` before vectorizing because the
    vectorizer was fitted on stemmed comments; unstemmed words would not
    match the fitted vocabulary.

    Parameters
    ----------
    str : str
        Raw comment text. (The name shadows the builtin; it is kept so that
        existing keyword callers keep working.)

    Returns
    -------
    str
        'Good Comment' when the classifier predicts a truthy label,
        otherwise 'Bad Comment'.
    """
    nice_test = np.array([stem(str)])
    pred = vectorizer.transform(nice_test)
    # predict() returns a one-element array; index it explicitly instead of
    # relying on the truthiness of the array.
    return 'Good Comment' if clf.predict(pred)[0] else 'Bad Comment'

In [None]:
# Quick sanity check with an obviously negative phrase.
GorB("Terrible Movie")

'Bad Comment'

In [None]:
import bs4 as bs
import urllib.request

# Download the IMDb user-review page for one title. The context manager closes
# the HTTP response once the body has been read (the original never closed it).
with urllib.request.urlopen('https://www.imdb.com/title/{}/reviews?ref_=tt_ov_rt'.format("tt0111161")) as response:
    sauce = response.read()
soup = bs.BeautifulSoup(sauce, 'lxml')
soup_result = soup.find_all("div",{"class":"text show-more__control"})

reviews_list = []    # raw review text, as scraped
reviews_status = []  # 'Good' / 'Bad' verdict for the review at the same index
for reviews in soup_result:
    # Reviews whose .string is None or empty (e.g. divs with several child
    # nodes) are skipped, so both lists stay aligned.
    if reviews.string:
        reviews_list.append(reviews.string)
        # Stem before vectorizing: the vectorizer was fitted on stemmed text.
        movie_review_list = np.array([stem(reviews.string)])
        movie_vector = vectorizer.transform(movie_review_list)
        pred = clf.predict(movie_vector)
        # predict() returns a one-element array; index it explicitly.
        reviews_status.append('Good' if pred[0] else 'Bad')

reviews_status

['Good', 'Good', 'Good', 'Good', 'Good']