In [21]:
import numpy as np
import pandas as pd
import nltk, re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import string

In [24]:
# English stop words from the NLTK `stopwords` corpus; used further down to filter review tokens.
stop_words = stopwords.words('english')
# Default-configured TF-IDF and bag-of-words vectorizer instances.
tfid, bow = TfidfVectorizer(), CountVectorizer()

In [6]:
# Number of English stop words. `stopwords` is the NLTK corpus object imported at the top of
# the notebook, not the word list; the list built from it is `stop_words`.
len(stop_words)

179

In [15]:
# Read the positive reviews: a plain-text file with one review per line.
# Recent pandas versions raise ValueError for read_csv(sep='\n') (the line terminator may not
# double as the separator), so the lines are read directly instead. Quote characters are kept
# literally here, whereas read_csv would have applied CSV quoting rules to them.
with open('pos.txt', encoding='unicode_escape') as pos_file:
    pos_lines = [line.rstrip('\n') for line in pos_file]

# Skip empty lines (read_csv skipped blank lines by default) and name the text column 'review'.
pos_rev = pd.DataFrame({'review': [line for line in pos_lines if line]})
# Label every positive review with mood = 1.
pos_rev['mood'] = 1

In [16]:
# Read the negative reviews: a plain-text file with one review per line.
# Recent pandas versions raise ValueError for read_csv(sep='\n') (the line terminator may not
# double as the separator), so the lines are read directly instead. Quote characters are kept
# literally here, whereas read_csv would have applied CSV quoting rules to them.
with open('negative.txt', encoding='unicode_escape') as neg_file:
    neg_lines = [line.rstrip('\n') for line in neg_file]

# Skip empty lines (read_csv skipped blank lines by default) and name the text column 'review'.
neg_rev = pd.DataFrame({'review': [line for line in neg_lines if line]})
# Label every negative review with mood = 0.
neg_rev['mood'] = 0

In [17]:
# Preview the first rows of the negative-review frame (text column 'review', label column 'mood').
neg_rev.head()

Unnamed: 0,review,mood
0,"simplistic , silly and tedious.",0
1,"it's so laddish and juvenile , only teenage bo...",0
2,exploitative and largely devoid of the depth o...,0
3,[garbus] discards the potential for pathologic...,0
4,a visually flashy but narratively opaque and e...,0


In [25]:
# converter to lowercase
pos_rev['review'] = pos_rev['review'].apply(lambda x : x.lower())

# remove the numbers and hyphen
pos_rev['review'] = pos_rev['review'].apply(lambda x : re.sub(r"[0-9-]"," ",x)) # 0-9 removes numbers & - removes hyphen

# remove the '''
pos_rev['review'] = pos_rev['review'].apply(lambda x : re.sub(r"\W"," ",x)) #\W => not a word century's=> century s 

# remove the 's'
pos_rev['review'] = pos_rev['review'].apply(lambda x : re.sub(r"\b\w\b"," ",x)) 


# removing the puncatuation
pos_rev['review'] = pos_rev['review'].apply(lambda x : " ".join([word for word in nltk.word_tokenize(x) if word not in string.punctuation]))

# remove the stopwords
pos_rev['review'] = pos_rev['review'].apply(lambda x : " ".join([word for word in nltk.word_tokenize(x) if word not in stop_words]))


In [26]:
# converter to lowercase
neg_rev['review'] = neg_rev['review'].apply(lambda x : x.lower())

# remove the numbers and hyphen
neg_rev['review'] = neg_rev['review'].apply(lambda x : re.sub(r"[0-9-]"," ",x)) # 0-9 removes numbers & - removes hyphen

# remove the '''
neg_rev['review'] = neg_rev['review'].apply(lambda x : re.sub(r"\W"," ",x)) #\W => not a word century's=> century s 

# remove the 's'
neg_rev['review'] = neg_rev['review'].apply(lambda x : re.sub(r"\b\w\b"," ",x)) 


# removing the puncatuation
neg_rev['review'] = neg_rev['review'].apply(lambda x : " ".join([word for word in nltk.word_tokenize(x) if word not in string.punctuation]))

# remove the stopwords
neg_rev['review'] = neg_rev['review'].apply(lambda x : " ".join([word for word in nltk.word_tokenize(x) if word not in stop_words]))


In [30]:
# Merge the positive and negative reviews into one frame with a fresh 0..n-1 index.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported equivalent.
com_rev = pd.concat([pos_rev, neg_rev], ignore_index=True)

In [31]:
# Preview the last rows of the combined frame (the negative reviews were added after the positive ones).
com_rev.tail()

Unnamed: 0,review,mood
10657,terrible movie people nevertheless find moving,0
10658,many definitions time waster movie must surely...,0
10659,stands crocodile hunter hurried badly cobbled ...,0
10660,thing looks like made home video quickie,0
10661,enigma well made dry placid,0


In [34]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    com_rev['review'].values,
    com_rev['mood'],
    test_size=0.2,
    random_state=101,
)


In [35]:
# Re-pack each split as a two-column frame: review text plus its mood label.
train_data = pd.DataFrame(dict(review=X_train, mood=y_train))
test_data = pd.DataFrame(dict(review=X_test, mood=y_test))


In [36]:
# TF-IDF features: the vocabulary and weights are learned from the training reviews only,
# then the already-fitted vectorizer is reused on the test reviews.
train_texts = train_data['review']
test_texts = test_data['review']

vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(train_texts)
test_vectors = vectorizer.transform(test_texts)


In [37]:
from sklearn import svm
from sklearn.metrics import classification_report, accuracy_score

In [38]:
# Support-vector classifier with default settings, trained on the TF-IDF training features.
classifier = svm.SVC()
classifier.fit(X=train_vectors, y=train_data['mood'])

SVC()

In [39]:
# Score the held-out reviews. scikit-learn metrics take (y_true, y_pred) in that order.
# Accuracy is symmetric, so the value is unchanged, but the documented order keeps this cell
# correct if the metric is later swapped for an asymmetric one (e.g. classification_report).
pred = classifier.predict(test_vectors)
accuracy_score(test_data['mood'], pred)

0.7576183778715424

In [40]:
# Persist the fitted classifier and its TF-IDF vectorizer to disk; both files are needed
# together to classify new text later.
import joblib

joblib.dump(value=classifier, filename='75_netflix.pkl')
joblib.dump(value=vectorizer, filename='tfidf.pkl')

['tfidf.pkl']

In [41]:
# Reload the persisted classifier and vectorizer, in that order.
# NOTE: joblib files are pickles -- only load files from a trusted source.
model, tfidf = (joblib.load(path) for path in ('75_netflix.pkl', 'tfidf.pkl'))

In [53]:
# Classify one review typed in by the user with the reloaded model and vectorizer.
# NOTE(review): the text is vectorized exactly as typed; the cleaning steps applied to the
# training reviews are not repeated here -- confirm this is intended.
sent = input('Enter the review:')
vector = tfidf.transform([sent])
my_pred = model.predict(vector)

# Translate the numeric label into its display text; any label other than 0 or 1 prints nothing.
verdict = {1: '+ve', 0: '-ve'}.get(my_pred[0])
if verdict is not None:
    print(verdict)

Enter the review:bad
-ve


In [54]:
# Raw predicted label for the review entered above (1 = positive mood, 0 = negative mood).
my_pred[0]

0