In [52]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

df = pd.read_csv('Question2Dataset.csv')
df.head(10)


Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...
5,8196_8,1,I dont know why people think this is such a ba...
6,7166_2,0,"This movie could have been very good, but come..."
7,10633_1,0,I watched this video at a friend's house. I'm ...
8,319_1,0,"A friend of mine bought this film for ?1, and ..."
9,8713_10,1,<br /><br />This movie is full of references. ...


In [53]:
vectorizer = CountVectorizer(binary=True)
vectorizer.fit(df.review)
len(vectorizer.vocabulary_)
df_train, df_test = train_test_split(df, test_size=0.25, shuffle=True)
X_train = vectorizer.transform(df_train.review)
X_test = vectorizer.transform(df_test.review)
print(X_train.shape)
print(X_test.shape)

(18750, 74608)
(6250, 74608)


In [54]:
#BernoulliNB
clf = BernoulliNB().fit(X_train, df_train.sentiment)
np.exp(clf.class_log_prior_)
def top_20(classifier, vectorizer, categories=(' - review', ' + review')):
    feature_names = np.asarray(vectorizer.get_feature_names())
    for i, category in enumerate(categories):
        top10 = np.argsort(classifier.feature_log_prob_[i])[-20:]
        print("%s: %s" % (category, " ".join(feature_names[top10])))

top_20(clf, vectorizer)

 - review: be have br as not on movie was with for but that in is it this to of and the
 + review: movie not film one br was on as but with for that it this in is to of and the


In [55]:
predicts = clf.predict(X_train)
print(classification_report(df_train.sentiment, predicts))

              precision    recall  f1-score   support

           0       0.88      0.95      0.91      9304
           1       0.94      0.88      0.91      9446

    accuracy                           0.91     18750
   macro avg       0.91      0.91      0.91     18750
weighted avg       0.91      0.91      0.91     18750



In [56]:
count_vect = CountVectorizer(binary=False).fit(df.review)
X_train_counts = count_vect.transform(df_train.review)
X_test_counts = count_vect.transform(df_test.review)

In [72]:
print(dict(zip(count_vect.inverse_transform(X_train_counts[0])[0], X_train_counts[0].data)))


{'10': 1, 'adventure': 1, 'alter': 1, 'altered': 1, 'apprehension': 1, 'bean': 1, 'better': 2, 'br': 2, 'brief': 1, 'commanding': 1, 'conclusion': 1, 'contrast': 1, 'cornwell': 1, 'costumes': 1, 'cowed': 1, 'despite': 1, 'determined': 1, 'drawn': 1, 'ego': 1, 'enjoyed': 1, 'flat': 1, 'great': 1, 'intelligent': 1, 'later': 1, 'literary': 1, 'locations': 1, 'muddled': 1, 'narrowly': 1, 'new': 1, 'novel': 1, 'officer': 4, 'pause': 1, 'photography': 1, 'piercing': 1, 'plot': 2, 'promoted': 1, 'pushes': 1, 'quick': 1, 'ranks': 1, 'rating': 1, 'read': 1, 'real': 1, 'really': 1, 'right': 1, 'role': 1, 'scene': 1, 'sean': 1, 'sets': 1, 'sharpe': 2, 'somewhat': 1, 'stare': 1, 'substantially': 1, 'surprised': 1, 'tone': 1, 'trials': 1, 'tripped': 1, 'unfortunately': 1, 'watched': 1}


In [58]:
#MultinomialNB

clf = MultinomialNB().fit(X_train_counts, df_train.sentiment)
predicts = clf.predict(X_train_counts)
print('----------------------------------Train-----------------------------')
print(classification_report(df_train.sentiment, predicts))
X_test_counts = count_vect.transform(df_test.review)
predicts = clf.predict(X_test_counts)
print('----------------------------------Test-----------------------------')
print(classification_report(df_test.sentiment, predicts))

----------------------------------Train-----------------------------
              precision    recall  f1-score   support

           0       0.88      0.94      0.91      9304
           1       0.94      0.87      0.90      9446

    accuracy                           0.91     18750
   macro avg       0.91      0.91      0.91     18750
weighted avg       0.91      0.91      0.91     18750

----------------------------------Test-----------------------------
              precision    recall  f1-score   support

           0       0.83      0.87      0.85      3196
           1       0.86      0.82      0.84      3054

    accuracy                           0.84      6250
   macro avg       0.85      0.84      0.84      6250
weighted avg       0.85      0.84      0.84      6250



In [59]:
top_20(clf, count_vect)

 - review: not on you film as with but for movie was that this in it is br to of and the
 + review: you on his movie but film was for with as this that it br in is to of and the


In [60]:

count_vect = CountVectorizer(stop_words='english', binary=False).fit(df.review)
X_train_counts = count_vect.transform(df_train.review)
X_test_counts = count_vect.transform(df_test.review)
clf = MultinomialNB().fit(X_train_counts, df_train.sentiment)
predicts = clf.predict(X_test_counts)
print(classification_report(df_test.sentiment, predicts))

              precision    recall  f1-score   support

           0       0.85      0.87      0.86      3196
           1       0.86      0.84      0.85      3054

    accuracy                           0.85      6250
   macro avg       0.85      0.85      0.85      6250
weighted avg       0.85      0.85      0.85      6250



In [61]:
top_20(clf, count_vect)

 - review: character characters think way movies acting plot make people story don really time good bad just like film movie br
 + review: character characters movies think films way life best love people really great time story just good like movie film br


In [62]:

vectorizer = TfidfVectorizer(stop_words='english')
vectorizer = vectorizer.fit(df.review)
X_train_vectors = vectorizer.transform(df_train.review)
X_test_vectors = vectorizer.transform(df_test.review)
vectorizer.inverse_transform(X_train_vectors[0])[0][np.argsort(X_train_vectors[0].data)]

array(['really', 'great', 'scene', 'real', '10', 'new', 'br', 'right',
       'role', 'watched', 'later', 'read', 'unfortunately', 'despite',
       'enjoyed', 'somewhat', 'rating', 'sets', 'surprised', 'novel',
       'flat', 'intelligent', 'conclusion', 'tone', 'adventure', 'drawn',
       'costumes', 'photography', 'brief', 'plot', 'quick', 'better',
       'locations', 'contrast', 'sean', 'determined', 'ego', 'ranks',
       'muddled', 'literary', 'trials', 'stare', 'promoted', 'alter',
       'altered', 'pause', 'pushes', 'commanding', 'bean', 'piercing',
       'narrowly', 'substantially', 'tripped', 'apprehension', 'cornwell',
       'cowed', 'sharpe', 'officer'], dtype='<U66')

In [63]:
clf = MultinomialNB().fit(X_train_vectors, df_train.sentiment)
top_20(clf, vectorizer)

 - review: better think watch worst make people movies story plot acting time don really good like just bad film movie br
 + review: films way seen think watch movies life people best love really time just story like good great film movie br


In [64]:
predicts = clf.predict(X_train_vectors)
print('----------------------------------Train-----------------------------')
print(classification_report(df_train.sentiment, predicts))
X_test_vectors = vectorizer.transform(df_test.review)
predicts = clf.predict(X_test_vectors)
print('----------------------------------Tet-----------------------------')
print(classification_report(df_test.sentiment, predicts))

----------------------------------Train-----------------------------
              precision    recall  f1-score   support

           0       0.91      0.93      0.92      9304
           1       0.93      0.91      0.92      9446

    accuracy                           0.92     18750
   macro avg       0.92      0.92      0.92     18750
weighted avg       0.92      0.92      0.92     18750

----------------------------------Tet-----------------------------
              precision    recall  f1-score   support

           0       0.87      0.86      0.86      3196
           1       0.85      0.87      0.86      3054

    accuracy                           0.86      6250
   macro avg       0.86      0.86      0.86      6250
weighted avg       0.86      0.86      0.86      6250



In [69]:
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2)).fit(df.review)

X_train_vectors = vectorizer.transform(df_train.review)
X_test_vectors = vectorizer.transform(df_test.review)
vectorizer.inverse_transform(X_train_vectors[0])[0][np.argsort(X_train_vectors[0].data)]


array(['br br', 'really', 'great', 'scene', 'real', '10', 'new', 'br',
       'right', 'role', 'watched', 'later', 'read', 'unfortunately',
       'despite', 'enjoyed', 'somewhat', 'rating', 'sets', 'surprised',
       'novel', 'flat', 'intelligent', 'conclusion', 'tone', 'adventure',
       'drawn', 'costumes', 'photography', 'brief', 'plot', 'quick',
       'better', 'locations', 'contrast', 'sean', 'determined',
       'really enjoyed', 'rating 10', 'br rating', 'ego', 'ranks',
       'muddled', 'literary', 'trials', 'stare', 'promoted', 'alter',
       'altered', 'pause', 'pushes', 'commanding', 'bean', 'alter ego',
       'piercing', 'sets tone', 'plot somewhat', 'narrowly',
       'brief scene', 'commanding officer', 'substantially', 'tripped',
       'apprehension', 'great photography', 'sean bean', 'better read',
       'read watched', 'locations costumes', 'new role', 'flat plot',
       'better unfortunately', 'officer br', 'adventure better', 'cowed',
       'cornwell', 'cos

In [66]:
clf = MultinomialNB().fit(X_train_vectors, df_train.sentiment)

In [68]:
predicts = clf.predict(X_train_vectors)
print('----------------------------------Train-----------------------------')
print(classification_report(df_train.sentiment, predicts))
X_test_vectors = vectorizer.transform(df_test.review)
predicts = clf.predict(X_test_vectors)
print('----------------------------------Tet-----------------------------')
print(classification_report(df_test.sentiment, predicts))

----------------------------------Train-----------------------------
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      9304
           1       0.99      0.99      0.99      9446

    accuracy                           0.99     18750
   macro avg       0.99      0.99      0.99     18750
weighted avg       0.99      0.99      0.99     18750

----------------------------------Tet-----------------------------
              precision    recall  f1-score   support

           0       0.89      0.86      0.88      3196
           1       0.86      0.89      0.87      3054

    accuracy                           0.87      6250
   macro avg       0.87      0.88      0.87      6250
weighted avg       0.88      0.87      0.87      6250



In [None]:

predicted = clf.predict(vectorizer.transform(df_test.review))

pd.DataFrame({'Predicted': predicted}).to_csv('solution.csv', index_label='Id')