# Bag of Words Meets Bags of Popcorn

[Kaggle Challenge](https://www.kaggle.com/c/word2vec-nlp-tutorial)
Use Google's Word2Vec for movie reviews

Deadline: 2019/01/05

In [1]:
from time import time
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression

## Import Cleaned Data

In [2]:
# Load the three pre-cleaned review sets; the first CSV column becomes the index.
train = pd.read_csv("Data/Word2Vec_clean_labeledTrainData.csv", index_col=0)
unlabeled_train = pd.read_csv("Data/Word2Vec_clean_unlabeledTrainData.csv", index_col=0)
test = pd.read_csv("Data/Word2Vec_clean_testData.csv", index_col=0)

# Report how many reviews each set holds, in the order train / test / unlabeled.
print("Read %d labeled train reviews, %d labeled test reviews, and %d unlabeled reviews.\n"
      % tuple(frame["review"].size for frame in (train, test, unlabeled_train)))

Read 25000 labeled train reviews, 25000 labeled test reviews, and 50000 unlabeled reviews.



In [3]:
# Preview the first rows of the labeled training frame as the cell output.
train.head()

Unnamed: 0,review,id,score,sentiment
0,with all this stuff going down at the moment w...,"""5814_8""",8,1
1,the classic war of the worlds by timothy hines...,"""2381_9""",9,1
2,the film starts with a manager nicholas bell g...,"""7759_3""",3,0
3,it must be assumed that those who praised this...,"""3630_4""",4,0
4,superbly trashy and wondrously unpretentious s...,"""9495_8""",8,1


In [4]:
# Bag of Words: count features over unigrams and bigrams, vocabulary capped at 2500 terms.
count_vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=2500)

# The vocabulary is fitted on the training reviews only and then reused for the
# test reviews; .toarray() converts each sparse result into a dense array.
x_train = count_vectorizer.fit_transform(train["review"].tolist()).toarray()
x_test = count_vectorizer.transform(test["review"].tolist()).toarray()

### Logistic Regression

In [6]:
# The dual formulation (dual=True) is only implemented for the liblinear solver,
# so the solver is pinned explicitly instead of relying on the library default.
lr = LogisticRegression(C=0.2, dual=True, solver="liblinear")

# Labels are taken straight from the training frame, like the other model cells;
# the previously referenced `y_train` is not defined anywhere in this notebook.
lr.fit(x_train, train.sentiment.tolist())

# Overwrites `result` with this model's predictions for the test reviews.
result = lr.predict(x_test)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state makes the forest (and therefore the predictions and the
# accuracy figure computed later) reproducible across kernel restarts.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest = forest.fit(x_train, train.sentiment.tolist())

# Overwrites `result` with this model's predictions for the test reviews.
result = forest.predict(x_test)

### Linear SVC

In [None]:
from sklearn.svm import SVC, LinearSVC

# Build and train the linear SVM in one expression; fit() hands back the
# fitted estimator, so `linear_svc` is the trained model.
linear_svc = LinearSVC().fit(x_train, train["sentiment"].tolist())

# Overwrites `result` with this model's predictions for the test reviews.
result = linear_svc.predict(x_test)

### Multinomial NB

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Build and train the multinomial naive Bayes model in one expression;
# fit() hands back the fitted estimator.
multi_NB = MultinomialNB().fit(x_train, train["sentiment"].tolist())

# Overwrites `result` with this model's predictions for the test reviews.
result = multi_NB.predict(x_test)

In [None]:
# Pair every test review id with whatever predictions `result` currently holds
# (i.e. those of the model cell that was executed last).
output = pd.DataFrame({"id": test["id"], "sentiment": result})

In [None]:
#output.to_csv("Results/Bag_of_Words_model_feature3k_bigram.csv", index=False, quoting=3)

### Check Model Accuracy

In [None]:
# Load a previously saved submission file from disk.
# NOTE(review): the cell that writes this file is commented out above, so the
# file must already exist for this cell to run; `csv` is also not referenced by
# the evaluation code visible below (it scores `output`) — confirm it is needed.
csv = pd.read_csv("Results/Bag_of_Words_model_feature3k_bigram.csv")

In [None]:
def classified_correct(model, i, inside=False):
    """Return whether the prediction in row ``i`` agrees with the rating in its id.

    Each id ends in ``_<rating>``. A prediction counts as correct when the
    predicted sentiment is 0 and the rating is at most 5, or when the predicted
    sentiment is 1 and the rating is above 5.

    Parameters
    ----------
    model : pandas.DataFrame
        Frame with an ``"id"`` column of strings and a ``"sentiment"`` column
        of predicted labels (0 or 1).
    i : int
        Position of the row to check (0-based, independent of the frame's
        index labels).
    inside : bool, default False
        When True, the id is expected to be wrapped in literal double quotes
        (e.g. ``'"5814_8"'``) and the text between the quotes is parsed.
        When False, the id is parsed as-is (e.g. ``'5814_8'``).

    Returns
    -------
    bool
        True if the predicted sentiment matches the rating-derived label.
    """
    # Positional access: `i` comes from range(len(model)), so it must not be
    # interpreted as an index label (which breaks on a non-default index).
    review_id = model["id"].iloc[i]
    predicted = model["sentiment"].iloc[i]

    if inside:
        # Keep only the text between the first pair of double quotes.
        review_id = review_id.split('"')[1]
    rating = int(review_id.split("_")[1])

    correct_negative = predicted == 0 and rating <= 5
    correct_positive = predicted == 1 and rating > 5
    return bool(correct_negative or correct_positive)

# Score the in-memory predictions; inside=True selects the quote-delimited id parsing.
target = output
correct = np.array([classified_correct(target, row, inside=True) for row in range(len(target))])

# Fraction of rows whose prediction matches the rating-derived label.
print(correct.sum() / len(target))

# 