# Bag of Words Meets Bags of Popcorn

[Kaggle Challenge](https://www.kaggle.com/c/word2vec-nlp-tutorial)
Use Google's Word2Vec for movie reviews

Deadline: 2019/01/05

In [1]:
import time
import re

import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer 

## Import Data

In [2]:
# All three competition files share the same layout: a header row,
# tab-separated fields, and quoting=3 (csv.QUOTE_NONE) so that quote
# characters inside the review text are kept as literal characters.
tsv_options = dict(header=0, delimiter="\t", quoting=3)

train = pd.read_csv("labeledTrainData.tsv", **tsv_options)
unlabeled_train = pd.read_csv("unlabeledTrainData.tsv", **tsv_options)
test = pd.read_csv("testData.tsv", **tsv_options)

### Preprocessing

In [3]:
# Lazily-filled cache so the NLTK stop-word corpus is read once instead of
# once per review (the function is called for every train and test review).
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def review_to_words(raw_review, remove_stopwords = True):
    """Turn one raw review into a cleaned, space-separated string of words.

    Steps: strip HTML markup, replace every non-letter character with a
    space, lowercase, split on whitespace and (optionally) drop English
    stop words.

    Parameters
    ----------
    raw_review : str
        Review text, possibly containing HTML markup.
    remove_stopwords : bool, default True
        When True, words found in NLTK's English stop-word list are removed.
        When False, all words are kept.

    Returns
    -------
    str
        The remaining lowercase words joined by single spaces.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning on every call.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    letters = re.sub("[^a-zA-Z]", " ", review_text)
    words = letters.lower().split()

    if remove_stopwords:
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]

    # `words` is always bound here, so remove_stopwords=False no longer
    # raises UnboundLocalError.
    return " ".join(words)

In [4]:
reviews_count = train['review'].size

clean_train_reviews = []

# Clean every training review in order, reporting progress every 5000 reviews.
for position, raw_review in enumerate(train['review'], start=1):
    clean_train_reviews.append(review_to_words(raw_review))
    if position % 5000 == 0:
        print("Review %d of %d\t" % (position, len(train)))

Review 5000 of 25000	
Review 10000 of 25000	
Review 15000 of 25000	
Review 20000 of 25000	
Review 25000 of 25000	


In [5]:
clean_test_reviews = []

# Clean every test review in order, reporting progress every 5000 reviews.
for position, raw_review in enumerate(test['review'], start=1):
    clean_test_reviews.append(review_to_words(raw_review))
    if position % 5000 == 0:
        print("Review %d of %d\t" % (position, len(test)))

Review 5000 of 25000	
Review 10000 of 25000	
Review 15000 of 25000	
Review 20000 of 25000	
Review 25000 of 25000	


In [9]:
# Sanity check: how many cleaned training reviews were collected.
len(clean_train_reviews)

25000

## TF-IDF: Term Frequency–Inverse Document Frequency
  
詞彙的重要程度：在文件中出現頻率高，而在整個文件集合的其他文件中出現頻率少

### fit( )
渡されたデータの最大値、最小値、平均、標準偏差、傾き...などの統計を取得して、内部メモリに保存する。

### transform( )
fit( )で取得した統計情報を使って、渡されたデータを実際に書き換える。

### 方法一：CountVectorizer + TfidfTransformer

In [None]:
# Method 1: count the terms first, then re-weight the counts with TF-IDF.
vectorizer = CountVectorizer()
transformer = TfidfTransformer()

# Both objects are fitted on the cleaned training reviews.
term_counts = vectorizer.fit_transform(clean_train_reviews)
tfidf = transformer.fit_transform(term_counts)

In [None]:
# Keep `tfidf` as the sparse matrix returned by fit_transform.
# Calling .toarray() here would allocate a dense 25000 x 74047 float64
# array (roughly 15 GB), and the cell would fail when re-run because a
# NumPy array has no .toarray(). The random forest used later accepts
# sparse input directly, so densifying is unnecessary.
tfidf.shape

# tfidf.shape = (25000, 74047)

In [None]:
# Build the test features from the *test* reviews, using the vocabulary and
# IDF weights already learned from the training reviews. transform() (not
# fit_transform()) keeps the columns aligned with `tfidf` and leaves the
# fitted vectorizer/transformer untouched.
# The result stays sparse: a dense copy would be as large as the dense
# training matrix, and the forest's predict() accepts sparse input.
test_data_features = transformer.transform(vectorizer.transform(clean_test_reviews))

### 方法二：直接用 TfidfVectorizer

In [None]:
# Method 2: TfidfVectorizer does the counting and TF-IDF weighting in one step.
vectorizer_tfidf = TfidfVectorizer()

# Keep the result sparse. With the full vocabulary, .toarray() would
# allocate a dense documents-by-terms float64 array of many gigabytes,
# and the random forest trained on it accepts sparse input.
tfidf2 = vectorizer_tfidf.fit_transform(clean_train_reviews)

In [None]:
# Display the TF-IDF matrix produced by TfidfVectorizer.
tfidf2

## Random Forest (Supervised Learning)

More trees may perform better, but certainly take longer to run.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a 100-tree random forest on the method-1 TF-IDF features.
# random_state is fixed so that re-running the notebook reproduces the
# same model and predictions; the value itself is arbitrary.
forest = RandomForestClassifier(n_estimators = 100, random_state = 42)
forest = forest.fit(tfidf, train['sentiment'])

In [None]:
# Predict sentiment labels for the test features with the fitted forest.
classifier = forest
result = classifier.predict(test_data_features)

In [None]:
# output = pd.DataFrame(data = {"id": test['id'], "sentiment": result})
# output.to_csv("Result/tfidf_and_count.csv", index=False, quoting=3)

In [None]:
# Train a *separate* forest on the method-2 features. Calling forest.fit()
# again would refit the same object in place, so `forest` and `forest2`
# would be one model and the method-1 fit would be lost.
# get_params() copies the hyperparameters of `forest` into the new estimator.
forest2 = RandomForestClassifier(**forest.get_params()).fit(tfidf2, train['sentiment'])

In [None]:
classifier = forest2

# forest2 was trained on features from `vectorizer_tfidf`, so the test
# reviews must go through that same fitted vectorizer (transform only)
# rather than reusing features prepared for another method.
test_features_tfidf2 = vectorizer_tfidf.transform(clean_test_reviews)
result = forest2.predict(test_features_tfidf2)

In [None]:
# output = pd.DataFrame(data = {"id": test['id'], "sentiment": result})
# output.to_csv("Result/tfidf2.csv", index=False, quoting=3)

### 方法三：TFIDF + SVM

In [10]:
vectorizer = CountVectorizer(analyzer = 'word', max_features = 2500)

# Learn the vocabulary (at most 2500 terms) from the training reviews only.
train_data_features = vectorizer.fit_transform(clean_train_reviews)
train_data_features = train_data_features.toarray()

# Use transform(), not fit_transform(), for the test reviews: refitting
# would learn a different vocabulary from the test set, so column j would
# no longer mean the same word in the train and test matrices.
test_data_features = vectorizer.transform(clean_test_reviews)
test_data_features = test_data_features.toarray()

In [25]:
# Fit the IDF weights on the training counts only, then apply the same
# fitted transformer to both the training and the test counts.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(train_data_features)

test_tfidf = tfidf_transformer.transform(test_data_features)
train_tfidf = tfidf_transformer.transform(train_data_features)

In [26]:
from sklearn.svm import SVC, LinearSVC

# Fit a linear SVM on the training TF-IDF features (fit() returns the
# estimator itself), then predict labels for the test TF-IDF features.
linear_svc = LinearSVC().fit(train_tfidf, train['sentiment'])

result = linear_svc.predict(test_tfidf)

### Accuracy (on the training set)

In [37]:
# Mean accuracy of the fitted SVM, as a percentage rounded to two decimals.
# NOTE: this is scored on the same data the model was fitted on, so it is
# training accuracy and not an estimate of performance on unseen reviews.
acc_linear_svc = round(
    100 * linear_svc.score(train_tfidf, train['sentiment']),
    2,
)

acc_linear_svc

91.43

In [38]:
# output = pd.DataFrame(data = {"id": test['id'], "sentiment": result})
# output.to_csv("TFIDF_SVM.csv", index=False, quoting=3)