# Bag of Words Meets Bags of Popcorn

[Kaggle Chanllenge](https://www.kaggle.com/c/word2vec-nlp-tutorial)
Use Google's Word2Vec for movie reviews

Deadline: 2019/01/05

In [1]:
import time
import re

import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer 

## Import Data

In [3]:
train_df = pd.read_csv( "labeledTrainData.tsv", header=0, delimiter="\t", quoting=3 )

unlabeled_train_df = pd.read_csv( "unlabeledTrainData.tsv", header=0, delimiter="\t", quoting=3 )

test_df = pd.read_csv( "testData.tsv", header=0, delimiter="\t", quoting=3 )

### Preprocessing

In [4]:
def review_to_words(raw_review, remove_stopwords = True):
    
    review_text = BeautifulSoup(raw_review).get_text()
    letters = re.sub("[^a-zA-Z]", " ", review_text)
    words = letters.lower().split()
    
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        meaningful_words = [w for w in words if not w in stops]
    
    return (" ".join(meaningful_words))

In [5]:
reviews_count = train_df['review'].size

clean_train_reviews = []

for i in range(0, reviews_count):
    clean_train_reviews.append(review_to_words(train_df['review'][i]))
    if( (i+1) % 5000 == 0 ):
        print("Review %d of %d\t" % (i+1, len(train_df)))

Review 5000 of 25000	
Review 10000 of 25000	
Review 15000 of 25000	
Review 20000 of 25000	
Review 25000 of 25000	


In [6]:
reviews_count = unlabeled_train_df['review'].size

clean_utrain_reviews = []

for i in range(0, reviews_count):
    clean_utrain_reviews.append(review_to_words(unlabeled_train_df['review'][i]))
    if( (i+1) % 5000 == 0 ):
        print("Review %d of %d\t" % (i+1, len(unlabeled_train_df)))

Review 5000 of 50000	
Review 10000 of 50000	
Review 15000 of 50000	
Review 20000 of 50000	
Review 25000 of 50000	
Review 30000 of 50000	
Review 35000 of 50000	
Review 40000 of 50000	
Review 45000 of 50000	
Review 50000 of 50000	


In [7]:
clean_test_reviews = [ ]

for i in range(0, len(test_df['review'])):
    clean_test_reviews.append(review_to_words(test_df['review'][i]))
    if( (i+1) % 5000 == 0 ):
        print("Review %d of %d\t" % (i+1, len(test_df)))

Review 5000 of 25000	
Review 10000 of 25000	
Review 15000 of 25000	
Review 20000 of 25000	
Review 25000 of 25000	


In [8]:
len(clean_utrain_reviews)

50000

### Extract review raw score from id

In [21]:
def extract_score(id):
    return int(id[-2])

### Save cleaned data to csv

In [28]:
clean_df = pd.DataFrame(clean_train_reviews, columns=['review'])
clean_df['id'] = train_df.id
clean_df['score'] = train_df['id'].map(extract_score)
clean_df['sentiment'] = train_df.sentiment
clean_df.head()

Unnamed: 0,review,id,score,sentiment
0,stuff going moment mj started listening music ...,"""5814_8""",8,1
1,classic war worlds timothy hines entertaining ...,"""2381_9""",9,1
2,film starts manager nicholas bell giving welco...,"""7759_3""",3,0
3,must assumed praised film greatest filmed oper...,"""3630_4""",4,0
4,superbly trashy wondrously unpretentious explo...,"""9495_8""",8,1


In [29]:
clean_df.to_csv('clean_labeledTrainData.csv', encoding='utf-8')

In [31]:
clean_test_df = pd.DataFrame(clean_test_reviews, columns=['review'])
clean_test_df['id'] = test_df.id
clean_test_df['score'] = test_df['id'].map(extract_score)
clean_test_df.head()

Unnamed: 0,review,id,score
0,naturally film main themes mortality nostalgia...,"""12311_10""",0
1,movie disaster within disaster film full great...,"""8348_2""",2
2,movie kids saw tonight child loved one point k...,"""5828_4""",4
3,afraid dark left impression several different ...,"""7186_2""",2
4,accurate depiction small time mob life filmed ...,"""12128_7""",7


In [32]:
clean_test_df.to_csv('clean_testData.csv', encoding='utf-8')

In [36]:
clean_utrain_df = pd.DataFrame(clean_utrain_reviews, columns=['review'])
clean_utrain_df['id'] = unlabeled_train_df.id
clean_utrain_df.head()

Unnamed: 0,review,id
0,watching time chasers obvious made bunch frien...,"""9999_0"""
1,saw film years ago remember particularly nasty...,"""45057_0"""
2,minor spoilersin new york joan barnard elvire ...,"""15561_0"""
3,went see film great deal excitement school dir...,"""7161_0"""
4,yes agree everyone site movie bad even call mo...,"""43971_0"""


In [37]:
clean_test_df.to_csv('clean_unlabeledTrainData.csv', encoding='utf-8')