# Bag of Words Meets Bags of Popcorn

[Kaggle Chanllenge](https://www.kaggle.com/c/word2vec-nlp-tutorial)
Use Google's Word2Vec for movie reviews

Deadline: 2019/01/05

In [1]:
import time
import re

import numpy as np
import pandas as pd
import nltk

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer 

## Import Data

In [2]:
train_df = pd.read_csv( "Data/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3 )

unlabeled_train_df = pd.read_csv( "Data/unlabeledTrainData.tsv", header=0, delimiter="\t", quoting=3 )

test_df = pd.read_csv( "Data/testData.tsv", header=0, delimiter="\t", quoting=3 )

### Preprocessing

In [17]:
def review_to_wordlist(review, remove_stopwords = False):
    
    review_text = BeautifulSoup(review).get_text()
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    words = review_text.lower().split()
    
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    
    return(words)

# Split paragraph into sentences
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def review_to_sentences(review, tokenizer, remove_stopwords = False):
    
    raw_sentences = tokenizer.tokenize(review.strip())
    
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:       # skip empty sentences
            sentences.append(review_to_wordlist(raw_sentence, remove_stopwords))
            
    return sentences

In [4]:
sentences = []

for review in train_df['review']:
    sentences += review_to_sentences(review, tokenizer)

for review in unlabeled_train_df['review']:
    sentences += review_to_sentences(review, tokenizer)

  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


## Training

In [5]:
# Parameter values
num_features = 300    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                   
downsampling = 1e-3   # Downsample setting for frequent words

In [6]:
from gensim.models import word2vec

word2vec_model = word2vec.Word2Vec(sentences, workers = num_workers, size = num_features, 
                                   min_count = min_word_count, window = context, sample = downsampling)

# If you don't plan to train the model any further, 
# calling init_sims will make the model much more memory-efficient.
word2vec_model.init_sims(replace=True)

model_name = "features300-minwords40-context10"
word2vec_model.save(model_name)

print("Number of words in the word2vec model vocabulary: %d" % len(word2vec_model.wv.vocab))

Number of words in the word2vec model vocabulary: 16490


## Clean reviews

In [34]:
clean_train_reviews = []
# Getting clean reviews from training set
counter = 0.
for review in train_df["review"]:
    clean_review = " ".join(review_to_wordlist(review))
    clean_train_reviews.append(clean_review)
    if counter % 5000. == 0.:
        print("Train review %d of %d" % (counter, len(train_df["review"])))
    counter = counter + 1.
    
clean_utrain_reviews = []
# Getting clean reviews from training set
counter = 0.
for review in unlabeled_train_df["review"]:
    clean_review = " ".join(review_to_wordlist(review))
    clean_utrain_reviews.append(clean_review)
    if counter % 5000. == 0.:
        print("Unlabeled Train review %d of %d" % (counter, len(unlabeled_train_df["review"])))
    counter = counter + 1.
    
clean_test_reviews = []
# Getting clean reviews from training set
counter = 0.
for review in test_df["review"]:
    clean_review = " ".join(review_to_wordlist(review))
    clean_test_reviews.append(clean_review)
    if counter % 5000. == 0.:
        print("Test review %d of %d" % (counter, len(test_df["review"])))
    counter = counter + 1.

Train review 0 of 25000
Train review 5000 of 25000
Train review 10000 of 25000
Train review 15000 of 25000
Train review 20000 of 25000
Unlabeled Train review 0 of 50000
Unlabeled Train review 5000 of 50000
Unlabeled Train review 10000 of 50000
Unlabeled Train review 15000 of 50000
Unlabeled Train review 20000 of 50000
Unlabeled Train review 25000 of 50000
Unlabeled Train review 30000 of 50000
Unlabeled Train review 35000 of 50000
Unlabeled Train review 40000 of 50000
Unlabeled Train review 45000 of 50000
Test review 0 of 25000
Test review 5000 of 25000
Test review 10000 of 25000
Test review 15000 of 25000
Test review 20000 of 25000


### Extract review raw score from id

In [12]:
def extract_score(id):
    return int(id[-2])

### Save cleaned data to csv

In [27]:
clean_df = pd.DataFrame(clean_train_reviews, columns=['review'])
clean_df['id'] = train_df.id
clean_df['score'] = train_df['id'].map(extract_score)
clean_df['sentiment'] = train_df.sentiment
clean_df.head()

Unnamed: 0,review,id,score,sentiment
0,with all this stuff going down at the moment w...,"""5814_8""",8,1
1,the classic war of the worlds by timothy hines...,"""2381_9""",9,1
2,the film starts with a manager nicholas bell g...,"""7759_3""",3,0
3,it must be assumed that those who praised this...,"""3630_4""",4,0
4,superbly trashy and wondrously unpretentious s...,"""9495_8""",8,1


In [28]:
clean_df.to_csv('Data/Word2Vec_clean_labeledTrainData.csv', encoding='utf-8')

In [29]:
clean_test_df = pd.DataFrame(clean_test_reviews, columns=['review'])
clean_test_df['id'] = test_df.id
clean_test_df['score'] = test_df['id'].map(extract_score)
clean_test_df.head()

Unnamed: 0,review,id,score
0,naturally in a film who s main themes are of m...,"""12311_10""",0
1,this movie is a disaster within a disaster fil...,"""8348_2""",2
2,all in all this is a movie for kids we saw it ...,"""5828_4""",4
3,afraid of the dark left me with the impression...,"""7186_2""",2
4,a very accurate depiction of small time mob li...,"""12128_7""",7


In [30]:
clean_test_df.to_csv('Data/Word2Vec_clean_testData.csv', encoding='utf-8')

In [32]:
clean_utrain_df = pd.DataFrame(clean_utrain_reviews, columns=['review'])
clean_utrain_df['id'] = unlabeled_train_df.id
clean_utrain_df.head()

Unnamed: 0,review,id
0,watching time chasers it obvious that it was m...,"""9999_0"""
1,i saw this film about years ago and remember i...,"""45057_0"""
2,minor spoilersin new york joan barnard elvire ...,"""15561_0"""
3,i went to see this film with a great deal of e...,"""7161_0"""
4,yes i agree with everyone on this site this mo...,"""43971_0"""


In [33]:
clean_test_df.to_csv('Data/Word2Vec_clean_unlabeledTrainData.csv', encoding='utf-8')