In [2]:
from __future__ import absolute_import, division, print_function
import pandas as pd
# Read the review dump into a pandas DataFrame.
# lines=True asks pandas to treat the file as one JSON record per line.
data_file_name = "Health_and_Personal_Care_5.json"
raw_df = pd.read_json(path_or_buf=data_file_name, lines=True)
print("Data loaded")

Data loaded


In [3]:
# Print a summary of the DataFrame: number of entries, each column's
# non-null count and dtype, and the memory footprint. A quick way to spot
# columns with missing values before processing the text.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 346355 entries, 0 to 346354
Data columns (total 9 columns):
asin              346355 non-null object
helpful           346355 non-null object
overall           346355 non-null int64
reviewText        346355 non-null object
reviewTime        346355 non-null object
reviewerID        346355 non-null object
reviewerName      343304 non-null object
summary           346355 non-null object
unixReviewTime    346355 non-null int64
dtypes: int64(2), object(7)
memory usage: 26.4+ MB


In [4]:
# Concatenate every review into one long unicode string (each review is
# followed by a single space) and report how many characters it contains
raw_corpus = u''.join(review_text + " " for review_text in raw_df['reviewText'])
corpus_char_count = len(raw_corpus)
print("Raw Corpus contains {0:,} characters".format(corpus_char_count))

Raw Corpus contains 178,581,273 characters


In [5]:
# import natural language toolkit
import nltk
# Make sure the punkt sentence-tokenizer data is present in the local
# nltk_data directory (nltk.download reports when it is already up to date)
nltk.download("punkt")
print("The punkt tokenizer is downloaded")

[nltk_data] Downloading package punkt to /Users/MacBook/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
The punkt tokenizer is downloaded


In [6]:
# Load the pre-trained English punkt sentence tokenizer from the NLTK data
punkt_resource = "tokenizers/punkt/english.pickle"
tokenizer = nltk.data.load(punkt_resource)
print("The punkt tokenizer is loaded")
# Split the single corpus string into a list of raw sentence strings
raw_sentences = tokenizer.tokenize(raw_corpus)
raw_sentence_count = len(raw_sentences)
print("We have {0:,} raw sentences".format(raw_sentence_count))

The punkt tokenizer is loaded
We have 1,824,643 raw sentences


In [7]:
import re
# Matches any run of characters that are not ASCII letters. Compiled once here
# rather than inside the function, which is called once per sentence.
_NON_LETTER_RUN = re.compile("[^A-Za-z]+")


def clean_and_split_str(string):
    """Clean a sentence and split it into a list of alphabetic words.

    Every run of characters other than A-Z / a-z (digits, punctuation,
    whitespace, non-ASCII characters) is replaced by a single space and the
    result is split on whitespace. Case is preserved, and a word such as
    "don't" therefore yields the two tokens "don" and "t".

    Parameters
    ----------
    string : str
        The raw sentence text.

    Returns
    -------
    list of str
        The alphabetic tokens in their original order; an empty list when
        the input contains no ASCII letters.
    """
    # split() with no argument already discards leading/trailing whitespace,
    # so no explicit strip() is required.
    return _NON_LETTER_RUN.sub(" ", string).split()

In [8]:
# Turn every non-empty raw sentence into its list of cleaned words.
# Positions are kept aligned with raw_sentences as long as none is empty.
sentences = [
    clean_and_split_str(raw_sentence)
    for raw_sentence in raw_sentences
    if raw_sentence
]
print("We have {0:,} clean sentences".format(len(sentences)))

We have 1,824,643 clean sentences


In [9]:
# Show one sentence before and after cleaning, separated by a blank line
sample_index = 30
print(raw_sentences[sample_index])
print("")
print(sentences[sample_index])

I use this magnifier to inspect seeds and leaves.

[u'I', u'use', u'this', u'magnifier', u'to', u'inspect', u'seeds', u'and', u'leaves']


In [10]:
# Total number of word tokens across all cleaned sentences
token_count = sum(map(len, sentences))
print("The dataset corpus contains {0:,} tokens".format(token_count))

The dataset corpus contains 33,476,197 tokens


In [12]:
import multiprocessing

# Dimensionality of the resulting word vectors (passed to Word2Vec as `size`)
num_features = 300

# Minimum word count threshold (passed to Word2Vec as `min_count`)
min_word_count = 3

# Number of threads to run in parallel: one per CPU reported by the OS
num_workers = multiprocessing.cpu_count()

# Context window length (passed to Word2Vec as `window`)
context_size = 7

# Seed for the RNG used by Word2Vec.
# NOTE(review): the gensim documentation states that a fully deterministic run
# also requires workers=1; with num_workers > 1 results may still vary
# between runs -- confirm whether exact reproducibility is needed.
seed = 1

In [13]:
import gensim

# Configure an untrained Word2Vec model from the hyper-parameters defined
# above; vocabulary building and training are done in later cells.
# NOTE(review): `size` is the pre-4.0 gensim spelling; gensim >= 4.0 names
# this parameter `vector_size` -- confirm the installed version before upgrading.
word2vec_settings = dict(
    sg=1,  # per the gensim docs, 1 selects skip-gram rather than CBOW
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
)
word2vec_model = gensim.models.word2vec.Word2Vec(**word2vec_settings)

In [14]:
# Scan the tokenized sentences to build the model's vocabulary
word2vec_model.build_vocab(sentences)
print("The vocabulary is built")
vocabulary_size = len(word2vec_model.vocab)
print("Word2Vec vocabulary length: ", vocabulary_size)

The vocabulary is built
Word2Vec vocabulary length:  62972


In [15]:
# Fit the word vectors on the tokenized sentences.
# NOTE(review): calling train() with only the sentences matches the older
# gensim API; newer gensim releases require explicit total_examples and
# epochs arguments -- confirm the installed version before upgrading.
word2vec_model.train(sentences)
print("Training finished")

Training finished


In [16]:
# Persist the trained model to disk so it can be reloaded without retraining
model_file_name = "word2vec_model_trained_on_Health_and_Personal_Care_5.w2v"
word2vec_model.save(model_file_name)
print("Model saved")

Model saved
