In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/reviews-cleaned/reviews_cleaned.csv


# Women's Clothing Reviews - Sentiment Analysis: Building Word2Vec

## Overall Project Goal
In this project I want to understand this dataset on women's clothing reviews, create a Word2Vec model based off of the review texts in the data set and then use this Word2Vec model to build a sentiment analysis model on the dataset which attempts to predict whether the customer left a 5-star review or not.

### Project Notebooks
This notebook is part of a series of 3 notebooks on performing sentiment analysis on a dataset of women's clothing reviews.
1. Women's Clothing Reviews - Sentiment Analysis: EDA **{ADD LINKS}**
2. **Women's Clothing Reviews - Sentiment Analysis: Building Word2Vec Model** 
3. Women's Clothing Reviews - Sentiment Analysis: Building a Sentiment Analysis Model



## Goal of this notebook: Building Word2Vec Model
In this notebook, I want to take all the text in the dataset, preprocess it and use it to build a Word2Vec model that can then be used to vectorise the data and use it in a neural network for the task of sentiment analysis.

In [2]:
# Linear algebra and data processing
import pandas as pd
import numpy as np

# Data vizualisation
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Data preprocessing
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

# Importing the model
from gensim.models import word2vec, Phrases

In [3]:
data = pd.read_csv('/kaggle/input/reviews-cleaned/reviews_cleaned.csv', index_col='Unnamed: 0')

In [4]:
data

Unnamed: 0,Title,Review Text,Five Star
0,,Absolutely wonderful - silky and sexy and comf...,0
1,,Love this dress! it's sooo pretty. i happene...,1
2,Some major design flaws,I had such high hopes for this dress and reall...,0
3,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",1
4,Flattering shirt,This shirt is very flattering to all due to th...,1
...,...,...,...
22636,Great dress for many occasions,I was very happy to snag this dress at such a ...,1
22637,Wish it was made of cotton,"It reminds me of maternity clothes. soft, stre...",0
22638,"Cute, but see through","This fit well, but the top was very see throug...",0
22639,"Very cute dress, perfect for summer parties an...",I bought this dress for a wedding i have this ...,0


## 1. Data Preprocessing

In [5]:
# importing a lemmatizer
import nltk
from nltk.stem import WordNetLemmatizer

def lemmatize(words):
    # This function takes in a list of words and returns the list
    # of words with the words in their lemmatized form
    # it priotizes correctly lemmatizing verbs
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in words]
    return lemmatized_words


In [6]:
def review_to_wordlist(review):
    
    # This is our data preprocessing function that removes HTML, removes non letters and 
    # returns the review with words split into items in a list and all in lowercase.
    
    # Remove HTML
    review = BeautifulSoup(review).get_text()
    
    # Remove non-letters
    review = re.sub("[^a-zA-Z]", " ", review)
    
    # Convert words to lowercase and split them
    review = review.lower().split()
    
    # Lemmatize the words
    review = lemmatize(review)
    
    return review

In [7]:
# Example of results of review_to_wordlist
review_to_wordlist(data['Review Text'][1])[:10]

['love', 'this', 'dress', 'it', 's', 'sooo', 'pretty', 'i', 'happen', 'to']

Word2Vec expects to recieve text in a specific format. It expects single sentences as a list of words. The input format is therefore a list of lists breaking down the text into sentences and words. We will use the tokenizer to define the different sentences in the text.

In [8]:
# Download the punkt tokenizer for sentence splitting
import nltk.data

# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [9]:
def review_to_sentences(review, tokenizer):
    # This function splits the review into passed sentences
    # and returns a list of sentences, where each sentence is a list of words
    
    # Using the tokeniser to split the reivew into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    
    
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            # Using the review_to_wordlist function to return a list of lists
            sentences.append(review_to_wordlist(raw_sentence))

    return sentences
        

In [10]:
# Replace missing values with a space character
data['Title'] = data['Title'].apply(lambda x: ' ' if pd.isna(x) else x)

# Creating a new column that contains the title and review text together
data['Total Text'] = data['Title'] + '. ' +data['Review Text'] 



In [11]:
# Defining the empty list to put the sentences in
sentences = []

# Preprocessing the reviews in `train_labeled` and appending to list
for i, review in enumerate(data['Total Text']):
    if i%5000 == 0:
        print(f'{i} reviews processed in data out of {len(data)}')
    sentences += review_to_sentences(review, tokenizer)

import warnings
warnings.filterwarnings('ignore')


0 reviews processed in data out of 22641




5000 reviews processed in data out of 22641
10000 reviews processed in data out of 22641
15000 reviews processed in data out of 22641
20000 reviews processed in data out of 22641


In [12]:
# Seeing how many sentences and words are in the corpus
num_sent = len(sentences)
num_words = sum(len(sentence) for sentence in sentences)

print(f'The dataset contains {num_sent:,} sentences and a total of {num_words:,} words')

The dataset contains 131,187 sentences and a total of 1,441,977 words


108,438 // 1,375,783 (numbers of sentences and words of review only)

## 2. Training the Word2Vec model

In [13]:
# Finding bigrams from the corpus
bigrams = Phrases(sentences=sentences)

In [14]:
# Finding trigrams from the corpus
trigrams = Phrases(sentences=bigrams[sentences])

In [15]:
# This is so we can see how the training is going
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
print(logging.getLogger().isEnabledFor(logging.INFO))

# Defining the model parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 30   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 20          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words


# Calling the model
model = word2vec.Word2Vec(
    sentences=trigrams[bigrams[sentences]], 
    workers=num_workers,
    vector_size=num_features, 
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

# Makes the model more memory efficient
model.init_sims(replace=True)

# Saving the model
model_name = "wcr_trigrams_300features_30minwords_20context" # 'wcr' stands for womens clothes reviews
model.save(model_name)

True


In [16]:
# Checking some outputs of the model
model.wv.most_similar('big')

[('large', 0.830851137638092),
 ('small', 0.775022029876709),
 ('tight', 0.7276830673217773),
 ('huge', 0.7088731527328491),
 ('snug', 0.7063732147216797),
 ('baggy', 0.6501529216766357),
 ('tight_across', 0.6367700099945068),
 ('roomy', 0.6163930296897888),
 ('loose', 0.5892308354377747),
 ('larger', 0.5643527507781982)]

# To Do:
- Read through and make sure I understand the notebook
- Organise the notebook appropiately and add comments and markdown where necessary
- Add an extension section
    - Should include the pros/ cons of adding another dataset to build the word2vec model
- Explore the resulting model a bit further
    - Look at most frequent words, least frequent words
    