## Train a Word2Vec model

In [9]:
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import gutenberg
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

In [2]:
import pandas as pd

# Define the path to your text document
file_path = "foods.txt"

# Initialize lists to store data (one list per DataFrame column)
product_ids = []
user_ids = []
profile_names = []
helpfulness = []
scores = []
times = []
summaries = []
texts = []

# Column name -> list collecting that column's values. The column names are
# also the keys that appear before ": " on each line of the input file.
data = {
    "product/productId": product_ids,
    "review/userId": user_ids,
    "review/profileName": profile_names,
    "review/helpfulness": helpfulness,
    "review/score": scores,
    "review/time": times,
    "review/summary": summaries,
    "review/text": texts
}


def _store_entry(entry):
    """Append one parsed review to every column list.

    Fields that are missing from ``entry`` are stored as "" so that all
    column lists keep the same length.
    """
    for column, values in data.items():
        values.append(entry.get(column, ""))


# Read the text document line by line (no need to hold the whole file in
# memory) and extract the information for each entry. Entries are blocks of
# "key: value" lines separated by blank lines.
# NOTE(review): the encoding used to be the platform default; UTF-8 with
# undecodable bytes replaced is assumed here so the cell behaves the same on
# every machine -- confirm against the actual file.
entry = {}
with open(file_path, "r", encoding="utf-8", errors="replace") as file:
    for line in file:
        line = line.strip()
        if line:
            # Split on the first ": " only, the value itself may contain ": ".
            # Lines without ": " are ignored.
            if ": " in line:
                key, value = line.split(": ", 1)
                entry[key] = value
        elif entry:
            # A blank line closes the current entry. Blank lines with no
            # pending entry (leading or repeated blanks) must not create
            # empty rows.
            _store_entry(entry)
            entry = {}

# The final entry is only followed by a blank line if the file ends with one;
# store it here so the last review is never dropped.
if entry:
    _store_entry(entry)
    entry = {}

# Create a DataFrame from the extracted data
df = pd.DataFrame(data)

Unnamed: 0,product/productId,review/userId,review/profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1/1,5.0,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0/0,1.0,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1/1,4.0,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,B000UA0QIQ,A395BORC6FGVXV,Karl,3/3,2.0,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0/0,5.0,1350777600,Great taffy,Great taffy at a great price. There was a wid...
...,...,...,...,...,...,...,...,...
568449,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0/0,5.0,1299628800,Will not do without,Great for sesame chicken..this is a good if no...
568450,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0/0,2.0,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...
568451,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2/2,5.0,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o..."
568452,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1/1,5.0,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...


In [6]:
# Pull out the review body column; the bare name on the last line displays it.
reviewText = df.loc[:, "review/text"]
reviewText

0         I have bought several of the Vitality canned d...
1         Product arrived labeled as Jumbo Salted Peanut...
2         This is a confection that has been around a fe...
3         If you are looking for the secret ingredient i...
4         Great taffy at a great price.  There was a wid...
                                ...                        
568449    Great for sesame chicken..this is a good if no...
568450    I'm disappointed with the flavor. The chocolat...
568451    These stars are small, so you can give 10-15 o...
568452    These are the BEST treats for training and rew...
568453    I am very satisfied ,product is as advertised,...
Name: review/text, Length: 568454, dtype: object

In [7]:
# load corpus: the review bodies are the corpus, `sents` collects one token
# list per review
corpus, sents = reviewText, []

In [10]:
print("Sentences prior to pre-processing:")
print(corpus[:10])

# tokenising, normalising, cleaning and lowercase sentences
# `sents` is rebuilt from scratch here instead of appended to, so running this
# cell more than once cannot duplicate the tokenised reviews.
# str() guards against non-string values in the corpus.
sents = [gensim.utils.simple_preprocess(str(sent)) for sent in corpus]

print("\nSentences after pre-processing:")
print(sents[:10])

Sentences prior to pre-processing:
0    I have bought several of the Vitality canned d...
1    Product arrived labeled as Jumbo Salted Peanut...
2    This is a confection that has been around a fe...
3    If you are looking for the secret ingredient i...
4    Great taffy at a great price.  There was a wid...
5    I got a wild hair for taffy and ordered this f...
6    This saltwater taffy had great flavors and was...
7    This taffy is so good.  It is very soft and ch...
8    Right now I'm mostly just sprouting this so my...
9    This is a very healthy dog food. Good for thei...
Name: review/text, dtype: object

Sentences after pre-processing:
[['have', 'bought', 'several', 'of', 'the', 'vitality', 'canned', 'dog', 'food', 'products', 'and', 'have', 'found', 'them', 'all', 'to', 'be', 'of', 'good', 'quality', 'the', 'product', 'looks', 'more', 'like', 'stew', 'than', 'processed', 'meat', 'and', 'it', 'smells', 'better', 'my', 'labrador', 'is', 'finicky', 'and', 'she', 'appreciates', 'th

In [11]:
# model parameters (these names are reused when the trained model is saved)
corpusName = "amazonFoodsReview"
windowsCount = 10
minCount = 2
workersCount = 6

# define Word2Vec model; no corpus is passed to the constructor, so the
# vocabulary is built explicitly in a separate step below
w2v_params = {
    "window": windowsCount,
    "min_count": minCount,
    "workers": workersCount,
}
model = gensim.models.Word2Vec(**w2v_params)

# build the vocabulary from the tokenised reviews
model.build_vocab(sents, progress_per=1000)

In [12]:
# train the model on the tokenised reviews, reusing the example count and
# epoch count stored on the model itself
model.train(
    sents,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

# save the model; the file name records the corpus and the parameters used
model_file = f"{corpusName}_W2V_{windowsCount}_{minCount}_{workersCount}"
model.save(model_file)

## Evaluate the model

In [19]:
# Sanity check: similarity score of two words expected to be near-synonyms
model.wv.similarity(w1="horrendous", w2="horrible")

0.7497167

In [20]:
# Ten nearest neighbours of "good". Note that the antonym "bad" also ranks
# highly -- presumably because both words are used in similar contexts.
model.wv.similar_by_word(word="good", topn=10)

[('decent', 0.7581295371055603),
 ('great', 0.7506219744682312),
 ('terrific', 0.6600396037101746),
 ('fantastic', 0.6587141156196594),
 ('bad', 0.6501237750053406),
 ('tasty', 0.62489253282547),
 ('nice', 0.6247112154960632),
 ('yummy', 0.5859358906745911),
 ('phenomenal', 0.58575040102005),
 ('awesome', 0.5789355635643005)]