# Amazon Fine Food Reviews

In [None]:
import pandas as pd
import spacy
from gensim.models import Word2Vec

## Loading data

In [None]:
from datasets import load_dataset

# Identifier of the Amazon food reviews dataset handed to `load_dataset`.
dataset_name = "jhan21/amazon-food-reviews-dataset"
dataset = load_dataset(dataset_name)

In [None]:
# Ask the dataset to return pandas objects, then pull every row of the
# 'train' split into `df`.
dataset.set_format(type='pandas')
train_split = dataset['train']
df = train_split[:]

# Bare expression: the notebook renders the frame as the cell output.
df

In [None]:
# Load the English model.
# Only tokenization, sentence boundaries (from the parser) and the lexical
# flags is_stop / is_punct are used in this notebook, so the named-entity
# recognizer and the lemmatizer are disabled to avoid running them on every
# review. They stay loaded and can be switched back on with
# `nlp.enable_pipe(...)` if needed.
nlp = spacy.load("en_core_web_md", disable=["ner", "lemmatizer"])

# Function to tokenize and lowercase sentences
def tokenize_and_lowercase(x, column_name):
    """Split one record's text into sentences of lowercased content tokens.

    Args:
        x: Row-like object (e.g. a DataFrame row) indexable by ``column_name``.
        column_name: Key of the field in ``x`` that holds the raw text.

    Returns:
        A list with one entry per sentence detected by the spaCy pipeline.
        Each entry is a list of lowercased token strings with stop words,
        punctuation and whitespace-only tokens removed. A non-string value
        (e.g. ``None`` or ``NaN``) yields an empty list.
    """
    text = x[column_name]

    # Missing values cannot be parsed by spaCy; treat them as "no sentences"
    # instead of aborting the whole DataFrame.apply run.
    if not isinstance(text, str):
        return []

    doc = nlp(text)
    return [
        [
            token.text.lower()
            for token in sent
            # Whitespace tokens (newlines, runs of spaces) are neither stop
            # words nor punctuation, so they must be filtered explicitly or
            # they end up in the Word2Vec vocabulary.
            if not (token.is_stop or token.is_punct or token.is_space)
        ]
        for sent in doc.sents
    ]


In [None]:
# Tokenize and lowercase the text and create list of tokenized sentences.
# A fixed seed keeps the 50% subsample identical across runs, so the saved
# parquet file (and anything built from it) can be regenerated.
df = df.sample(frac=0.5, random_state=42).copy()
df['tokenized'] = df.apply(lambda row: tokenize_and_lowercase(row, column_name='Text'), axis=1)

# Save the tokenized data
df.to_parquet('amazon-food-review-tokenized.parquet', engine='pyarrow')

In [None]:
# Read the tokenized reviews back from disk
df = pd.read_parquet('amazon-food-review-tokenized.parquet', engine='pyarrow')

# Each row of 'tokenized' holds a sequence of sentences, each itself a
# sequence of tokens; collect every sentence as a plain Python list so the
# result is one flat list of token lists.
list_of_sentences = []
for review_sentences in df['tokenized'].tolist():
    for sentence_tokens in review_sentences:
        list_of_sentences.append(list(sentence_tokens))

In [None]:
# Train Word2Vec on the flattened sentences (sg=1 selects skip-gram)
model_food = Word2Vec(
    sentences=list_of_sentences,
    vector_size=100,
    window=5,
    min_count=5,
    workers=4,
    epochs=5,
    sg=1,
)

# Persist the trained model to disk
model_food.save("word2vec_food_review.model")

In [None]:
# Optionally reload the saved word2vec model instead of retraining
#model_food = Word2Vec.load("word2vec_food_review.model")

# Example usage: look up the 5 nearest neighbours of a query word in the
# trained embedding space and print them.
query_word = 'disgusting'
similar_words = model_food.wv.most_similar(query_word, topn=5)
print(similar_words)