In [1]:
import pandas as pd
import numpy as np
import json 
import gensim.downloader as api
from unidecode import unidecode
from googletrans import Translator

In [2]:
# Read the training reviews from disk and preview the first three rows.
df = pd.read_csv(filepath_or_buffer='data/train-1.csv')
df.iloc[:3]

Unnamed: 0.1,Unnamed: 0,product_id,product_parent,product_title,vine,verified_purchase,review_headline,review_body,review_date,marketplace_id,product_category_id,label
0,9,B001N2MZT8,903886718,Green Zone [DVD],N,Y,green zone,I found at first it was a little difficult to ...,2010-11-15,1,3,False
1,11,B00GCBVE0Q,282740618,Le secret de Green Knowe,N,Y,,J'ai aimé cette histoire. Les acteurs - et sur...,2014-11-23,2,3,False
2,19,1423165691,883799517,A Disney Sketchbook.,N,N,okay mais...,est-ce une coincidence que la plupart des prin...,2012-12-22,0,0,False


In [3]:
# One-hot encode 'marketplace_id' and replace the original column with the
# resulting 'marketplace_<id>' indicator columns.
# The guard makes this cell safe to re-run: once 'marketplace_id' has been
# replaced there is nothing left to encode, so a second run is a no-op
# instead of a KeyError.
if 'marketplace_id' in df.columns:
    # dtype is pinned so the indicators are 0/1 integers regardless of the
    # pandas version (newer releases default to booleans).
    dummy_df = pd.get_dummies(df['marketplace_id'], prefix='marketplace', dtype=np.uint8)

    # Drop the source column without mutating in place, then append the
    # indicator columns on the right.
    df = pd.concat([df.drop(columns='marketplace_id'), dummy_df], axis=1)

In [4]:
# Load the category definitions. JSON text is UTF-8, so the encoding is stated
# explicitly instead of relying on the platform's locale default.
with open('data/category.json', 'r', encoding='utf-8') as file:
    category_data = json.load(file)

# Create a dictionary to map category IDs to category names
category_mapping = {category['id']: category['name'] for category in category_data}

# Map the integers in 'product_category_id' to category names.
# Ids that are missing from category_mapping come out as NaN.
df['product_category_name'] = df['product_category_id'].map(category_mapping)

In [5]:
# Loading the pretrained "glove-twitter-25" vectors is slow (roughly 35 seconds).
model_glove_twitter = api.load(name="glove-twitter-25")

In [6]:
def get_average_word_embedding(category, word2vec_model):
    """Return the mean embedding of the words that make up a category name.

    The name is split on whitespace and underscores and lower-cased. Every
    token present in ``word2vec_model`` contributes its vector to the mean;
    tokens missing from the model are reported with ``print`` and skipped.

    Args:
        category: Category name such as ``"Mobile_Apps"``. Non-string values
            (``None``, or the NaN produced when a category id has no name)
            are treated as containing no words.
        word2vec_model: Object supporting ``word in model``, ``model[word]``
            and exposing a ``vector_size`` attribute.

    Returns:
        numpy.ndarray: Element-wise mean of the vectors that were found, or a
        zero vector of length ``word2vec_model.vector_size`` when no token
        was found (or ``category`` is not a string).
    """
    # A missing category name arrives as NaN/None; calling .replace() on it
    # would raise AttributeError, so fall back to the "no words" result.
    if not isinstance(category, str):
        return np.zeros(word2vec_model.vector_size)

    # Splitting by both spaces and underscores
    words = [token.lower() for token in category.replace('_', ' ').split()]

    embeddings = []
    for word in words:
        if word in word2vec_model:
            embeddings.append(word2vec_model[word])
        else:
            print(f"Word not found in the model's vocabulary: {word}")

    if embeddings:
        return np.mean(embeddings, axis=0)
    # Return a zero vector if none of the words were in the model's vocabulary
    return np.zeros(word2vec_model.vector_size)

# Embed each row's category name with the GloVe model; the model is handed to
# the helper as its second positional argument.
df['product_category_embed'] = df['product_category_name'].apply(
    get_average_word_embedding, args=(model_glove_twitter,)
)

In [8]:
# Transliterate every review body with unidecode (e.g. "aimé" -> "aime").
df['review_body_decoded'] = df['review_body'].map(unidecode)

In [26]:
# Preview the first five rows, including the newly derived columns.
df.head()

Unnamed: 0.1,Unnamed: 0,product_id,product_parent,product_title,vine,verified_purchase,review_headline,review_body,review_date,product_category_id,label,marketplace_0,marketplace_1,marketplace_2,marketplace_3,product_category_name,product_category_embed,review_body_decoded
0,9,B001N2MZT8,903886718,Green Zone [DVD],N,Y,green zone,I found at first it was a little difficult to ...,2010-11-15,3,False,0,1,0,0,Video DVD,"[0.63741004, -0.15597649, 1.01626, -0.25933, 0...",I found at first it was a little difficult to ...
1,11,B00GCBVE0Q,282740618,Le secret de Green Knowe,N,Y,,J'ai aimé cette histoire. Les acteurs - et sur...,2014-11-23,3,False,0,0,1,0,Video DVD,"[0.63741004, -0.15597649, 1.01626, -0.25933, 0...",J'ai aime cette histoire. Les acteurs - et sur...
2,19,1423165691,883799517,A Disney Sketchbook.,N,N,okay mais...,est-ce une coincidence que la plupart des prin...,2012-12-22,0,False,1,0,0,0,Books,"[0.64268, 0.045608, 1.0344, -0.2208, 0.73695, ...",est-ce une coincidence que la plupart des prin...
3,33,0061091480,623343977,Your Erroneous Zones,N,N,Arrogant,Wáyné Dyér is á pớpúlár áméricán pérsớnál grớw...,2009-07-21,0,True,1,0,0,0,Books,"[0.64268, 0.045608, 1.0344, -0.2208, 0.73695, ...",Wayne Dyer is a popular american personal grow...
4,34,B00HZ4CYOY,647510225,König der Mathematik Junior,N,Y,Tớllé Máthé Ápp...,.....unsere Kids mögen diese Art des Lernens. ...,2015-06-01,1,False,1,0,0,0,Mobile_Apps,"[1.4242, 0.481895, -0.3044305, -0.924075, 1.27...",.....unsere Kids mogen diese Art des Lernens. ...


In [23]:
# Function to translate text using Google Translate API
def translate_text(text, translator=None):
    """Translate ``text`` with googletrans and return the translated string.

    Args:
        text: The string to translate. Non-string values (``None``, NaN, ...)
            are not sent to the service.
        translator: Optional object exposing ``translate(text)`` whose result
            has a ``.text`` attribute. When omitted, one ``Translator`` is
            created on first use and reused by later calls, so applying this
            function to a whole column does not build a new client per row.

    Returns:
        The translated text, or ``np.nan`` when ``text`` is not a string or
        the translator raises ``TypeError``.

    Note:
        Other errors (for example a network read timeout) are not caught and
        propagate to the caller.
    """
    # Missing values cannot be translated; skip the network round-trip.
    if not isinstance(text, str):
        return np.nan

    if translator is None:
        # Lazily create one shared client and cache it on the function object.
        translator = getattr(translate_text, "_shared_translator", None)
        if translator is None:
            translator = Translator()
            translate_text._shared_translator = translator

    try:
        return translator.translate(text).text
    except TypeError:
        return np.nan

In [18]:
# Demonstration on row 3: translating the raw accented text gives garbage,
# while translating the unidecode'd text works, so decode before translating.
raw_review = df.loc[3, 'review_body']
print(raw_review)
print(translate_text(raw_review))
translate_text(df.loc[3, 'review_body_decoded'])

Wáyné Dyér is á pớpúlár áméricán pérsớnál grớwth áúthớr ánd Yớúr Érrớnéớús Zớnés wás his bréák thrớúgh bớớk. Pérsớnálly I dớn't knớw why hé bécámé sớ pớpúlár, I think his stylé is kind ớf árrớgánt ánd pátrớnizing ánd missés émpáthy fớr his réádérs. Máybé it's júst mé prớjécting ớn him, bút pérsớnálly I dớn't liké his vibés.
Wáyné Dyér also also at Périkán Pérrasớ at Grớwth Árethớr Ánd Yớúr Árrớné ớnés wás the brék thrớugh bớớk.Pérsớnály I dớn't knớw Why Hey Vilámé sớ pớpúlár, I think his stylé is kind ớf ride ánd Pátrớnizing Ánd émpáthy fớr his réádérs.Maybé's it's jsst mé prớjécting ớn hym, furnace pérsớnály i dớn't liké the vibés.


"Wayne Dyer is a popular american personal growth author and Your Erroneous Zones was his break through book. Personally I don't know why he became so popular, I think his style is kind of arrogant and patronizing and misses empathy for his readers. Maybe it's just me projecting on him, but personally I don't like his vibes."

In [24]:
# Disabled for now: translating the whole column in one pass sends too much text and the request times out (ReadTimeout).
# df['review_body_translated'] = df['review_body_decoded'].apply(translate_text)

ReadTimeout: The read operation timed out