## Code pour préparer les train/valid/test sets (poids égaux par classe de rating)
* Importer et buncher tous les produits
* Séparer en train/valid/test avec strate (sur rating, main_category ou sur categories??)
* Filtrer les valeurs aberrantes
* Enregistrer les données en 3 fichiers séparés

In [2]:
### Packages de base
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

### Importer et buncher tous les produits


In [3]:
## Loader les données
appliances = pd.read_csv('./../data/appliances.csv', )
cds_and_vinyl = pd.read_csv('./../data/cds_and_vinyl.csv')
#digital_music = pd.read_csv('./../data/digital_music.csv')
gift_cards = pd.read_csv('./../data/gift_cards.csv')
handmade_products = pd.read_csv('./../data/handmade_products.csv')
musical_instruments = pd.read_csv('./../data/musical_instruments.csv')
video_games = pd.read_csv('./../data/video_games.csv')


In [4]:
## Buncher ensemble
df_full = pd.concat([appliances, cds_and_vinyl, gift_cards, handmade_products, musical_instruments, video_games]) #, digital_music

In [5]:
## Ajouter cvolonne ID
df_full = df_full.drop(df_full.columns[0], axis=1)
df_full['ID'] = range(1, len(df_full)+1)

In [36]:
print(df_full.shape)
print(df_full.columns)

(9676194, 13)
Index(['parent_asin', 'rating', 'full_text', 'as_image', 'helpful_vote',
       'as_helpful_vote', 'verified_purchase', 'main_category',
       'average_rating', 'rating_number', 'price', 'categories_grp', 'ID'],
      dtype='object')


### Créer jeu de train
1 échantillon par modèle

#### Répliquer les splits précédent pour s'assurer que les données utilisées pour le train ne soient pas dans le test final

In [7]:
## Test set
train_temp, test = train_test_split(df_full, test_size=250000, random_state=3355, stratify=df_full['rating'])

In [9]:
## Valid sets
train, valid_temp = train_test_split(train_temp, test_size=300000, random_state=9999, stratify=train_temp['rating'])

#### Créer 5 jeux (1 par classe)

In [None]:
## Séparer par rating
train_1 = train[train['rating'] == 1]
train_2 = train[train['rating'] == 2]
train_3 = train[train['rating'] == 3]
train_4 = train[train['rating'] == 4]
train_5 = train[train['rating'] == 5]

In [None]:
## Séparer en XGBoost et Llama
train_1_xgb, train_1_llama = train_test_split(train_1, test_size=1000, random_state=1111, stratify=train_1['categories_grp'])
train_2_xgb, train_2_llama = train_test_split(train_2, test_size=1000, random_state=2222, stratify=train_2['categories_grp'])
train_3_xgb, train_3_llama = train_test_split(train_3, test_size=1000, random_state=3333, stratify=train_3['categories_grp'])
train_4_xgb, train_4_llama = train_test_split(train_4, test_size=1000, random_state=4444, stratify=train_4['categories_grp'])
train_5_xgb, train_5_llama = train_test_split(train_5, test_size=1000, random_state=5555, stratify=train_5['categories_grp'])

In [32]:
## 1 jeu par modèle
train_xgb = pd.concat([train_1_xgb, train_2_xgb, train_3_xgb, train_4_xgb, train_5_xgb])
train_xgb = train_xgb.sample(n=500000, replace=False, random_state=12345)

train_llama = pd.concat([train_1_llama, train_2_llama, train_3_llama, train_4_llama, train_5_llama])

In [33]:
### Assurance qualité
## XGB
print(train_xgb.shape)
print(train_xgb.columns)

print("-- rating --")
print(train_xgb['rating'].describe())
print("-- average_rating --")
print(train_xgb['average_rating'].describe())
print("-- rating_number --")
print(train_xgb['rating_number'].describe())

print("-- as_image --")
print(train_xgb['as_image'].describe())
print("-- as_helpful_vote --")
print(train_xgb['as_helpful_vote'].describe())

print("-- price --")
print(train_xgb['price'].describe())

print("-- main_category --")
print(train_xgb['main_category'].value_counts())
print("-- categories_grp --")
print(train_xgb['categories_grp'].value_counts())
print("-- verified_purchase --")
print(train_xgb['verified_purchase'].value_counts())

## Llama
print(train_llama.shape)
print(train_llama.columns)

print("-- rating --")
print(train_llama['rating'].describe())

(500000, 13)
Index(['parent_asin', 'rating', 'full_text', 'as_image', 'helpful_vote',
       'as_helpful_vote', 'verified_purchase', 'main_category',
       'average_rating', 'rating_number', 'price', 'categories_grp', 'ID'],
      dtype='object')
-- rating --
count    500000.000000
mean          4.354076
std           1.198334
min           1.000000
25%           4.000000
50%           5.000000
75%           5.000000
max           5.000000
Name: rating, dtype: float64
-- average_rating --
count    500000.000000
mean          4.513341
std           0.337004
min           1.000000
25%           4.400000
50%           4.600000
75%           4.700000
max           5.000000
Name: average_rating, dtype: float64
-- rating_number --
count    500000.000000
mean       3807.663646
std       15297.826360
min           1.000000
25%          84.000000
50%         417.000000
75%        1700.000000
max      261278.000000
Name: rating_number, dtype: float64
-- as_image --
count    500000.000000
mean  

In [37]:
## Conserver les variables importantes
train_llama_final = train_llama[['ID', 'rating', 'full_text']]
train_xgb_final = train_xgb[['ID', 'parent_asin', 'rating', 'full_text', 'as_image', 'helpful_vote',
                             'as_helpful_vote', 'verified_purchase', 'main_category',
                             'average_rating', 'rating_number', 'price', 'categories_grp']]

In [40]:
## Filtrer avant de saver
train_xgb_final = train_xgb_final[train_xgb_final['rating_number'] <= 25000]
train_xgb_final = train_xgb_final[train_xgb_final['helpful_vote'] <= 10]
train_xgb_final = train_xgb_final[train_xgb_final['price'] <= 500]

In [41]:
train_xgb_final.shape

(468416, 13)

In [42]:
## Enregistrer les échantillons de train (rating égaux en poids)
#train_llama_final.to_csv('./../data/train_llama_equal_weights.csv', index=False)
train_xgb_final.to_csv('./../data/train_xgb_equal_weights.csv', index=False)