## Code pour préparer les train/valid/test sets
* Importer et buncher tous les produits
* Séparer en train/valid/test avec stratification sur rating (la répartition de main_category et de categories_grp est seulement vérifiée après coup)
* Filtrer les valeurs aberrantes
* Enregistrer les données en 3 fichiers séparés

In [103]:
### Packages de base
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

### Importer et buncher tous les produits


In [104]:
## Load the per-category CSV files
DATA_DIR = './../data'

appliances = pd.read_csv(f'{DATA_DIR}/appliances.csv')
cds_and_vinyl = pd.read_csv(f'{DATA_DIR}/cds_and_vinyl.csv')
# digital_music is currently not loaded (kept here, commented out, for reference)
#digital_music = pd.read_csv(f'{DATA_DIR}/digital_music.csv')
gift_cards = pd.read_csv(f'{DATA_DIR}/gift_cards.csv')
handmade_products = pd.read_csv(f'{DATA_DIR}/handmade_products.csv')
musical_instruments = pd.read_csv(f'{DATA_DIR}/musical_instruments.csv')
video_games = pd.read_csv(f'{DATA_DIR}/video_games.csv')


In [105]:
## Stack all loaded category frames into one DataFrame
# digital_music is not part of the list because it is not loaded.
category_frames = [
    appliances,
    cds_and_vinyl,
    gift_cards,
    handmade_products,
    musical_instruments,
    video_games,
]
df_full = pd.concat(category_frames)

In [106]:
## Add an ID column
# The drop below is positional, so running this cell a second time would
# silently remove a real data column. Guard on 'ID': once it exists, the cell
# has already been applied to this df_full and must not run again.
if 'ID' not in df_full.columns:
    # Remove the first column by position.
    # NOTE(review): presumably the index column saved with the CSVs - confirm.
    df_full = df_full.drop(columns=df_full.columns[0])
    # Sequential 1-based identifier over the concatenated rows
    df_full['ID'] = range(1, len(df_full) + 1)

In [52]:
# Dimensions and column names of the full dataset, one per line
print(df_full.shape, df_full.columns, sep='\n')

(9676194, 13)
Index(['parent_asin', 'rating', 'full_text', 'as_image', 'helpful_vote',
       'as_helpful_vote', 'verified_purchase', 'main_category',
       'average_rating', 'rating_number', 'price', 'categories_grp', 'ID'],
      dtype='object')


In [53]:
# Row counts per main_category in the full dataset
full_main_category_counts = df_full['main_category'].value_counts()
full_main_category_counts

Digital Music               4210454
Video Games                 2078246
Musical Instruments         1788145
Tools & Home Improvement     610495
Handmade                     496372
Appliances                   413824
Gift Cards                    78658
Name: main_category, dtype: int64

In [54]:
# Row counts per categories_grp in the full dataset
full_categories_grp_counts = df_full['categories_grp'].value_counts()
full_categories_grp_counts

Other                     3534024
Games                     1138254
Pop                        958156
Accessories                939992
Instrument Accessories     780629
Country                    653421
Rock                       646734
Classical                  477624
Looks                      218236
Home & Kitchen             206643
Parts & Accessories         90361
Clothing                    12658
Restaurants                 11071
Specialty Cards              5433
Office-Gaming                2958
Name: categories_grp, dtype: int64

In [55]:
# Row counts per rating value in the full dataset
full_rating_counts = df_full['rating'].value_counts()
full_rating_counts

5.0    6807164
4.0    1232124
1.0     706686
3.0     581437
2.0     348783
Name: rating, dtype: int64

### Séparer en train/valid/test split

#### Test
Cet échantillon est réservé au test final (jamais vu auparavant par aucun des 2 modèles) [250 000]

In [107]:
# Hold out 250,000 rows as the final test set, stratified on rating;
# the remaining rows stay in train_temp for the later valid/train splits.
train_temp, test = train_test_split(
    df_full,
    test_size=250000,
    stratify=df_full['rating'],
    random_state=3355,
)

In [80]:
# Row counts per main_category in the test set
test_main_category_counts = test['main_category'].value_counts()
test_main_category_counts

Digital Music               108832
Video Games                  53810
Musical Instruments          45918
Tools & Home Improvement     15842
Handmade                     12864
Appliances                   10642
Gift Cards                    2092
Name: main_category, dtype: int64

In [81]:
# Row counts per categories_grp in the test set
test_categories_grp_counts = test['categories_grp'].value_counts()
test_categories_grp_counts

Other                     91435
Games                     29341
Pop                       24671
Accessories               24469
Instrument Accessories    19901
Country                   16892
Rock                      16656
Classical                 12416
Looks                      5565
Home & Kitchen             5351
Parts & Accessories        2438
Clothing                    357
Restaurants                 280
Specialty Cards             149
Office-Gaming                79
Name: categories_grp, dtype: int64

In [82]:
# Row counts per rating value in the test set
test_rating_counts = test['rating'].value_counts()
test_rating_counts

5.0    175874
4.0     31834
1.0     18258
3.0     15022
2.0      9012
Name: rating, dtype: int64

In [83]:
### Quality assurance
print(test.columns)

# Columns summarised with describe(), in the order they are reported
for column in ('rating', 'average_rating', 'rating_number',
               'as_image', 'as_helpful_vote', 'price'):
    print(f"-- {column} --")
    print(test[column].describe())

# Columns summarised with a frequency table
for column in ('main_category', 'categories_grp', 'verified_purchase'):
    print(f"-- {column} --")
    print(test[column].value_counts())

Index(['parent_asin', 'rating', 'full_text', 'as_image', 'helpful_vote',
       'as_helpful_vote', 'verified_purchase', 'main_category',
       'average_rating', 'rating_number', 'price', 'categories_grp', 'ID'],
      dtype='object')
-- rating --
count    250000.000000
mean          4.352216
std           1.200422
min           1.000000
25%           4.000000
50%           5.000000
75%           5.000000
max           5.000000
Name: rating, dtype: float64
-- average_rating --
count    250000.000000
mean          4.512680
std           0.337397
min           1.000000
25%           4.400000
50%           4.600000
75%           4.700000
max           5.000000
Name: average_rating, dtype: float64
-- rating_number --
count    250000.000000
mean       3820.423228
std       15264.747721
min           1.000000
25%          85.000000
50%         417.000000
75%        1691.000000
max      261278.000000
Name: rating_number, dtype: float64
-- as_image --
count    250000.000000
mean          0.030

In [92]:
## Keep the columns of interest, with ID moved to the front
test_columns = [
    'ID', 'parent_asin', 'rating', 'full_text', 'as_image', 'helpful_vote',
    'as_helpful_vote', 'verified_purchase', 'main_category',
    'average_rating', 'rating_number', 'price', 'categories_grp',
]
test_final = test[test_columns]

In [93]:
## Save the test sample (the DataFrame index is not written)
test_path = './../data/test.csv'
test_final.to_csv(test_path, index=False)

#### Valid
1 échantillon de valid par modèle [150 000]

In [85]:
# Dimensions and column names of what remains after the test hold-out
print(train_temp.shape, train_temp.columns, sep='\n')

(9426194, 13)
Index(['parent_asin', 'rating', 'full_text', 'as_image', 'helpful_vote',
       'as_helpful_vote', 'verified_purchase', 'main_category',
       'average_rating', 'rating_number', 'price', 'categories_grp', 'ID'],
      dtype='object')


In [108]:
# Carve 300,000 rating-stratified rows out of train_temp for validation;
# the rest becomes the training pool.
train, valid_temp = train_test_split(
    train_temp,
    test_size=300000,
    stratify=train_temp['rating'],
    random_state=9999,
)

In [109]:
# Split the validation pool in half, stratified on rating: one half per model
valid_llama, valid_xgb = train_test_split(
    valid_temp,
    test_size=0.5,
    stratify=valid_temp['rating'],
    random_state=1234,
)

In [89]:
### Quality assurance
# Report dimensions and column names for each validation sample (XGB first)
for valid_sample in (valid_xgb, valid_llama):
    print(valid_sample.shape)
    print(valid_sample.columns)

(150000, 13)
Index(['parent_asin', 'rating', 'full_text', 'as_image', 'helpful_vote',
       'as_helpful_vote', 'verified_purchase', 'main_category',
       'average_rating', 'rating_number', 'price', 'categories_grp', 'ID'],
      dtype='object')
(150000, 13)
Index(['parent_asin', 'rating', 'full_text', 'as_image', 'helpful_vote',
       'as_helpful_vote', 'verified_purchase', 'main_category',
       'average_rating', 'rating_number', 'price', 'categories_grp', 'ID'],
      dtype='object')


In [90]:
## Keep the columns of interest
# The Llama sample only carries the identifier, the target and the text.
valid_llama_final = valid_llama[['ID', 'rating', 'full_text']]
# The XGB sample uses the same column list as train_xgb_final (including
# 'full_text') so the XGB train and valid CSVs share one schema.
# NOTE(review): if 'full_text' is not wanted for XGB, drop it from both the
# train and valid selections rather than from only one of them.
valid_xgb_final = valid_xgb[['ID', 'parent_asin', 'rating', 'full_text', 'as_image', 'helpful_vote',
                             'as_helpful_vote', 'verified_purchase', 'main_category',
                             'average_rating', 'rating_number', 'price', 'categories_grp']]

In [91]:
## Save the validation samples (the DataFrame index is not written)
valid_outputs = (
    (valid_llama_final, './../data/valid_llama.csv'),
    (valid_xgb_final, './../data/valid_xgb.csv'),
)
for valid_frame, valid_path in valid_outputs:
    valid_frame.to_csv(valid_path, index=False)

#### Train
1 échantillon par modèle [55 000 pour XGBoost avant filtrage, 50 000 pour Llama]

In [94]:
# Dimensions and column names of the training pool
print(train.shape, train.columns, sep='\n')

(9126194, 13)
Index(['parent_asin', 'rating', 'full_text', 'as_image', 'helpful_vote',
       'as_helpful_vote', 'verified_purchase', 'main_category',
       'average_rating', 'rating_number', 'price', 'categories_grp', 'ID'],
      dtype='object')


In [130]:
# Draw 105,000 rows from the training pool without replacement; this sample
# is subsequently divided between the two models.
train_both = train.sample(n=105000, random_state=2024, replace=False)

In [131]:
### Quality assurance
print(train_both.shape)
print(train_both.columns)

# Columns summarised with describe(), in the order they are reported
for column in ('rating', 'average_rating', 'rating_number',
               'as_image', 'as_helpful_vote', 'price'):
    print(f"-- {column} --")
    print(train_both[column].describe())

# Columns summarised with a frequency table
for column in ('main_category', 'categories_grp', 'verified_purchase'):
    print(f"-- {column} --")
    print(train_both[column].value_counts())

(105000, 13)
Index(['parent_asin', 'rating', 'full_text', 'as_image', 'helpful_vote',
       'as_helpful_vote', 'verified_purchase', 'main_category',
       'average_rating', 'rating_number', 'price', 'categories_grp', 'ID'],
      dtype='object')
-- rating --
count    105000.000000
mean          4.356400
std           1.194185
min           1.000000
25%           4.000000
50%           5.000000
75%           5.000000
max           5.000000
Name: rating, dtype: float64
-- average_rating --
count    105000.000000
mean          4.514236
std           0.334233
min           1.000000
25%           4.400000
50%           4.600000
75%           4.700000
max           5.000000
Name: average_rating, dtype: float64
-- rating_number --
count    105000.000000
mean       3718.954286
std       14567.312838
min           1.000000
25%          83.000000
50%         416.000000
75%        1704.000000
max      261278.000000
Name: rating_number, dtype: float64
-- as_image --
count    105000.000000
mean  

In [132]:
# Split the sampled training rows between the two models, stratified on
# rating: 50,000 rows go to train_llama and the remainder to train_xgb.
train_xgb, train_llama = train_test_split(
    train_both,
    test_size=50000,
    stratify=train_both['rating'],
    random_state=7190,
)

In [133]:
# Dimensions of each model's training sample, one per line (XGB first)
print(train_xgb.shape, train_llama.shape, sep='\n')

(55000, 13)
(50000, 13)


In [134]:
## Keep the columns of interest
# Llama: identifier, target and text only
llama_train_columns = ['ID', 'rating', 'full_text']
train_llama_final = train_llama[llama_train_columns]

# XGB: every column, with ID moved to the front
xgb_train_columns = [
    'ID', 'parent_asin', 'rating', 'full_text', 'as_image', 'helpful_vote',
    'as_helpful_vote', 'verified_purchase', 'main_category',
    'average_rating', 'rating_number', 'price', 'categories_grp',
]
train_xgb_final = train_xgb[xgb_train_columns]

In [135]:
## Filter before saving
# Keep only the rows that satisfy all three upper bounds at once. A row whose
# value is NaN in any of these columns is dropped too, because a comparison
# against NaN evaluates to False.
within_bounds = (
    (train_xgb_final['rating_number'] <= 25000)
    & (train_xgb_final['helpful_vote'] <= 10)
    & (train_xgb_final['price'] <= 500)
)
train_xgb_final = train_xgb_final[within_bounds]

In [136]:
# Dimensions of the XGB training sample once the filters are applied
filtered_xgb_shape = train_xgb_final.shape
filtered_xgb_shape

(51541, 13)

In [137]:
## Save the training samples (the DataFrame index is not written)
train_outputs = (
    (train_llama_final, './../data/train_llama.csv'),
    (train_xgb_final, './../data/train_xgb.csv'),
)
for train_frame, train_path in train_outputs:
    train_frame.to_csv(train_path, index=False)