# Hackathon Vestiaire Collective

> Project summary
- Import libraries
- 1: EDA & CORR
- 2: PREPROCESSING
- 3: BUILD BASELINE
- 4: XGBOOST
- 5: LGBM
- 6: Submission

In [10]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


from sklearn.model_selection import GridSearchCV

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

In [11]:
# Load the training data; the first CSV column is used as the row index.
data = pd.read_csv("train.csv", index_col=0)

In [12]:
# Keep only the rows whose TIME_ONLINE is strictly below 1600
# (presumably an outlier cut-off -- TODO confirm where the threshold comes from).
data = data.loc[data["TIME_ONLINE"] < 1600]

In [13]:
# Pick the target by name instead of by position: the rest of the notebook
# refers to it as 'LABEL', and a positional split would silently put the label
# into the feature set if the column order of train.csv ever changed.
# `Index.drop` raises KeyError when the column is missing.
target = 'LABEL'
# Every remaining column, in file order, as an array of column names.
features = data.columns.drop(target).values

In [14]:
# Label list
# Splits the column names of `data` into three lists:
#   target_label      -> ['LABEL']
#   categorical_label -> the identifier / code columns listed below, in this order
#   numerical_label   -> every other column, in the order it appears in `data`

# All columns except the target; `remove` raises ValueError if 'LABEL' is absent.
features_label = data.columns.to_list()
features_label.remove('LABEL')
target_label = ['LABEL']

# Columns to be handled as categories.
categorical_label = [
    'ID_PRODUCT',
    'ID_SELLER',
    'ID_SELLER_COUNTRY',
    'SELLER_GEO_1',
    'SELLER_GEO_2',
    'SELLER_GEO_3',
    'SEGMENT',
    'BRAND_GROUP',
    'ID_BRAND',
    'ID_PAGE',
    'ID_SITE',
    'LANGUAGE',
    'ID_UNIVERSE',
    'ID_CATEGORY',
    'ID_SUB_SUBCATEGORY',
    'ID_MODEL',
    'ID_MATERIAL',
    'ID_COLOUR',
    'ID_PATTERN',
    'CURRENCY',
    'ID_CONDITION',
    'DEPOSIT_DEVICE',
]

# Start from all features and take the categorical ones out one by one.
# `remove` fails loudly (ValueError) when an expected column is missing from
# the data instead of silently leaving it out of the split.
numerical_label = features_label.copy()
for name in categorical_label:
    numerical_label.remove(name)

In [None]:
# Build the feature matrix and the target in this cell so that it does not
# depend on the "fit on all dataset" cells further down having been run first.
# Those cells also overwrite X with the already-transformed matrix, which must
# not be fed into the split / preprocessing below.
X = data.loc[:, features]
Y = data.loc[:, target]

# Hold out 20% of the rows for evaluation; stratifying on Y keeps the class
# proportions the same in both splits, and the fixed seed makes it repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)

In [None]:
# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode; handle_unknown='ignore' avoids errors on categories that were
# not present when the encoder was fitted.
cat_transformer = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Numerical columns: median imputation followed by standardisation.
num_transformer = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Route each group of columns to its own transformer.
preprocess = ColumnTransformer(
    transformers=[
        ('num', num_transformer, numerical_label),
        ('cat', cat_transformer, categorical_label),
    ]
)

# Fit on the training split only and replace X_train with the transformed matrix.
X_train = preprocess.fit_transform(X_train)

In [None]:
# Apply the already-fitted preprocessing to the hold-out split (no refit).
X_test = preprocess.transform(X=X_test)

# Baseline

In [None]:
# Baseline model: logistic regression with a very high iteration cap.
classifier = LogisticRegression(max_iter=100_000)
classifier.fit(X=X_train, y=Y_train)

In [None]:
# Class-probability predictions of the baseline on both splits
# (one column per class; column 1 is used for scoring below).
Y_test_pred = classifier.predict_proba(X_test)
Y_train_pred = classifier.predict_proba(X_train)

In [None]:
# Average precision of the baseline, scored on the second probability column.
train_ap = average_precision_score(Y_train, Y_train_pred[:, 1])
print("average-precision-score on train set : ", train_ap)
test_ap = average_precision_score(Y_test, Y_test_pred[:, 1])
print("average-precision-score on test set : ", test_ap)

# XGBOOST

In [None]:
# XGBoost classifier with its default hyper-parameters.
model = XGBClassifier()
model.fit(X_train, Y_train)

# Class probabilities on both splits (column 1 is used for scoring).
Y_test_pred = model.predict_proba(X_test)
Y_train_pred = model.predict_proba(X_train)

# Same metric as the baseline so the models can be compared directly.
train_ap = average_precision_score(Y_train, Y_train_pred[:, 1])
print("average-precision-score on train set : ", train_ap)
test_ap = average_precision_score(Y_test, Y_test_pred[:, 1])
print("average-precision-score on test set : ", test_ap)

# LGBM 

In [None]:
# Hyper-parameter grid for LightGBM: 3**5 = 243 combinations, each of them
# cross-validated on 3 folds by the grid search below.
PARAMETERS = {
    "subsample": [0.5, 0.75, 1],
    "colsample_bytree": [0.5, 0.75, 1],
    "max_depth": [2, 6, 12],
    "min_child_weight": [1, 5, 15],
    "learning_rate": [0.3, 0.1, 0.03],
    "n_estimators": [100],
}

# subsample_freq=1 switches row bagging on at every iteration. LightGBM ignores
# `subsample` while the bagging frequency is left at its default of 0, which
# would make the three `subsample` values above train identical models.
model = LGBMClassifier(subsample_freq=1)

# Select the best combination with average precision -- the metric every model
# in this notebook is evaluated with -- instead of accuracy, so the search
# optimises what is actually reported.
model_gs = GridSearchCV(
    model,
    param_grid=PARAMETERS,
    cv=3,
    scoring="average_precision",
    verbose=3,
    n_jobs=-1,  # use every available core
)
model_gs.fit(X_train, Y_train)

In [None]:
# Class probabilities from the grid search's predict_proba on both splits.
grid_lgb_test_pred = model_gs.predict_proba(X_test)
grid_lgb_train_pred = model_gs.predict_proba(X_train)

In [None]:
# Average precision of the tuned LightGBM model, scored on the second probability column.
train_ap = average_precision_score(Y_train, grid_lgb_train_pred[:, 1])
print("average-precision-score on train set : ", train_ap)
test_ap = average_precision_score(Y_test, grid_lgb_test_pred[:, 1])
print("average-precision-score on test set : ", test_ap)

# Best model: LGBM, fitted on the full dataset

In [15]:
# Full training set: the columns named in `features` and the `target` column.
X = data[list(features)]
Y = data[target]

In [16]:
# Rebuild the preprocessing from scratch so it can be fitted on the whole
# dataset this time.

# Categorical columns: most-frequent imputation, then one-hot encoding;
# handle_unknown='ignore' avoids errors on categories not seen during fit.
cat_transformer = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Numerical columns: median imputation followed by standardisation.
num_transformer = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Route each group of columns to its own transformer.
preprocess = ColumnTransformer(
    transformers=[
        ('num', num_transformer, numerical_label),
        ('cat', cat_transformer, categorical_label),
    ]
)

# Fit on all rows; X is replaced by the transformed matrix from here on.
X = preprocess.fit_transform(X)

In [None]:
# Final model, trained on every available row.
# NOTE(review): this uses LightGBM's default hyper-parameters, not the best
# combination found by the grid search -- confirm this is intended.
LCLF = LGBMClassifier()
LCLF.fit(X=X, y=Y)

# Submission

In [18]:
# Read the competition test set; the first CSV column is the row index.
test_raw = pd.read_csv('test.csv', index_col=0)

# Apply the preprocessing fitted on the full training data, then keep the
# predicted probability from the second column of predict_proba.
test_sub = preprocess.transform(test_raw)
test_sub_pred = LCLF.predict_proba(test_sub)[:, 1]

# Save prediction
# Carry over the index read from test.csv so every prediction stays attached
# to its original row identifier. Without it the frame gets a fresh 0..n-1
# index, which only matches when the file's index is exactly that range.
submission = pd.DataFrame({'LABEL': test_sub_pred}, index=test_raw.index)
# Turn the index into a regular column so it is written to the CSV.
submission.reset_index(inplace=True)
submission.to_csv('sub/submission3.csv', index=False)