In [87]:
import pandas as pd
import numpy as np
import xgboost as xgb
#Feature importances
import matplotlib.pyplot as plt
from xgboost import plot_importance

In [91]:
# https://www.kaggle.com/code/datafan07/icr-simple-eda-baseline
def balance_logloss(y_true, y_pred):
    """Balanced binary log loss: the mean of the two per-class average log losses.

    Each class contributes the average negative log-probability assigned to it
    over the samples of that class, and the two class terms are then averaged,
    so both classes weigh the same regardless of how many samples they have.

    Parameters
    ----------
    y_true : array-like of int, shape (n,)
        Ground-truth labels. Expected to contain only 0 and 1, with both
        classes present (the per-class counts are used as divisors).
    y_pred : array-like of float, shape (n,)
        Predicted probability of class 1 for each sample.

    Returns
    -------
    float
        The balanced log loss.
    """
    # Accept lists / Series as well as ndarrays; compute in float64.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred, dtype=float)

    # Column 0 holds P(class 0), column 1 holds P(class 1).
    proba = np.stack([1 - y_pred, y_pred]).T
    # Keep probabilities away from 0 so the logarithm stays finite.
    proba = np.clip(proba, 1e-15, 1 - 1e-15)
    # Renormalise each row after clipping. The original evaluated this
    # expression without assigning it, so the step had no effect.
    proba = proba / np.sum(proba, axis=1)[:, None]

    # Number of samples per class, used to average within each class.
    nc = np.bincount(y_true)

    is_class_0 = np.where(y_true == 0, 1, 0)
    is_class_1 = np.where(y_true != 0, 1, 0)
    loss_class_0 = -np.sum(is_class_0 * np.log(proba[:, 0])) / nc[0]
    loss_class_1 = -np.sum(is_class_1 * np.log(proba[:, 1])) / nc[1]

    return (loss_class_0 + loss_class_1) / 2



In [88]:
# Read the three input tables (train, greeks, test) from ./data, in that order.
raw, greeks, test = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/train.csv", "./data/greeks.csv", "./data/test.csv")
)

In [89]:
# Join the training rows with their greeks rows on the shared "Id" key
# (default join type, i.e. only Ids present in both tables are kept).
df = raw.merge(greeks, on="Id")

In [90]:
# Preview the first five rows of the merged frame.
df.head(n=5)

Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,GF,GH,GI,GL,Class,Alpha,Beta,Gamma,Delta,Epsilon
0,000ff2bfdfe9,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,...,2003.810319,22.136229,69.834944,0.120343,1,B,C,G,D,3/19/2019
1,007255e47698,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,...,27981.56275,29.13543,32.131996,21.978,0,A,C,M,B,Unknown
2,013f2bd269f5,0.47003,2635.10654,85.200147,32.360553,8.138688,6.73284,0.025578,12.82457,1.2299,...,13676.95781,28.022851,35.192676,0.196941,0,A,C,M,B,Unknown
3,043ac50845d5,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.2299,...,2094.262452,39.948656,90.493248,0.155829,0,A,C,M,B,Unknown
4,044fb8a146ec,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.05481,3.396778,102.15198,...,8524.370502,45.381316,36.262628,0.096614,1,D,B,F,B,3/25/2020


In [92]:
# clean features
def clean_features(df: pd.DataFrame) -> pd.DataFrame:
    """Impute missing numeric values and encode the ``EJ`` column as 0/1.

    Missing values in numeric columns are replaced with that column's mean,
    computed on the frame that is passed in. ``EJ`` is mapped to 1 where it
    equals ``"A"`` and to 0 otherwise.

    The function is safe to apply more than once: when ``EJ`` is already
    numeric it is left untouched. Previously a second pass compared the encoded
    integers with ``"A"`` and silently turned the whole column into zeros.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing an ``EJ`` column. It is not modified.

    Returns
    -------
    pd.DataFrame
        A new, cleaned frame.

    Raises
    ------
    KeyError
        If ``df`` has no ``EJ`` column.
    """
    # fillna returns a new frame, so the assignment below never touches `df`.
    cleaned = df.fillna(df.mean(numeric_only=True))
    # Encode only while EJ still holds its raw labels.
    if not pd.api.types.is_numeric_dtype(cleaned["EJ"]):
        cleaned["EJ"] = np.where(cleaned["EJ"] == "A", 1, 0)
    return cleaned

In [93]:
# one hot encoding for "Alpha"
def one_hot_alpha(df: pd.DataFrame) -> pd.DataFrame:
    """Replace the ``Alpha`` column with one indicator column per category.

    The resulting columns are named ``Alpha_<value>`` and are appended after
    the remaining columns; ``Alpha`` itself is removed from the result.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing an ``Alpha`` column. It is not modified (the previous
        implementation converted the caller's ``Alpha`` column to ``category``
        in place as a side effect).

    Returns
    -------
    pd.DataFrame
        A new frame with ``Alpha`` one-hot encoded.

    Raises
    ------
    KeyError
        If ``df`` has no ``Alpha`` column.
    """
    # assign() builds a new frame, so the caller's column keeps its dtype.
    with_category = df.assign(Alpha=df["Alpha"].astype("category"))
    return pd.get_dummies(with_category, columns=["Alpha"])



In [94]:
# Build the modelling frame from the untouched inputs instead of feeding `df`
# back into itself: the pipeline removes the "Alpha" column, so the previous
# `df = df.pipe(...)` form raised a KeyError when the cell was run a second time.
df = pd.merge(raw, greeks, on="Id").pipe(clean_features).pipe(one_hot_alpha)


In [95]:
# Target vector.
y = df["Class"]

# Feature matrix: everything except the identifier, the target, the one-hot
# Alpha columns and the Beta / Gamma / Delta / Epsilon columns
# (presumably all contributed by greeks.csv — verify against the data).
X = df.drop(
    columns=[
        "Id",
        "Class",
        "Alpha_A",
        "Alpha_B",
        "Alpha_G",
        "Alpha_D",
        "Beta",
        "Gamma",
        "Delta",
        "Epsilon",
    ]
)

# Baseline classifier: default hyper-parameters, log loss as evaluation metric.
model = xgb.XGBClassifier(eval_metric="logloss")

In [96]:
# Display the feature matrix as the cell's output.
X

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL
0,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,4126.58731,...,3.583450,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343
1,0.145282,978.76416,85.200147,36.968889,8.138688,3.632190,0.025578,13.517790,1.229900,5496.92824,...,10.358927,0.173229,0.49706,0.568932,9.292698,72.611063,27981.562750,29.135430,32.131996,21.978000
2,0.470030,2635.10654,85.200147,32.360553,8.138688,6.732840,0.025578,12.824570,1.229900,5135.78024,...,11.626917,7.709560,0.97556,1.198821,37.077772,88.609437,13676.957810,28.022851,35.192676,0.196941
3,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.229900,4169.67738,...,14.852022,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829
4,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.054810,3.396778,102.151980,5728.73412,...,13.666727,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,0.149555,3130.05946,123.763599,9.513984,13.020852,3.499305,0.077343,8.545512,2.804172,4157.68439,...,9.879296,0.173229,1.26092,0.067730,8.967128,217.148554,8095.932828,24.640462,69.191944,21.978000
613,0.435846,5462.03438,85.200147,46.551007,15.973224,5.979825,0.025882,12.622906,3.777550,5654.07556,...,10.910227,10.223150,1.24236,0.426699,35.896418,496.994214,3085.308063,29.648928,124.808872,0.145340
614,0.427300,2459.10720,130.138587,55.355778,10.005552,8.070549,0.025578,15.408390,1.229900,5888.87769,...,12.029366,0.173229,0.49706,0.067730,19.962092,128.896894,6474.652866,26.166072,119.559420,21.978000
615,0.363205,1263.53524,85.200147,23.685856,8.138688,7.981959,0.025578,7.524588,1.229900,4517.86560,...,8.026928,9.256996,0.78764,0.670527,24.594488,72.611063,1965.343176,25.116750,37.155112,0.184622


In [97]:
# Fit the baseline classifier on the full feature matrix.
model.fit(X=X, y=y)

In [98]:
# Ask the fitted booster for its "weight" importance scores and tabulate them
# as (feature, fscore) rows, highest score first.
importance = model.get_booster().get_score(importance_type="weight")
importance_df = pd.DataFrame(
    sorted(importance.items(), key=lambda item: item[1], reverse=True)
).set_axis(["feature", "fscore"], axis=1)

In [99]:
# Names of the features whose importance score is strictly greater than 10.
important_features = importance_df["feature"][importance_df["fscore"] > 10].values

# Train with only important features

In [100]:
# Second classifier, to be trained on the selected important features only.
# Uses the scikit-learn wrapper's documented `n_jobs` / `random_state`
# arguments in place of the legacy `nthread` / `seed` aliases.
model_important = xgb.XGBClassifier(
    learning_rate=0.1,
    n_estimators=500,
    max_depth=3,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",  # binary classification objective
    n_jobs=4,
    scale_pos_weight=1,
    random_state=27,  # fixed seed: subsample / colsample_bytree are random
    eval_metric="logloss",
)
# Training inputs restricted to the columns listed in `important_features`.
X_important = df[important_features]
y_important = df["Class"]


In [101]:
# Summarise the reduced training set: feature count, feature names, row count.
print(
    f"size of features: {len(X_important.columns)}",
    f"features: {X_important.columns}",
    f"size of dataset: {len(df)}",
    sep="\n",
)


size of features: 22
features: Index(['DU', 'AB', 'BQ', 'CC', 'DA', 'CR', 'FR', 'FL', 'CD ', 'FI', 'AF', 'DN',
       'EE', 'DE', 'EL', 'EP', 'BC', 'DL', 'AM', 'CH', 'DY', 'EB'],
      dtype='object')
size of dataset: 617


In [102]:
# Fit the reduced-feature classifier.
model_important.fit(X=X_important, y=y_important)

# Submission

In [103]:
# Apply the training-time cleaning (mean imputation, EJ encoding) to the test frame.
# NOTE(review): `test` is overwritten with its cleaned version, so running this
# cell again passes already-cleaned data back through clean_features.
test = clean_features(test)

In [104]:
# Class probabilities for the test rows, predicted by `model` (the classifier
# fitted on the full feature matrix X) from every test column except "Id".
# NOTE(review): `model_important` is fitted in this notebook but is not used
# here — confirm which model the submission is meant to come from.
predictions = model.predict_proba(test.drop(columns=["Id"]))

In [105]:
# Display the predicted probabilities (column 0: class 0, column 1: class 1).
predictions

array([[0.8919803 , 0.10801969],
       [0.8919803 , 0.10801969],
       [0.8919803 , 0.10801969],
       [0.8919803 , 0.10801969],
       [0.8919803 , 0.10801969]], dtype=float32)

In [106]:
# Submission table: the test Ids followed by the two class-probability columns.
submission = test[["Id"]].assign(
    class_0=predictions[:, 0],
    class_1=predictions[:, 1],
)

In [107]:
# Write the submission file to the working directory, without the row index.
submission.to_csv(path_or_buf="submission.csv", index=False)