In [8]:
# Imports
import optuna
import pandas as pd
import lightgbm as lgb
import xgboost as xgb

from sklearn import base
from category_encoders import TargetEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold, cross_validate, KFold

In [22]:
# Get train and test set
train = pd.read_csv('train_ml2_2021.csv')
test = pd.read_csv('test_ML.csv')

In [23]:
# Combine problem_id and target and set "-1" placeholder for 'fold' column
train['combined_target'] = train.problem_id.astype(str) + train.target.astype(str)
train['fold'] = -1

In [24]:
train

Unnamed: 0,problem_id,v0,v1,v2,v3,v4,v5,v6,v7,v8,...,v971,v972,v973,v974,v975,v976,v977,target,combined_target,fold
0,0,0.35,0.36,0.77,0.42,0.48,0.12,0.43,0.49,0.19,...,0.56,0.40,0.75,0.08,0.14,0.43,0.88,0,00,-1
1,0,0.12,0.17,0.38,0.43,0.57,0.09,0.11,0.58,0.35,...,0.06,0.17,0.09,0.04,0.04,0.45,0.14,1,01,-1
2,0,0.67,0.16,0.85,0.41,0.57,0.27,0.83,0.73,0.26,...,0.57,0.58,0.37,0.01,0.10,0.38,0.06,1,01,-1
3,0,0.70,0.20,0.62,0.41,0.41,0.10,0.80,0.52,0.82,...,0.25,0.07,0.23,0.04,0.76,0.41,0.59,1,01,-1
4,0,0.72,0.75,0.74,0.42,0.41,0.76,0.34,0.72,0.06,...,0.16,0.03,0.21,0.10,0.12,0.53,0.30,1,01,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8297,20,0.91,0.76,0.45,0.81,0.99,0.20,0.36,0.28,0.02,...,0.86,0.84,0.86,0.52,0.04,0.77,0.68,0,200,-1
8298,20,0.31,0.93,0.45,0.61,0.99,0.09,0.73,0.25,0.56,...,0.61,0.65,0.65,0.31,0.06,0.89,0.11,0,200,-1
8299,20,0.92,0.82,0.52,0.61,0.94,0.67,0.65,0.34,0.25,...,0.94,0.76,0.61,0.72,0.14,0.11,0.91,0,200,-1
8300,20,0.93,0.85,0.51,0.64,0.95,0.76,0.76,0.36,0.07,...,0.24,0.89,0.84,0.54,0.13,0.91,0.95,0,200,-1


In [12]:
# StratifiedKFold which serves as cross validation - in this case split into 5 different groups
skf = StratifiedKFold(n_splits=5, shuffle=True,  random_state=42)

for fold_id, (train_idx, valid_idx) in enumerate(skf.split(train, train.combined_target)):
    train.loc[valid_idx, 'fold'] = fold_id

In [13]:
train

Unnamed: 0,problem_id,v0,v1,v2,v3,v4,v5,v6,v7,v8,...,v971,v972,v973,v974,v975,v976,v977,target,combined_target,fold
0,0,0.35,0.36,0.77,0.42,0.48,0.12,0.43,0.49,0.19,...,0.56,0.40,0.75,0.08,0.14,0.43,0.88,0,00,4
1,0,0.12,0.17,0.38,0.43,0.57,0.09,0.11,0.58,0.35,...,0.06,0.17,0.09,0.04,0.04,0.45,0.14,1,01,1
2,0,0.67,0.16,0.85,0.41,0.57,0.27,0.83,0.73,0.26,...,0.57,0.58,0.37,0.01,0.10,0.38,0.06,1,01,1
3,0,0.70,0.20,0.62,0.41,0.41,0.10,0.80,0.52,0.82,...,0.25,0.07,0.23,0.04,0.76,0.41,0.59,1,01,4
4,0,0.72,0.75,0.74,0.42,0.41,0.76,0.34,0.72,0.06,...,0.16,0.03,0.21,0.10,0.12,0.53,0.30,1,01,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8297,20,0.91,0.76,0.45,0.81,0.99,0.20,0.36,0.28,0.02,...,0.86,0.84,0.86,0.52,0.04,0.77,0.68,0,200,3
8298,20,0.31,0.93,0.45,0.61,0.99,0.09,0.73,0.25,0.56,...,0.61,0.65,0.65,0.31,0.06,0.89,0.11,0,200,4
8299,20,0.92,0.82,0.52,0.61,0.94,0.67,0.65,0.34,0.25,...,0.94,0.76,0.61,0.72,0.14,0.11,0.91,0,200,4
8300,20,0.93,0.85,0.51,0.64,0.95,0.76,0.76,0.36,0.07,...,0.24,0.89,0.84,0.54,0.13,0.91,0.95,0,200,2


In [14]:
# Drop the un-needed 'combined_target' column
train.drop('combined_target', axis=1, inplace=True)

In [15]:
# Split into test sets
test_X, test_y = test.iloc[:,:-1], test.target

In [16]:
# Look at the data - Imbalaced
train['target'].value_counts()

0    4124
1    3175
2     729
3     255
4      19
Name: target, dtype: int64

- Imbalanced multiclass target

In [17]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8302 entries, 0 to 8301
Columns: 981 entries, problem_id to fold
dtypes: float64(978), int64(3)
memory usage: 62.1 MB


In [18]:
# Split into training sets
X, y = train.iloc[:,:-1], train.target

In [21]:
def objective(trial):

    # Invoke suggest methods of a Trial object to generate hyperparameters.
    
    rf_max_depth = trial.suggest_int('rf_max_depth', 8, 64)
    rf_min_leaf = trial.suggest_int('rf_min_leaf', 1, 5)
    rf_n_estimators = trial.suggest_int('rf_n_estimators', 0, 400, step=100)
    clf= RandomForestClassifier(max_depth=rf_max_depth, min_samples_leaf=rf_min_leaf, n_estimators=rf_n_estimators, class_weight='balanced' ,n_jobs=-1)


    cv = StratifiedKFold(n_splits=5, shuffle=True)
    
    scoring = ['accuracy', 'neg_log_loss']
    scores = cross_validate(clf, X, y, cv=cv, n_jobs=-1, scoring=scoring)

    accuracy_score = scores['test_accuracy'].mean()


    return accuracy_score  # An objective value linked with the Trial object.

study = optuna.create_study(direction='maximize')  # Create a new study.
study.optimize(objective, n_trials=15)  # Invoke optimization of the objective function.

[32m[I 2021-07-14 01:49:42,728][0m A new study created in memory with name: no-name-d999d21e-cac1-4e12-8ce2-971711b9d99a[0m
[32m[I 2021-07-14 01:50:11,696][0m Trial 0 finished with value: 0.9416987153913666 and parameters: {'rf_max_depth': 53, 'rf_min_leaf': 5, 'rf_n_estimators': 300}. Best is trial 0 with value: 0.9416987153913666.[0m
[32m[I 2021-07-14 01:50:32,868][0m Trial 1 finished with value: 0.9173697076082779 and parameters: {'rf_max_depth': 43, 'rf_min_leaf': 2, 'rf_n_estimators': 200}. Best is trial 0 with value: 0.9416987153913666.[0m
[32m[I 2021-07-14 01:50:52,249][0m Trial 2 finished with value: 0.9355583441532536 and parameters: {'rf_max_depth': 55, 'rf_min_leaf': 5, 'rf_n_estimators': 200}. Best is trial 0 with value: 0.9416987153913666.[0m
[32m[I 2021-07-14 01:51:11,249][0m Trial 3 finished with value: 0.9268844432516339 and parameters: {'rf_max_depth': 9, 'rf_min_leaf': 4, 'rf_n_estimators': 300}. Best is trial 0 with value: 0.9416987153913666.[0m
[33m[

In [45]:
train

Unnamed: 0,problem_id,v0,v1,v2,v3,v4,v5,v6,v7,v8,...,v970,v971,v972,v973,v974,v975,v976,v977,target,fold
0,0,0.35,0.36,0.77,0.42,0.48,0.12,0.43,0.49,0.19,...,0.57,0.56,0.40,0.75,0.08,0.14,0.43,0.88,0,4
1,0,0.12,0.17,0.38,0.43,0.57,0.09,0.11,0.58,0.35,...,0.59,0.06,0.17,0.09,0.04,0.04,0.45,0.14,1,2
2,0,0.67,0.16,0.85,0.41,0.57,0.27,0.83,0.73,0.26,...,0.42,0.57,0.58,0.37,0.01,0.10,0.38,0.06,1,1
3,0,0.70,0.20,0.62,0.41,0.41,0.10,0.80,0.52,0.82,...,0.52,0.25,0.07,0.23,0.04,0.76,0.41,0.59,1,3
4,0,0.72,0.75,0.74,0.42,0.41,0.76,0.34,0.72,0.06,...,0.41,0.16,0.03,0.21,0.10,0.12,0.53,0.30,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8297,20,0.91,0.76,0.45,0.81,0.99,0.20,0.36,0.28,0.02,...,0.92,0.86,0.84,0.86,0.52,0.04,0.77,0.68,0,0
8298,20,0.31,0.93,0.45,0.61,0.99,0.09,0.73,0.25,0.56,...,0.93,0.61,0.65,0.65,0.31,0.06,0.89,0.11,0,2
8299,20,0.92,0.82,0.52,0.61,0.94,0.67,0.65,0.34,0.25,...,0.74,0.94,0.76,0.61,0.72,0.14,0.11,0.91,0,4
8300,20,0.93,0.85,0.51,0.64,0.95,0.76,0.76,0.36,0.07,...,0.92,0.24,0.89,0.84,0.54,0.13,0.91,0.95,0,4


In [81]:
def objective(trial):

    # Invoke suggest methods of a Trial object to generate hyperparameters.
    
    rf_max_depth = trial.suggest_int('rf_max_depth', 2, 32)
    rf_max_leaves = trial.suggest_int('rf_max_leaves', 30, 40)
    rf_n_estimators = trial.suggest_int('rf_n_estimators', 10, 150, step=10)
    clf= lgb.LGBMClassifier(max_depth=rf_max_depth, num_leaves=rf_max_leaves, n_estimators=rf_n_estimators ,n_jobs=-1)

    accuracies = []
    for i in range(5):
        train_x = train.query(f'fold!={i}').drop(['fold', 'target'], axis=1).reset_index(drop=True)
        train_y = train.query(f'fold!={i}').target.reset_index(drop=True)
        valid_x = train.query(f'fold=={i}').drop(['fold', 'target'], axis=1).reset_index(drop=True)
        valid_y = train.query(f'fold=={i}').target.reset_index(drop=True)
        clf.fit(train_x, train_y)
        probs = clf.predict_proba(valid_x)
        accuracies.append(accuracy_score(valid_y, probs.argmax(1)))

    return np.mean(accuracies)  # An objective value linked with the Trial object.

study = optuna.create_study(direction='maximize')  # Create a new study.
study.optimize(objective, n_trials=50)  # Invoke optimization of the objective function.

[32m[I 2021-04-14 14:55:43,721][0m A new study created in memory with name: no-name-e65243c4-37da-46f9-850c-a9221c9d7178[0m
[32m[I 2021-04-14 14:56:07,985][0m Trial 0 finished with value: 0.704168050891102 and parameters: {'rf_max_depth': 4, 'rf_max_leaves': 39, 'rf_n_estimators': 140}. Best is trial 0 with value: 0.704168050891102.[0m
[32m[I 2021-04-14 14:57:13,828][0m Trial 1 finished with value: 0.7136824238555667 and parameters: {'rf_max_depth': 17, 'rf_max_leaves': 36, 'rf_n_estimators': 120}. Best is trial 1 with value: 0.7136824238555667.[0m
[32m[I 2021-04-14 14:57:40,195][0m Trial 2 finished with value: 0.7113929770859476 and parameters: {'rf_max_depth': 6, 'rf_max_leaves': 40, 'rf_n_estimators': 80}. Best is trial 1 with value: 0.7136824238555667.[0m
[32m[I 2021-04-14 14:58:46,181][0m Trial 3 finished with value: 0.7148876783473448 and parameters: {'rf_max_depth': 21, 'rf_max_leaves': 32, 'rf_n_estimators': 140}. Best is trial 3 with value: 0.7148876783473448.[0

In [131]:
def objective(trial):

    # Invoke suggest methods of a Trial object to generate hyperparameters.
    
    rf_max_depth = trial.suggest_int('rf_max_depth', 2, 32)
    rf_max_leaves = trial.suggest_int('rf_max_leaves', 30, 40)
    rf_n_estimators = trial.suggest_int('rf_n_estimators', 10, 150, step=10)
    clf= lgb.LGBMClassifier(max_depth=rf_max_depth, num_leaves=rf_max_leaves, n_estimators=rf_n_estimators ,n_jobs=-1)

    
    cv = StratifiedKFold(n_splits=5, shuffle=True)
    
    scoring = ['accuracy', 'neg_log_loss']
    scores = cross_validate(clf, X, y, cv=cv, n_jobs=-1, scoring=scoring)

    accuracy_score = scores['test_accuracy'].mean()

    return accuracy_score  # An objective value linked with the Trial object.

study = optuna.create_study(direction='maximize')  # Create a new study.
study.optimize(objective, n_trials=50)  # Invoke optimization of the objective function.

[32m[I 2021-04-13 22:23:20,288][0m A new study created in memory with name: no-name-9efd70a8-cf31-4cbc-b2f6-11637cc46433[0m
[32m[I 2021-04-13 22:23:36,673][0m Trial 0 finished with value: 0.7087449134285487 and parameters: {'rf_max_depth': 28, 'rf_max_leaves': 40, 'rf_n_estimators': 30}. Best is trial 0 with value: 0.7087449134285487.[0m
[33m[W 2021-04-13 22:24:05,970][0m Trial 1 failed, because the objective function returned nan.[0m
[32m[I 2021-04-13 22:24:37,593][0m Trial 2 finished with value: 0.7158518964479229 and parameters: {'rf_max_depth': 23, 'rf_max_leaves': 34, 'rf_n_estimators': 80}. Best is trial 2 with value: 0.7158518964479229.[0m
[32m[I 2021-04-13 22:24:51,402][0m Trial 3 finished with value: 0.7136838745711321 and parameters: {'rf_max_depth': 16, 'rf_max_leaves': 39, 'rf_n_estimators': 30}. Best is trial 2 with value: 0.7158518964479229.[0m
[32m[I 2021-04-13 22:25:31,655][0m Trial 4 finished with value: 0.7176580373269115 and parameters: {'rf_max_dept

In [16]:
X

Unnamed: 0,problem_id,v0,v1,v2,v3,v4,v5,v6,v7,v8,...,v968,v969,v970,v971,v972,v973,v974,v975,v976,v977
0,0,0.35,0.36,0.77,0.42,0.48,0.12,0.43,0.49,0.19,...,1.00,0.77,0.57,0.56,0.40,0.75,0.08,0.14,0.43,0.88
1,0,0.12,0.17,0.38,0.43,0.57,0.09,0.11,0.58,0.35,...,0.39,0.07,0.59,0.06,0.17,0.09,0.04,0.04,0.45,0.14
2,0,0.67,0.16,0.85,0.41,0.57,0.27,0.83,0.73,0.26,...,0.65,0.24,0.42,0.57,0.58,0.37,0.01,0.10,0.38,0.06
3,0,0.70,0.20,0.62,0.41,0.41,0.10,0.80,0.52,0.82,...,0.96,0.04,0.52,0.25,0.07,0.23,0.04,0.76,0.41,0.59
4,0,0.72,0.75,0.74,0.42,0.41,0.76,0.34,0.72,0.06,...,0.99,0.81,0.41,0.16,0.03,0.21,0.10,0.12,0.53,0.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8297,20,0.91,0.76,0.45,0.81,0.99,0.20,0.36,0.28,0.02,...,0.81,0.75,0.92,0.86,0.84,0.86,0.52,0.04,0.77,0.68
8298,20,0.31,0.93,0.45,0.61,0.99,0.09,0.73,0.25,0.56,...,0.53,0.36,0.93,0.61,0.65,0.65,0.31,0.06,0.89,0.11
8299,20,0.92,0.82,0.52,0.61,0.94,0.67,0.65,0.34,0.25,...,0.40,0.80,0.74,0.94,0.76,0.61,0.72,0.14,0.11,0.91
8300,20,0.93,0.85,0.51,0.64,0.95,0.76,0.76,0.36,0.07,...,0.71,0.33,0.92,0.24,0.89,0.84,0.54,0.13,0.91,0.95


In [48]:
test_X = test_X.drop('obs_id', axis=1)

In [49]:
def submit():
    # logic: train 5 folds and take the average of the probabilities
    PROBS = []
    clf= lgb.LGBMClassifier(max_depth = 12, num_leaves = 39, n_estimators = 140)
    for i in range(5):
        train_x = train.query(f'fold!={i}').drop(['fold', 'target'], axis=1).reset_index(drop=True)
        train_y = train.query(f'fold!={i}').target.reset_index(drop=True)
        clf.fit(train_x, train_y)
        probs = clf.predict_proba(test_X)
        PROBS.append(probs)
    return np.array(PROBS).mean(0).argmax(1)  # 5 * probs


In [51]:
pred = submit()

In [19]:
#params = {'max_depth': 12, 'num_leaves': 39, 'n_estimators': 140, 'n_jobs' : -1}
clf= lgb.LGBMClassifier(max_depth = 12, num_leaves = 39, n_estimators = 140, n_jobs=-1)
clf.fit(X,y)
pred = clf.predict(test_X)


In [25]:
np.unique(pred)

array([0, 1, 2, 3, 4])

In [52]:
ss = pd.read_csv('sample_submission (1).csv')

In [53]:
ss['target'] = pred

In [54]:
ss.target.value_counts()

0    1195
1     757
2      78
4       9
3       2
Name: target, dtype: int64

In [55]:
ss.to_csv('submission_5fold.csv', index=False)

In [120]:
clf= lgb.LGBMClassifier(n_jobs=-1)
cv = KFold(n_splits=5, shuffle=True)
scoring = ['accuracy', 'neg_log_loss']
scores = cross_validate(clf, X, y, cv=cv, n_jobs=-1, scoring=scoring)

In [121]:
scores

{'fit_time': array([38.44742298, 38.27774382, 37.89593005, 37.49605989, 37.64017415]),
 'score_time': array([0.38174415, 0.38211513, 0.46092176, 0.52381229, 0.48865294]),
 'test_accuracy': array([0.70860927, 0.70860927, 0.70963855, 0.72951807, 0.7060241 ]),
 'test_neg_log_loss': array([-0.70949581, -0.7108665 , -0.71399456, -0.69770164, -0.76142708])}