In [1]:
import os

import numpy as np
import pandas as pd
from scipy.stats import randint, uniform
from tqdm import tqdm

from catboost import CatBoostClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.ensemble import StackingClassifier, VotingClassifier, ExtraTreesClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split, cross_val_score

In [None]:
# Read the raw training tables (error log, quality log, problem reports)
# from ./data1 in one pass; the order matches the unpacking targets.
train_err, train_qual, train_prob = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data1/train_err_data.csv',
        './data1/train_quality_data.csv',
        './data1/train_problem_data.csv',
    )
)

# Read the raw test tables (no problem table is loaded for the test set).
test_err, test_qual = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data1/test_err_data.csv',
        './data1/test_quality_data.csv',
    )
)

# Submission template; later cells fill in its 'problem' column.
sub = pd.read_csv('./data1/sample_submission.csv')

In [None]:
def count_errors_per_user(err_df, first_user_id, n_users, n_err_types=42):
    """Build a (n_users, n_err_types) matrix of error-type counts per user.

    Row ``i`` corresponds to ``user_id == first_user_id + i`` and column ``j``
    to ``errtype == j + 1``; each log row in ``err_df`` adds 1 to its cell.

    Parameters
    ----------
    err_df : DataFrame with integer ``user_id`` and ``errtype`` columns.
    first_user_id : user id that maps to row 0.
    n_users : number of rows in the result.
    n_err_types : number of columns in the result.

    Returns
    -------
    numpy.ndarray of float64 counts.
    """
    counts = np.zeros((n_users, n_err_types))
    rows = err_df['user_id'].to_numpy() - first_user_id
    cols = err_df['errtype'].to_numpy() - 1
    # np.add.at accumulates repeated (row, col) pairs, exactly like the
    # element-by-element `+= 1` loop, but without a Python-level iteration
    # over every log row.
    np.add.at(counts, (rows, cols), 1)
    return counts


# The cached arrays are written to ./data (the raw CSVs live in ./data1);
# make sure the directory exists so np.save does not fail on a fresh checkout.
os.makedirs('./data', exist_ok=True)

#train data
# Train user ids are offset by 10000 -> rows 0..14999.
error = count_errors_per_user(train_err, first_user_id=10000, n_users=15000)

# Binary label: 1 for every user that appears in the problem table, else 0.
prob = np.zeros(15000)
prob[train_prob.user_id.unique()-10000] = 1

np.save('./data/X_train.npy', error)
np.save('./data/y_train.npy', prob)

#test data
# Test user ids are offset by 30000 -> rows 0..14998.
error = count_errors_per_user(test_err, first_user_id=30000, n_users=14999)

np.save('./data/X_test.npy', error)

In [2]:
# Load the cached feature/label arrays written by the feature-building cell,
# so the notebook can be resumed from here without re-reading the raw logs.
X = np.load('data/X_train.npy')
y = np.load('data/y_train.npy')
X_test = np.load('data/X_test.npy')

# Submission template; its 'problem' column is filled in by the last cell.
sub = pd.read_csv('./data1/sample_submission.csv')

# Hold out 30% of the training users for validation. The split is seeded so
# that the validation scores reported below are reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=42)

In [None]:
# Search space for CatBoost.
# randint(5, 15) draws integers in [5, 14];
# uniform(loc, scale) draws from [loc, loc + scale] = [0.005, 0.205].
params = {"max_depth": randint(5, 15),
        "learning_rate": uniform(loc = 0.005, scale = 0.2),}

cat_base = CatBoostClassifier(task_type='GPU',
                        verbose=0,
                        n_estimators=200)

# Randomized search with 5-fold CV on the training split, ranked by ROC AUC.
# With the default refit=True the best configuration is refit on all of
# X_train, so clf1 can be used directly as a fitted classifier afterwards.
clf1 = RandomizedSearchCV(cat_base,
                        param_distributions=params,
                        cv=5,
                        scoring='roc_auc',
                        random_state=42)  # reproducible parameter draws
clf1.fit(X_train,y_train)

# Held-out ROC AUC of the refit best model. RandomizedSearchCV.score uses the
# scorer passed via `scoring`. (Running cross_val_score on clf1 here would
# clone the search object and repeat the whole search on folds of the
# validation set instead of evaluating the model fitted above.)
clf1.score(X_val, y_val)

In [None]:
# Search space for XGBClassifier.
# uniform(loc, scale) draws from [loc, loc + scale]. subsample and
# colsample_bytree are fractions that must stay in (0, 1], so scale is 0.6
# to give the range [0.4, 1.0] (scale=0.8 would sample up to 1.2).
params = {"max_depth": randint(10, 25),
        "learning_rate": uniform(loc = 0.005, scale = 0.2),
        "gamma": uniform(loc = 0., scale = 0.2),
        "subsample": uniform(loc = 0.4, scale = 0.6),
        "colsample_bytree": uniform(loc = 0.4, scale = 0.6),
        "min_child_weight": [3],
        # `silent` is deprecated in XGBoost in favour of `verbosity`;
        # 0 means silent.
        "verbosity": [0]}

xgb_base = XGBClassifier(eval_metric='auc',
                    tree_method='gpu_hist',
                    n_estimators=200)

# Randomized search with 5-fold CV on the training split, ranked by ROC AUC.
# With the default refit=True the best configuration is refit on all of
# X_train, so clf2 can be used directly as a fitted classifier afterwards.
clf2 = RandomizedSearchCV(xgb_base,
                        param_distributions=params,
                        cv=5,
                        scoring='roc_auc',
                        random_state=42)  # reproducible parameter draws
clf2.fit(X_train, y_train)

# Held-out ROC AUC of the refit best model. RandomizedSearchCV.score uses the
# scorer passed via `scoring`. (Running cross_val_score on clf2 here would
# clone the search object and repeat the whole search on folds of the
# validation set instead of evaluating the model fitted above.)
clf2.score(X_val, y_val)

In [None]:
# Search space for XGBRFClassifier (XGBoost's random-forest variant).
# uniform(loc, scale) draws from [loc, loc + scale]. subsample and
# colsample_bytree are fractions that must stay in (0, 1], so scale is 0.6
# to give the range [0.4, 1.0] (scale=0.8 would sample up to 1.2).
params = {"max_depth": randint(10, 25),
        "learning_rate": uniform(loc = 0.005, scale = 0.2),
        "gamma": uniform(loc = 0., scale = 0.2),
        "subsample": uniform(loc = 0.4, scale = 0.6),
        "colsample_bytree": uniform(loc = 0.4, scale = 0.6),
        "min_child_weight": [3],
        # `silent` is deprecated in XGBoost in favour of `verbosity`;
        # 0 means silent.
        "verbosity": [0]}

xgbrf_base = XGBRFClassifier(eval_metric='auc',
                    tree_method='gpu_hist',
                    n_estimators=200)

# Randomized search with 5-fold CV on the training split, ranked by ROC AUC.
# With the default refit=True the best configuration is refit on all of
# X_train, so clf3 can be used directly as a fitted classifier afterwards.
clf3 = RandomizedSearchCV(xgbrf_base,
                        param_distributions=params,
                        cv=5,
                        scoring='roc_auc',
                        random_state=42)  # reproducible parameter draws
clf3.fit(X_train,y_train)

# Held-out ROC AUC of the refit best model. RandomizedSearchCV.score uses the
# scorer passed via `scoring`. (Running cross_val_score on clf3 here would
# clone the search object and repeat the whole search on folds of the
# validation set instead of evaluating the model fitted above.)
clf3.score(X_val, y_val)

In [None]:
# Stack the three tuned models. The base learners are the tuned
# `best_estimator_` of each search rather than the search objects themselves:
# StackingClassifier clones and refits its base estimators (including inside
# its internal cross-validation), so passing the RandomizedSearchCV objects
# would re-run every hyper-parameter search several times.
clf = StackingClassifier(
    estimators=[('cat', clf1.best_estimator_),
                ('xg', clf2.best_estimator_),
                ('xgrf', clf3.best_estimator_)],
    # Seeded so the meta-learner, and hence the submission, is reproducible.
    final_estimator=ExtraTreesClassifier(random_state=42))
clf.fit(X_train, y_train)

# ROC AUC of the fitted stack on the held-out validation split, using the
# predicted probability of the positive class (column 1).
roc_auc_score(y_val, clf.predict_proba(X_val)[:, 1])

In [None]:
# Predict on the test feature matrix loaded as `X_test` (the name `test` is
# never defined in this notebook and raised NameError).
y_pred = clf.predict_proba(X_test)

# Column 1 of predict_proba is the probability of the positive class.
sub['problem'] = y_pred[:,1]
sub.to_csv('sub1.csv',index=False)