## Import basic libraries

In [1]:
import numpy as np
import pandas as pd

import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

import warnings
# Silence every warning category for the rest of the session. Be aware that
# this also hides deprecation notices from pandas / scikit-learn / xgboost.
warnings.simplefilter("ignore")
# Never truncate columns when a DataFrame is displayed.
pd.options.display.max_columns = None

## Print the first five rows of the training data set

In [2]:
# Load the raw training table from the working directory.
df = pd.read_csv("train.csv")
# Preview the first five rows (all columns are shown, see display option above).
df.head(5)


Unnamed: 0,Id,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,Feature 11,Feature 12,Feature 13,Feature 14,Feature 15,Feature 16,Feature 17,Feature 18,Feature 19,Feature 20,Feature 21,Target
0,1,0,0,18,0,61405,3036,58000,24,3000,4493,12.8,1,2,4,1,0,10,21,7,11,18,0
1,2,0,0,22,1,51500,1943,37000,24,3075,1380,13.0,1,3,0,1,0,15,8,0,0,0,0
2,3,0,0,22,0,94000,2812,53900,24,3054,3100,12.6,1,3,0,1,0,1,2,0,0,0,0
3,4,0,1,12,2,58000,2780,37900,15,3065,20731,8.0,1,3,0,1,0,1,1,0,0,0,0
4,5,0,0,15,0,22125,1489,22125,18,3028,8481,14.09,1,3,0,0,0,0,0,0,0,0,0


In [3]:
# (rows, columns) of the raw training frame.
df.shape

(89633, 23)

## Print information about each feature


In [4]:
# Per-column dtype and non-null count; used to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89633 entries, 0 to 89632
Data columns (total 23 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Id          89633 non-null  int64  
 1   Feature 1   89633 non-null  int64  
 2   Feature 2   89633 non-null  int64  
 3   Feature 3   89633 non-null  int64  
 4   Feature 4   89633 non-null  int64  
 5   Feature 5   89633 non-null  int64  
 6   Feature 6   89633 non-null  int64  
 7   Feature 7   89633 non-null  int64  
 8   Feature 8   89633 non-null  int64  
 9   Feature 9   89633 non-null  int64  
 10  Feature 10  89633 non-null  int64  
 11  Feature 11  89633 non-null  float64
 12  Feature 12  89633 non-null  int64  
 13  Feature 13  89633 non-null  int64  
 14  Feature 14  89633 non-null  int64  
 15  Feature 15  89633 non-null  int64  
 16  Feature 16  89633 non-null  int64  
 17  Feature 17  89633 non-null  int64  
 18  Feature 18  89633 non-null  int64  
 19  Feature 19  89633 non-nul

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
df.describe()

Unnamed: 0,Id,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,Feature 11,Feature 12,Feature 13,Feature 14,Feature 15,Feature 16,Feature 17,Feature 18,Feature 19,Feature 20,Feature 21,Target
count,89633.0,89633.0,89633.0,89633.0,89633.0,89633.0,89633.0,89633.0,89633.0,89633.0,89633.0,89633.0,89633.0,89633.0,89633.0,89633.0,89633.0,89633.0,89633.0,89633.0,89633.0,89633.0,89633.0
mean,44817.0,0.028695,0.154809,17.218893,0.693796,51648.867939,2323.811509,39315.988007,21.060636,3025.109893,3510.933261,11.702281,0.865396,2.666853,0.634956,1.099316,0.0,3.018263,3.200172,0.594569,1.08517,1.752837,0.021655
std,25874.962676,0.166948,0.439177,5.200704,1.09643,15638.960173,575.396896,11036.768257,5.694622,23.716217,3749.66641,3.994955,0.341302,0.596338,1.368319,0.742674,0.0,6.518031,5.519076,2.037483,3.802031,6.271963,0.145555
min,1.0,0.0,0.0,2.0,0.0,0.0,681.0,10000.0,6.0,3000.0,1001.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,22409.0,0.0,0.0,14.0,0.0,38000.0,1893.0,30000.0,18.0,3007.0,1368.0,9.5,1.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
50%,44817.0,0.0,0.0,17.0,0.0,54770.0,2315.0,39692.0,24.0,3019.0,2307.0,12.65,1.0,3.0,0.0,1.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0
75%,67225.0,0.0,0.0,21.0,1.0,61308.0,2690.0,48000.0,24.0,3038.0,3919.0,14.09,1.0,3.0,1.0,1.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0
max,89633.0,1.0,3.0,36.0,29.0,118968.0,7840.0,94900.0,36.0,3111.0,21106.0,20.65,1.0,4.0,36.0,13.0,0.0,551.0,189.0,158.0,314.0,481.0,1.0


## Drop Id (a sequential row identifier) and Feature 16 (it only contains the value zero)

In [6]:
# Remove the row identifier and the constant column; neither carries signal.
df = df.drop(columns=['Id', 'Feature 16'])

## Drop all duplicates

In [7]:
# Keep only the first occurrence of each fully identical row
# (all remaining columns, including Target, must match).
df = df.drop_duplicates()

In [8]:
# Renumber rows 0..n-1 after de-duplication, discarding the old index.
df = df.reset_index(drop=True)

In [9]:
# The label column is kept as a one-element list, so df[target_col] is a DataFrame.
target_col = ['Target']
# Every remaining column of df is used as a model input.
feature_cols = [name for name in df.columns if name != 'Target']

## Split the training data set into train and hold-out validation sets (80/20, stratified on the target)

In [10]:
# Hold out 20% of the rows for validation. Stratifying on the target keeps the
# positive rate the same in both partitions; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_cols],
    df[target_col],
    test_size=0.2,
    shuffle=True,
    stratify=df[target_col],
    random_state=42,
)


In [11]:
# Gradient-boosted tree classifier. The hyper-parameter values are fixed here;
# presumably they come from an earlier tuning run (TODO confirm provenance).
model1 = xgb.XGBClassifier(
    # Ensemble size and shrinkage.
    n_estimators=600,
    learning_rate=0.013185216741789522,
    # Tree complexity.
    max_depth=5,
    min_child_weight=184,
    # Row / column sub-sampling per tree.
    subsample=0.9,
    colsample_bytree=0.8,
    # L1 / L2 regularisation on leaf weights.
    reg_alpha=6,
    reg_lambda=5,
    # Up-weight the positive class; about 2% of the raw rows are positive,
    # so this is roughly the negative/positive ratio.
    scale_pos_weight=45,
)

In [12]:
# Passing `eval_metric` to fit() was deprecated in XGBoost 1.6 and removed in
# 2.0 (it raises TypeError there). Configure it on the estimator instead, which
# works on both old and new releases. Without an `eval_set` the metric is not
# evaluated during training, so the fitted trees are unaffected.
model1.set_params(eval_metric='auc')
# fit() returns the estimator, so the cell still displays the fitted model.
model1.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.013185216741789522, max_delta_step=0, max_depth=5,
              min_child_weight=184, missing=nan, monotone_constraints='()',
              n_estimators=600, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=6, reg_lambda=5, scale_pos_weight=45, subsample=0.9,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [14]:
# Probability of the positive class (column 1) on each partition.
train_scores = model1.predict_proba(X_train)[:, 1]
valid_scores = model1.predict_proba(X_test)[:, 1]

# Area under the ROC curve; the train/validation gap indicates over-fitting.
train_auc = roc_auc_score(y_train, train_scores)
valid_auc = roc_auc_score(y_test, valid_scores)

print(f"Training ROC: {train_auc}")
print(f"Validation ROC: {valid_auc}")

Training ROC: 0.9134383410393491
Validation ROC: 0.871786265750947


In [16]:
# Unlabelled rows to score for the final submission.
test = pd.read_csv("test.csv")
# Template for the submission file.
# NOTE(review): this path has no ".csv" extension, unlike the other inputs;
# confirm the file on disk really is named "sample_submission".
submission = pd.read_csv("sample_submission")

In [17]:
# Score the test rows using exactly the feature columns the model was trained on,
# then keep the positive-class probability (column 1).
test_proba = model1.predict_proba(test[feature_cols])
preds1 = test_proba[:, 1]

In [18]:
# Fill the template's Target column with the predicted probabilities.
# Assumes the template rows are in the same order as test.csv (TODO confirm).
submission = submission.assign(Target=preds1)
# Write without the pandas index so only the template's columns are saved.
submission.to_csv('submission.csv', index=False)