In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc

import lightgbm as lgb

from sklearn.metrics import roc_auc_score,accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', None)

In [3]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [4]:
features = train.columns.drop(['id','Target'])

X_test = test[features]
X = train[features]
y = train['Target']

In [5]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [6]:
params = {'n_estimators': 8000, 
          'num_class': 3,
          'boosting_type': 'gbdt',
          'objective': 'multiclass',
          'metric': 'multi_logloss',
          'verbosity': -1,
          'random_state': 42, 
          'reg_alpha': 2.1878527151970849, 
          'reg_lambda': 2.991543710164331, 
          'colsample_bytree': 0.5, 
          'subsample': 0.5, 
          'learning_rate': 0.02, 
          'max_depth': 15, 
          'num_leaves': 60, 
          'min_child_samples': 30, 
          'min_data_per_groups': 15
         }

In [7]:
def cross_val_train(X, y, test, params):
    
    spl=10   # Number of folders
    
    # Inititate arrays with predictions and oof predictions
    test_preds = np.zeros((len(test),3))
    val_preds = np.zeros((len(X),3))
    val_scores, train_scores = [],[]
    
    # perform cross-validation split
    cv = KFold(spl, shuffle=True, random_state=42)
    
    # "for" cycle to train for each fold
    for fold, (train_ind, valid_ind) in enumerate(cv.split(X,y)):
        
        # divide train and validation data
        X_train = X.iloc[train_ind]
        y_train = y[train_ind]
        X_val = X.iloc[valid_ind]
        y_val = y[valid_ind]
        
        # Initiate model lightGBM 
        model = lgb.LGBMClassifier(**params)
        
        # fit the model
        model.fit(X_train, y_train,
                    eval_set=[(X_val, y_val)],
                    callbacks=[lgb.early_stopping(stopping_rounds=70), lgb.log_evaluation(100)]) 
        
        # predictions on train and validation data
        y_pred_trn = model.predict_proba(X_train)
        y_pred_val = model.predict_proba(X_val)
        
        # compute accuracy
        print(X_train.shape)
        print(y_train.shape)
        print(y_pred_trn.shape)
        print(np.argmax(y_pred_trn, axis=1).shape)
        train_acc = accuracy_score(y_train, np.argmax(y_pred_trn, axis=1))
        val_acc = accuracy_score(y_val, np.argmax(y_pred_val, axis=1))
        
        # print partial results for the fold
        print("Fold:",fold, " Train R2:",np.round(train_acc,5), " Val R2:",np.round(val_acc,5))
        
        # compute test predictions and oof predictions
        test_preds += model.predict_proba(test[features])/spl
        val_preds[valid_ind] = model.predict_proba(X_val)
        val_scores.append(val_acc)
        print("-"*50)
        
    return val_scores, val_preds, test_preds

In [8]:
val_scores, val_preds, test_preds = cross_val_train(X, y, test, params)

Training until validation scores don't improve for 70 rounds
[100]	valid_0's multi_logloss: 0.488542
[200]	valid_0's multi_logloss: 0.446398
[300]	valid_0's multi_logloss: 0.436426
[400]	valid_0's multi_logloss: 0.433082
[500]	valid_0's multi_logloss: 0.431334
[600]	valid_0's multi_logloss: 0.430947
[700]	valid_0's multi_logloss: 0.430497
[800]	valid_0's multi_logloss: 0.43024
Early stopping, best iteration is:
[817]	valid_0's multi_logloss: 0.430187
(68866, 36)
(68866,)
(68866, 3)
(68866,)
Fold: 0  Train R2: 0.8712  Val R2: 0.83338
--------------------------------------------------
Training until validation scores don't improve for 70 rounds
[100]	valid_0's multi_logloss: 0.478098
[200]	valid_0's multi_logloss: 0.433944
[300]	valid_0's multi_logloss: 0.42396
[400]	valid_0's multi_logloss: 0.420533
[500]	valid_0's multi_logloss: 0.419043
[600]	valid_0's multi_logloss: 0.418404
[700]	valid_0's multi_logloss: 0.418085
[800]	valid_0's multi_logloss: 0.417908
[900]	valid_0's multi_logloss:

In [9]:
val_preds_out = np.argmax(val_preds, axis=1)
accuracy_score(y, val_preds_out)

0.833411746255783

In [10]:
y_test=np.argmax(test_preds, axis=1)
y_test

array([0, 2, 2, ..., 0, 0, 0], dtype=int64)

In [11]:
y_test = label_encoder.inverse_transform(y_test)

In [12]:
submission = pd.read_csv("sample_submission.csv")
submission['Target'] = y_test
submission.to_csv('submission_second.csv',index=False)
submission

Unnamed: 0,id,Target
0,76518,Dropout
1,76519,Graduate
2,76520,Graduate
3,76521,Graduate
4,76522,Enrolled
...,...,...
51007,127525,Dropout
51008,127526,Dropout
51009,127527,Dropout
51010,127528,Dropout
