# import libraries

In [None]:
import numpy as np
import pandas as pd
import numpy as np
from itertools import cycle
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import roc_curve, f1_score
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import ADASYN

# load training data and preview the first 5 rows

In [None]:
# Read the labelled training set from the working directory.
data = pd.read_csv('train.csv')
# Preview the first rows (head() defaults to n=5).
data.head()

# check data info

In [None]:
# Print a concise summary of the frame: column names, non-null counts and
# dtypes.
data.info()

# label encode all categorical columns and fill missing values

In [None]:
# Replace every categorical column with integer codes.  One LabelEncoder
# instance is re-fitted for each column, so once this cell has run `le` only
# holds the classes of the last column it was fitted on (Qualification).
le = LabelEncoder()
categorical_columns = (
    "Division",
    "Gender",
    "Channel_of_Recruitment",
    "Foreign_schooled",
    "Marital_Status",
    "Past_Disciplinary_Action",
    "State_Of_Origin",
    "Previous_IntraDepartmental_Movement",
)
for column in categorical_columns:
    data[column] = le.fit_transform(data[column])

# Map the 'More than 5' bucket to 6 so the column can be parsed as numeric.
employers = data['No_of_previous_employers'].replace('More than 5', 6)
data['No_of_previous_employers'] = pd.to_numeric(employers)

# Missing qualifications are filled with 'First Degree or HND' before the
# column is encoded.
qualification = data['Qualification'].fillna('First Degree or HND')
data['Qualification'] = le.fit_transform(qualification)

# Remove the EmployeeNo column from the frame, then preview the result.
data.drop(columns=['EmployeeNo'], inplace=True)
data.head(5)

# separate the training data into dependent and independent variables

In [None]:
# Separate the frame into the feature matrix X (every column except the
# target) and the target vector y.
target_column = 'Promoted_or_Not'
X = data.drop(columns=[target_column])
y = data[target_column]

# check the class distribution of the dependent variable

In [None]:
# Report how many rows belong to each target class.
promoted_flag = data['Promoted_or_Not']
print("Not Promoted:", (promoted_flag == 0).sum())  # class = 0
print("Promoted:", (promoted_flag == 1).sum())  # class = 1

# since the dependent variable (the target) is imbalanced, oversample the minority class and split into train and test

In [None]:
# Hold out the evaluation split BEFORE oversampling.  Resampling first would
# place synthetic neighbours of training rows in the test set (leakage) and
# inflate the reported F1 score.  stratify=y keeps the original class ratio
# in both splits, so X_test/y_test reflect the real class distribution.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y
)

# Oversample the minority class in the training portion only.
# fit_resample is the current imbalanced-learn API; the older fit_sample
# alias is no longer available in recent releases.
sm = ADASYN(random_state=1)
X_train, y_train = sm.fit_resample(X_train, y_train)

# tune and fit an xgboost classifier on the training data using a randomized hyper-parameter search

In [None]:
# Search space sampled by RandomizedSearchCV.  Every key appears exactly
# once: a repeated key in a dict literal silently keeps only its last value.
# Constant settings (threads, verbosity) live on the estimator below rather
# than in the search space.
params = {
    'min_child_weight': [4],
    'gamma': [i / 10.0 for i in range(3, 6)],
    'learning_rate': [0.03, 0.05, 0.07, 0.1],
    'subsample': [i / 10.0 for i in range(6, 11)],
    'colsample_bytree': [i / 10.0 for i in range(6, 11)],
    'max_depth': [2, 3, 4],
    'n_estimators': [1000],
}

# Base estimator: only the settings the search does NOT override are given
# here.  learning_rate, subsample, n_estimators, max_depth, colsample_bytree
# and gamma always come from `params`, so stating them here as well would be
# misleading.  `verbosity` supersedes the deprecated `silent` flag and
# `n_jobs` supersedes the deprecated `nthread`.
base_model = XGBClassifier(
    objective='binary:hinge',
    booster='dart',
    scale_pos_weight=1,
    reg_alpha=0.28,
    n_jobs=4,
    verbosity=3,
)

# Optimise the metric that is actually reported afterwards (F1) instead of
# the default accuracy, and seed the sampler so the ten sampled candidates
# are reproducible between runs.
XGB_model = RandomizedSearchCV(
    base_model,
    params,
    n_iter=10,
    scoring='f1',
    n_jobs=1,
    random_state=1,
)
XGB_model.fit(X_train, y_train)

# use the xgboost model to predict on the test evaluation set and calculate f1 score

In [None]:
# Predict on the held-out split with the fitted search object, round each
# prediction to an integer label, then print the F1 score against y_test.
XGB_predictions = list(map(round, XGB_model.predict(X_test)))
score = f1_score(y_test, XGB_predictions)
print(score)

# load and predict using the test dataset provided and write predictions to csv file

In [None]:
# Read the unlabelled set once.  An untouched copy is kept in `test_id` so
# the EmployeeNo column is still available for the submission after it has
# been dropped from the feature frame.
test = pd.read_csv('test.csv')
test_id = test.copy()

# Same clean-up as applied to the training frame.
test['Qualification'] = test['Qualification'].fillna('First Degree or HND')
test['No_of_previous_employers'] = pd.to_numeric(
    test['No_of_previous_employers'].replace('More than 5', 6)
)

# Encode categoricals with the categories found in the TRAINING file.
# Re-fitting an encoder on the test file would assign different integers to
# the same label whenever the two files do not contain exactly the same set
# of categories, silently corrupting the model's inputs.
encoded_columns = [
    "Division",
    "Gender",
    "Channel_of_Recruitment",
    "Foreign_schooled",
    "Marital_Status",
    "Past_Disciplinary_Action",
    "State_Of_Origin",
    "Previous_IntraDepartmental_Movement",
    "Qualification",
]
train_reference = pd.read_csv('train.csv', usecols=encoded_columns)
train_reference['Qualification'] = (
    train_reference['Qualification'].fillna('First Degree or HND')
)
for column in encoded_columns:
    # classes_ is the sorted set of training labels; a label's position in
    # it is exactly the code fit_transform assigned to it at training time.
    le.fit(train_reference[column])
    codes = pd.Categorical(test[column], categories=le.classes_).codes
    unseen = int((codes == -1).sum())
    if unseen:
        # Values absent from the training data have no valid code; they are
        # kept as -1 rather than shifting every other category.
        print(f"warning: {column} has {unseen} value(s) not seen in "
              f"training; encoded as -1")
    test[column] = codes

test.drop(columns=['EmployeeNo'], inplace=True)

# Predict with the fitted search object created in the training cell.
pred = XGB_model.predict(test)
pred = [int(round(value)) for value in pred]

output = pd.DataFrame({'EmployeeNo': test_id.EmployeeNo, 'Promoted_or_Not': pred})
output.to_csv('submission.csv', index=False)