In [1]:
import numpy as np
import pandas as pd
import numpy as np
from itertools import cycle
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_curve, f1_score
from sklearn.preprocessing import LabelEncoder, LabelBinarizer,StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.base import BaseEstimator,TransformerMixin

In [2]:
# Read the labelled training set and preview its first five rows.
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,EmployeeNo,Division,Qualification,Gender,Channel_of_Recruitment,Trainings_Attended,Year_of_birth,Last_performance_score,Year_of_recruitment,Targets_met,Previous_Award,Training_score_average,State_Of_Origin,Foreign_schooled,Marital_Status,Past_Disciplinary_Action,Previous_IntraDepartmental_Movement,No_of_previous_employers,Promoted_or_Not
0,YAK/S/00001,Commercial Sales and Marketing,"MSc, MBA and PhD",Female,Direct Internal process,2,1986,12.5,2011,1,0,41,ANAMBRA,No,Married,No,No,0,0
1,YAK/S/00002,Customer Support and Field Operations,First Degree or HND,Male,Agency and others,2,1991,12.5,2015,0,0,52,ANAMBRA,Yes,Married,No,No,0,0
2,YAK/S/00003,Commercial Sales and Marketing,First Degree or HND,Male,Direct Internal process,2,1987,7.5,2012,0,0,42,KATSINA,Yes,Married,No,No,0,0
3,YAK/S/00004,Commercial Sales and Marketing,First Degree or HND,Male,Agency and others,3,1982,2.5,2009,0,0,42,NIGER,Yes,Single,No,No,1,0
4,YAK/S/00006,Information and Strategy,First Degree or HND,Male,Direct Internal process,3,1990,7.5,2012,0,0,77,AKWA IBOM,Yes,Married,No,No,1,0


In [3]:
# Separate the feature columns (X) from the target column (y).
X = data.drop(columns=['Promoted_or_Not'])
y = data['Promoted_or_Not']

In [4]:
# Manually curated lookup tables for the categorical variables. The numeric
# values were chosen by hand (intuition / domain knowledge), not learned.
# NOTE(review): of these tables only `dept_to_number` is referenced by the
# feature transformer visible in this notebook; the others appear to be kept
# for experiments.

# Ordinal encoding of the highest qualification.
qualification_to_number = {
    'First Degree or HND': 2,
    'MSc, MBA and PhD': 3,
    'Non-University Education': 1,
}

genderMap = {"Male": 1.1, "Female": 3.5}

# Two opposite encodings for Yes/No columns.
yesnoMap = {"Yes": 1, "No": 0}
noyesMap = {"Yes": 0, "No": 1}

# Coarse regional grouping of each state of origin into three buckets.
# BAUCHI was previously listed as "IGBO"; it now matches the other northern
# states, which all map to "HAUSA".
states_to_tribe = {
    'ABIA': "IGBO",
    'ADAMAWA': "HAUSA",
    'AKWA IBOM': "IGBO",
    'ANAMBRA': "IGBO",
    'BAUCHI': "HAUSA",
    'BAYELSA': "IGBO",
    'BENUE': "HAUSA",
    'BORNO': "HAUSA",
    'CROSS RIVER': "IGBO",
    'DELTA': "IGBO",
    'EBONYI': "IGBO",
    'EDO': "IGBO",
    'EKITI': "YORUBA",
    'ENUGU': "IGBO",
    'FCT': "HAUSA",
    'GOMBE': "HAUSA",
    'IMO': "IGBO",
    'JIGAWA': "HAUSA",
    'KADUNA': "HAUSA",
    'KANO': "HAUSA",
    'KATSINA': "HAUSA",
    'KEBBI': "HAUSA",
    'KOGI': "HAUSA",
    'KWARA': "HAUSA",
    'LAGOS': "YORUBA",
    'NASSARAWA': "HAUSA",
    'NIGER': "HAUSA",
    'OGUN': "YORUBA",
    'ONDO': "YORUBA",
    'OSUN': "YORUBA",
    'OYO': "YORUBA",
    'PLATEAU': "HAUSA",
    'RIVERS': "IGBO",
    'SOKOTO': "HAUSA",
    'TARABA': "HAUSA",
    'YOBE': "HAUSA",
    'ZAMFARA': "HAUSA",
}

# Hand-assigned numeric weight per division.
# NOTE(review): the origin of these numbers is not shown in the notebook.
dept_to_number = {
    "Commercial Sales and Marketing": 50.0,
    "Customer Support and Field Operations": 31.4,
    "Business Finance Operations": 30.7,
    "People/HR Management": 30.4,
    "Sourcing and Purchasing": 19.5,
    "Regulatory and Legal services": 18.0,
    "Information Technology and Solution Support": 7.3,
    "Information and Strategy": 5.1,
    "Research and Innovation": 4.3,
}

maritalMap = {"Married": 1, "Single": 0.5}

def map_employers(x, default=5):
    """Convert a previous-employer count to an int.

    Parameters
    ----------
    x : value accepted by ``int()`` or anything else.
    default : int, value returned when ``x`` cannot be converted
        (free-text entries, ``None``, NaN, infinity). Defaults to 5.

    Returns
    -------
    int
        ``int(x)`` when the conversion succeeds, otherwise ``default``.
    """
    try:
        return int(x)
    # Only conversion failures fall back; a bare ``except`` would also hide
    # KeyboardInterrupt / SystemExit and genuine programming errors.
    except (TypeError, ValueError, OverflowError):
        return default

In [5]:
# Build a target-mean encoder for one categorical column: each category is
# replaced by the mean of the target among the rows carrying that category.

def mapCategoricalToAverage(name, source=None, target='Promoted_or_Not'):
    """Return a function mapping a category of ``name`` to its mean target.

    Parameters
    ----------
    name : str
        Column to group by.
    source : pandas.DataFrame, optional
        Frame the means are computed from. Defaults to the module-level
        training frame ``data``.
    target : str
        Column whose per-category mean is computed.

    Returns
    -------
    callable
        ``f(category) -> float``. Categories that do not occur in ``source``
        (including NaN, which ``groupby`` drops) map to the overall mean of
        ``target`` instead of raising ``KeyError``.

    Notes
    -----
    The means are computed on the full labelled frame, so the encoded
    features contain information about the target of the very rows they are
    later used to train and validate on.
    """
    frame = data if source is None else source
    mapping = frame.groupby(name)[target].mean().to_dict()
    fallback = frame[target].mean()
    return lambda x: mapping.get(x, fallback)

In [6]:
# Columns that are already numeric and are passed through unchanged.
numeric_columns = [
    'Trainings_Attended',
    'Year_of_birth',
    'Last_performance_score',
    'Year_of_recruitment',
    'Targets_met',
    'Previous_Award',
    'Training_score_average',
]
class myTransformer():
    """Turn the raw employee frame into a purely numeric feature matrix.

    ``fit`` learns the label binarisers for ``Division``,
    ``Channel_of_Recruitment`` and ``State_Of_Origin``; ``transform`` applies
    them and appends target-mean encodings, hand-crafted interaction features
    and the untouched numeric columns.
    """

    def __init__(self):
        self.division_binarizer = LabelBinarizer()
        self.channel_binariser = LabelBinarizer()
        self.tribe_binariser = LabelBinarizer()
        # Created for callers that want to scale the output; not used by
        # fit/transform in this class.
        self.dataScaler = StandardScaler()

    def fit(self, input_data):
        """Fit all binarisers on ``input_data`` and return ``self``.

        Returning ``self`` follows the scikit-learn convention and allows
        ``myTransformer().fit(frame)`` to be used in one expression.
        """
        self.division_binarizer.fit(input_data['Division'])
        self.channel_binariser.fit(input_data['Channel_of_Recruitment'])
        # Despite the attribute name, this binarises the state itself.
        self.tribe_binariser.fit(input_data['State_Of_Origin'])
        return self

    def transform(self, full_dataset, train=False):
        """Encode ``full_dataset`` with the fitted binarisers.

        Parameters
        ----------
        full_dataset : pandas.DataFrame
            Frame holding the raw columns read below. It is copied, so the
            caller's frame is not modified.
        train : bool
            Accepted for backward compatibility; currently unused.

        Returns
        -------
        numpy.ndarray
            2-D array whose columns are, in order: the two interaction
            features, the division weight, the binarised division, the
            target-mean encoded qualification and gender, the binarised
            channel and state, the remaining target-mean encodings and
            finally ``numeric_columns``.
        """
        # Work on a copy so the fillna below does not leak into the caller.
        input_data = full_dataset.copy()

        def target_encode(column):
            # Replace each category by the promotion rate looked up through
            # mapCategoricalToAverage (computed on the training frame, not on
            # ``full_dataset``).
            return input_data[column].map(mapCategoricalToAverage(column)).values

        # Binarised (indicator-column) encodings learned in fit().
        division_binarised = self.division_binarizer.transform(input_data['Division'])
        channel_binarised = self.channel_binariser.transform(input_data['Channel_of_Recruitment'])
        state_binarised = self.tribe_binariser.transform(input_data['State_Of_Origin'])

        # Missing qualifications are treated as the lowest category before
        # encoding.
        input_data['Qualification'] = input_data['Qualification'].fillna("Non-University Education")

        # Target-mean encodings.
        qualification_num = target_encode('Qualification')
        gender_num = target_encode('Gender')
        foreign_schooled_num = target_encode('Foreign_schooled')
        past_discipline_num = target_encode('Past_Disciplinary_Action')
        interdep_movement_num = target_encode('Previous_IntraDepartmental_Movement')
        previous_employer_count = target_encode('No_of_previous_employers')

        numeric_data = input_data[numeric_columns].values

        # Hand-crafted features built from the training score.
        training_score = input_data['Training_score_average']
        department_in_number = input_data['Division'].map(dept_to_number)
        qualification_times_scoreavg = (qualification_num * training_score).values
        department_times_scoreavg = (department_in_number * training_score).values

        # Column order matters: downstream models are fitted on this layout.
        fully_transformed = np.c_[
            qualification_times_scoreavg,
            department_times_scoreavg,
            department_in_number.values,
            division_binarised,
            qualification_num,
            gender_num,
            channel_binarised,
            state_binarised,
            foreign_schooled_num,
            past_discipline_num,
            interdep_movement_num,
            previous_employer_count,
            numeric_data,
        ]
        return fully_transformed

In [7]:
# Learn the Division / Channel_of_Recruitment / State_Of_Origin encodings
# from the training features.
transformer = myTransformer()
transformer.fit(X)

In [8]:
# Encode the training features into a plain numeric matrix.
transformed_x_train = transformer.transform(X, train=True)

In [9]:
# Sanity check: (number of rows, number of encoded feature columns).
transformed_x_train.shape

(38312, 65)

In [10]:
# Oversample the minority class with ADASYN.
# `fit_resample` replaces `fit_sample`, which was removed from
# imbalanced-learn. The labels are read from `data` rather than from `y`, so
# re-running this cell does not feed already-oversampled labels back in.
sm = ADASYN(random_state=1)
X, y = sm.fit_resample(transformed_x_train, data['Promoted_or_Not'])

# Split into train and test/validation.
# NOTE(review): the split happens after oversampling, so synthetic points
# derived from a training row can land in the validation part; scores on
# X_test are therefore optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# X = SelectKBest(chi2, k=40).fit_transform(X, y)

In [12]:
# Class balance after oversampling. Series.value_counts is used because the
# top-level pd.value_counts helper is deprecated; wrapping in pd.Series makes
# this work whether `y` is an ndarray or already a Series.
pd.Series(y).value_counts()

1    35490
0    35071
dtype: int64

In [13]:
# Load the provided test set once: `test_id` keeps EmployeeNo for the
# submission file, `test` is the same frame without that identifier column.
test_id = pd.read_csv('test.csv')
test = test_id.drop(columns=['EmployeeNo'])

# Encode with the binarisers fitted on the training data.
transformed_x_test = transformer.transform(test, train=False)

In [None]:
import lightgbm as lgb

# Search space for the randomised LightGBM hyper-parameter search.
# 'min_child_weight' used to be listed twice ([4,5,6] and [4,6]); in a dict
# literal the last entry wins, so only the effective one is kept.
gridParams = {
    'num_leaves': list(range(8, 92, 4)),
    'min_data_in_leaf': [10, 20, 40, 60, 100],
    'bagging_freq': [3, 4, 5, 6, 7],
    'learning_rate': [.03, 0.05, .07, 0.01, 0.005],
    'subsample': [i/10.0 for i in range(6, 11)],
    'min_child_weight': [4, 6],
    'silent': [1],
    'boosting_type': ['gbdt', 'dart'],
    'colsample_bytree': [i/10.0 for i in range(6, 11)],
    'max_depth': [3, 4, 5, 6, 8, 12, 16, -1],
    'bagging_fraction': np.linspace(0.6, 0.95, 10),
    'reg_alpha': np.linspace(0.1, 0.95, 10),
    'reg_lambda': np.linspace(0.1, 0.95, 10),
}

# NOTE(review): metric='mae' is unusual for a classifier - confirm it is intended.
gbm0 = lgb.LGBMClassifier(n_estimators=5000, verbosity=-1, metric='mae')
# random_state makes the sampled parameter combinations reproducible.
gbm0 = RandomizedSearchCV(gbm0, gridParams, cv=4, n_jobs=-1, random_state=42)

# `verbose` is no longer forwarded to fit(): recent LightGBM releases reject
# that keyword, and logging is already controlled by `verbosity` above.
gbm0.fit(X, y)
print(gbm0.best_params_)

In [None]:
# Predict the test set with the tuned LightGBM search and write the
# submission file.
pred = [int(round(label)) for label in gbm0.predict(transformed_x_test)]

output = pd.DataFrame({
    'EmployeeNo': test_id.EmployeeNo,
    'Promoted_or_Not': pred,
})
output.to_csv('staffCVsubmission1.csv', index=False)

In [None]:
# Search space for the randomised XGBoost hyper-parameter search.
# 'min_child_weight' and 'colsample_bytree' used to be listed more than once;
# in a dict literal the last entry wins, so only the effective ones are kept.
params = {
    'min_child_weight': [4, 6],
    'nthread': [5, 6, 7],
    'gamma': [i/10.0 for i in range(3, 6)],
    'learning_rate': [.03, 0.05, .07],
    'subsample': [i/10.0 for i in range(6, 11)],
    'silent': [1],
    'booster': ['dart'],
    'colsample_bytree': [i/10.0 for i in range(6, 11)],
    'max_depth': [2, 3, 4],
    'n_estimators': [5000],
}

# A regressor with a logistic objective: predict() yields scores that the
# submission cell rounds to 0/1.
# NOTE(review): the search therefore uses the regressor's default scorer,
# not a classification metric - confirm that is intended.
XGB_model = XGBRegressor(nthread=-1, objective='binary:logistic', missing=None, seed=8)
# random_state makes the sampled parameter combinations reproducible.
XGB_model = RandomizedSearchCV(XGB_model, params, n_jobs=-1, n_iter=10, random_state=42)
XGB_model.fit(X, y, verbose=True)
print(XGB_model.best_params_)

In [None]:
# Predict the test set with the tuned XGBoost search, round the scores to
# 0/1 and write the submission file.
pred = [int(round(score)) for score in XGB_model.predict(transformed_x_test)]

output = pd.DataFrame({
    'EmployeeNo': test_id.EmployeeNo,
    'Promoted_or_Not': pred,
})
output.to_csv('staffCVsubmission.csv', index=False)