In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import RFECV
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
import joblib
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier


def main():
  # data load
  df = pd.read_csv('./data/'+ file_model + '.csv', header=0,dtype=my_dtype)
  ID = df.iloc[:,0] 
  y = df.iloc[:,1]
  X = df.iloc[:,2:]

  # preprocessing-1: one-hot encoding
  X_ohe = pd.get_dummies(X, dummy_na=True, columns=ohe_cols)
  X_ohe = X_ohe.dropna(axis=1, how='all')
  X_ohe_columns = X_ohe.columns.values

  # preprocessing-2: null imputation
  imp = SimpleImputer()
  imp.fit(X_ohe)
  X_ohe = pd.DataFrame(imp.transform(X_ohe), columns=X_ohe_columns)
  print(X_ohe.shape)

  # preprocessing-3: feature selection
  selector = RFECV(estimator=RandomForestClassifier(random_state=0), step=1)
  selector.fit(X_ohe, y)
  X_ohe_selected = selector.transform(X_ohe)
  X_ohe_selected = pd.DataFrame(X_ohe_selected, columns=X_ohe_columns[selector.support_])
  print(X_ohe_selected.shape)
  print(display(X_ohe_selected.head()))

  # preprocessing-4: preprocessing of a score data along with a model dataset
  if len(file_score)>0:
      # load score data
      dfs = pd.read_csv('./data/'+ file_score + '.csv', header=0,dtype=my_dtype)
      IDs = dfs.iloc[:,[0]] 
      Xs = dfs.iloc[:,2:]
      Xs_ohe = pd.get_dummies(Xs, dummy_na=True, columns=ohe_cols)
      Xs_ohe = Xs_ohe.dropna(axis=1, how='all')
      cols_m = pd.DataFrame(None, columns=X_ohe_columns, dtype=float)

      # consistent with columns set
      Xs_exp = pd.concat([cols_m, Xs_ohe])
      Xs_exp.loc[:,list(set(X_ohe_columns)-set(Xs_ohe.columns.values))] = \
         Xs_exp.loc[:,list(set(X_ohe_columns)-set(Xs_ohe.columns.values))].fillna(0, axis=1)
      Xs_exp = Xs_exp.drop(list(set(Xs_ohe.columns.values)-set(X_ohe_columns)), axis=1)

      # re-order the score data columns
      Xs_exp = Xs_exp.reindex(X_ohe_columns, axis=1)
      Xs_exp = pd.DataFrame(imp.transform(Xs_exp), columns=X_ohe_columns)
      Xs_exp_selected = Xs_exp.loc[:, X_ohe_columns[selector.support_]]
      print(Xs_exp_selected.shape)
      print(display(Xs_exp_selected.head()))

  # modeling
  results_list =[]
  for model_name, clf in models.items():
    clf.fit(X_ohe_selected, y)
    joblib.dump(clf, './model/'+ model_name + '.pkl')
    results = cross_val_score(clf, X_ohe_selected, y, scoring='roc_auc', cv=5) 
    results_list.append((model_name, np.average(results),'+-', np.std(results)))
   
  results_list = sorted(results_list, key=lambda x:x[:]) 
  print(results_list)
  
  # scoring
  if len(file_score)>0: 
    for model_name, clf in models.items():
      score = pd.DataFrame(clf.predict_proba(Xs_exp_selected)[:,1], columns=['pred_score'])
      IDs.join(score).to_csv('./data/'+  model_name + '_' + file_score + '_with_pred.csv', index=False)

  
if __name__ == '__main__':

  # SET PARAMETERS
  file_model = 'final_hr_analysis_train'
  file_score = 'final_hr_analysis_test'
  ohe_cols = ['sales','salary']
  my_dtype = {'sales':object,
            'salary':object}
  
  models = {
      'mlp':Pipeline([('scl',StandardScaler()),('est',MLPClassifier(hidden_layer_sizes=(3,3),max_iter=1000,random_state=1))]),
      'gb': Pipeline([('scl',StandardScaler()),('est',GradientBoostingClassifier(random_state=1))]),
      'rf': Pipeline([('scl',StandardScaler()),('est',RandomForestClassifier(random_state=1))]),
      'xgb': Pipeline([('scl',StandardScaler()),('est',xgb.XGBClassifier(random_state=1))]),
      'lgb': Pipeline([('scl',StandardScaler()),('est',lgb.LGBMClassifier(random_state=1))]),
      'cat': Pipeline([('scl',StandardScaler()),('est',CatBoostClassifier(random_state=1,verbose=False))])
  }

In [2]:
main()

(10499, 22)
(10499, 5)


Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company
0,0.53,0.52,2.0,135.0,4.0
1,0.77,0.53,5.0,256.0,3.0
2,0.89,0.79,3.0,149.0,2.0
3,0.64,0.63,3.0,156.0,6.0
4,0.98,0.74,4.0,151.0,3.0


None
(4500, 5)


Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company
0,0.44,0.57,2.0,141.0,3.0
1,0.55,0.96,3.0,194.0,3.0
2,0.72,0.67,5.0,210.0,2.0
3,0.96,0.75,4.0,177.0,2.0
4,0.96,0.54,3.0,198.0,3.0


None
[('cat', 0.9924031893656355, '+-', 0.002485584534438711), ('gb', 0.9866723007563565, '+-', 0.002401204140876155), ('lgb', 0.9932278176683669, '+-', 0.0021654516781375237), ('mlp', 0.9727589140660701, '+-', 0.003461216288603106), ('rf', 0.9916344070928951, '+-', 0.0026205418703900336), ('xgb', 0.9864041851176424, '+-', 0.002853059489291966)]
