In [8]:
import numpy as np
import pandas as pd

from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
#import category_encoders as ce
from sklearn.metrics import f1_score

import xgboost as xgb

In [9]:
# Load the training set; the path is relative to the notebook's directory.
train_csv = '../data/train.csv'
df = pd.read_csv(train_csv)

In [10]:
# Print the column names, non-null counts and dtypes of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38312 entries, 0 to 38311
Data columns (total 19 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   EmployeeNo                           38312 non-null  object 
 1   Division                             38312 non-null  object 
 2   Qualification                        36633 non-null  object 
 3   Gender                               38312 non-null  object 
 4   Channel_of_Recruitment               38312 non-null  object 
 5   Trainings_Attended                   38312 non-null  int64  
 6   Year_of_birth                        38312 non-null  int64  
 7   Last_performance_score               38312 non-null  float64
 8   Year_of_recruitment                  38312 non-null  int64  
 9   Targets_met                          38312 non-null  int64  
 10  Previous_Award                       38312 non-null  int64  
 11  Training_score_average      

In [11]:
# Print, column by column, how often each distinct value occurs
# (most frequent first).
for column in df.columns:
    frequencies = df[column].value_counts().sort_values(ascending=False)
    print(column, '\n', frequencies, '\n')

EmployeeNo 
 YAK/S/48710    1
YAK/S/01051    1
YAK/S/34540    1
YAK/S/08463    1
YAK/S/50374    1
              ..
YAK/S/33563    1
YAK/S/14064    1
YAK/S/04174    1
YAK/S/28459    1
YAK/S/26751    1
Name: EmployeeNo, Length: 38312, dtype: int64 

Division 
 Commercial Sales and Marketing                 11695
Customer Support and Field Operations           7973
Sourcing and Purchasing                         5052
Information Technology and Solution Support     4952
Information and Strategy                        3721
Business Finance Operations                     1786
People/HR Management                            1704
Regulatory and Legal services                    733
Research and Innovation                          696
Name: Division, dtype: int64 

Qualification 
 First Degree or HND         25578
MSc, MBA and PhD            10469
Non-University Education      586
Name: Qualification, dtype: int64 

Gender 
 Male      26880
Female    11432
Name: Gender, dtype: int64 

Channel_o

# Cleaning
## Dropping unneeded columns  
Dropping the EmployeeNo column, because it holds a unique value for every row.  

In [12]:
# Remove the per-row identifier column.
# Rebinding `df` instead of dropping in place, together with errors='ignore',
# keeps this cell re-runnable: a second execution is a no-op rather than a
# KeyError for the already-removed column.
df = df.drop(columns='EmployeeNo', errors='ignore')

## Handling missing data
Qualification has 1679 missing values;
they will be recorded as unknown, using the placeholder '0'.

In [17]:
# Count the rows whose Qualification is missing.
df['Qualification'].isna().sum()

1679

In [18]:
# Replace every missing Qualification with the placeholder string '0'.
df['Qualification'] = df['Qualification'].fillna('0')

In [19]:
# Re-inspect the frame after cleaning: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38312 entries, 0 to 38311
Data columns (total 18 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Division                             38312 non-null  object 
 1   Qualification                        38312 non-null  object 
 2   Gender                               38312 non-null  object 
 3   Channel_of_Recruitment               38312 non-null  object 
 4   Trainings_Attended                   38312 non-null  int64  
 5   Year_of_birth                        38312 non-null  int64  
 6   Last_performance_score               38312 non-null  float64
 7   Year_of_recruitment                  38312 non-null  int64  
 8   Targets_met                          38312 non-null  int64  
 9   Previous_Award                       38312 non-null  int64  
 10  Training_score_average               38312 non-null  int64  
 11  State_Of_Origin             

## Handling the boolean columns

In [20]:
# Turn the Yes/No columns into booleans.
# An explicit label map is used instead of chained str.replace calls:
# str.replace substitutes substrings (and its regex default differs between
# pandas versions), so it can silently mangle any value that merely contains
# 'Yes' or 'No'.
yes_no_map = {'Yes': True, 'No': False}
for col in ['Previous_IntraDepartmental_Movement', 'Past_Disciplinary_Action', 'Foreign_schooled']:
    if df[col].dtype == bool:
        # Already converted (the cell was run before) - nothing left to do.
        continue
    converted = df[col].str.strip().map(yes_no_map)
    unmapped = converted.isna()
    if unmapped.any():
        # Fail loudly on missing values or labels other than 'Yes'/'No'
        # instead of guessing what they should become.
        raise ValueError(f'{col}: unexpected values {df.loc[unmapped, col].unique()!r}')
    df[col] = converted.astype('bool')

In [21]:
# Cast the target column to bool.
df['Promoted_or_Not'] = df['Promoted_or_Not'].astype('bool')

In [None]:
# Derived feature: years since recruitment, measured against the latest
# recruitment year present in the data; the +1 makes the newest cohort count as 1.
# Series.max() is vectorised and NaN-aware, unlike the Python builtin max(),
# which iterates the Series element by element.
latest_recruitment_year = df['Year_of_recruitment'].max()
df['Year_since_recruitment'] = latest_recruitment_year - df['Year_of_recruitment'] + 1

In [None]:
# Separate the rows by target value so that the two classes can be
# resampled into a balanced dataset.
not_promoted_mask = df['Promoted_or_Not'] == 0
promoted_mask = df['Promoted_or_Not'] == 1
df_no_promo = df.loc[not_promoted_mask]
df_yes_promo = df.loc[promoted_mask]

In [None]:
# Number of rows in the not-promoted subset.
len(df_no_promo)

In [None]:
# Number of rows in the promoted subset.
len(df_yes_promo)

In [None]:
# Upsample the promoted rows (drawing with replacement) until there are as
# many of them as not-promoted rows.
# NOTE(review): this runs before the train/test split, so copies of one
# original row can end up on both sides of that split - confirm the split
# accounts for the duplicates.
target_size = len(df_no_promo)
to_blance_promo = resample(
    df_yes_promo,
    replace=True,
    n_samples=target_size,
    random_state=42,
)

In [None]:
# Size of the upsampled promoted set.
len(to_blance_promo)

In [None]:
# Balanced dataset: all not-promoted rows plus the upsampled promoted rows.
df_b = pd.concat([df_no_promo, to_blance_promo])

# Columns left out of the feature matrix: the target itself, the raw
# recruitment year, and the columns still stored as text/categories.
non_feature_columns = [
    'Year_of_recruitment', 'Promoted_or_Not', 'Division', 'Qualification',
    'Channel_of_Recruitment', 'State_Of_Origin', 'Marital_Status',
    'No_of_previous_employers', 'Gender',
]
X = df_b.drop(columns=non_feature_columns)
# 'EmployeeNo' is already removed in the cleaning section, so listing it in a
# strict drop raised KeyError. Drop it leniently: a no-op when it is gone,
# still excluded if it happens to be present.
X = X.drop(columns='EmployeeNo', errors='ignore')
y = df_b['Promoted_or_Not']

In [None]:
# Print the column names, non-null counts and dtypes of the balanced frame.
df_b.info()

In [None]:
# Split by ORIGINAL row, not by resampled row.
# X contains rows that were drawn with replacement, and those copies keep the
# index label of the row they were copied from. A plain row-wise split can put
# one copy in the training set and another in the test set, so the model is
# scored on rows it has already seen. Splitting the unique index labels keeps
# every copy of a row on the same side.
# (Assumes the index labels of the source frame are unique - it is a
# RangeIndex as loaded.)
row_labels = X.index.unique().to_numpy()
train_rows, test_rows = train_test_split(row_labels, random_state=42)
in_train = X.index.isin(train_rows)
in_test = X.index.isin(test_rows)
X_train, X_test = X[in_train], X[in_test]
y_train, y_test = y[in_train], y[in_test]

In [None]:
# Baseline: random forest with the Gini criterion.
# random_state is fixed (same seed as the resampling and the split) so that
# the score below is reproducible from one run to the next.
rf = RandomForestClassifier(
    criterion='gini',
    n_estimators=150,
    max_depth=4,
    n_jobs=-1,
    random_state=42,
)

rf.fit(X_train, y_train)

rf_predict = rf.predict(X_test)

# For a single-label problem, micro-averaged F1 is the same number as accuracy.
f1_score(y_test, rf_predict, average='micro')

In [None]:
# Hyper-parameter grid: every (max_depth, n_estimators) pair, ordered by
# depth first and then by number of trees.
n_trees = list(range(100, 200))
depths = list(range(3, 11))
var_matrix = [(depth, n_tree) for depth in depths for n_tree in n_trees]

In [None]:
# Exhaustive search over every (max_depth, n_estimators) pair in var_matrix.
# This fits one forest per pair, so expect a long run.
# NOTE(review): candidates are ranked on X_test, so the winning score is an
# optimistic estimate - consider a separate validation set or cross-validation.
best = [0, 0, 0]  # [max_depth, n_estimators, micro-F1] of the best model so far
for depth, n_tree in var_matrix:
    # A fixed seed makes score differences come from the hyper-parameters
    # rather than from run-to-run randomness of the forest.
    rf = RandomForestClassifier(
        criterion='gini',
        n_estimators=n_tree,
        max_depth=depth,
        n_jobs=-1,
        random_state=42,
    )
    rf.fit(X_train, y_train)
    rf_predict = rf.predict(X_test)
    f_score = f1_score(y_test, rf_predict, average='micro')
    if f_score > best[2]:
        best[0], best[1], best[2] = depth, n_tree, f_score
        # Log after the update and only on improvement: the last printed line
        # is always the current best, and the output is not flooded with one
        # stale line per iteration.
        print(best[0], best[1], best[2])