In [None]:
# for mathematical operations
import numpy as np
# for dataframe operations
import pandas as pd

# for data visualizations
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

# for machine learning
import sklearn
import imblearn
from sklearn.model_selection import train_test_split

# setting up the size of the figures
from pylab import rcParams
plt.rcParams['figure.figsize'] = (16, 5)
# setting up the style of the plot
plt.style.use('fivethirtyeight')
%matplotlib inline

In [None]:
# Mount Google Drive (if not already mounted)
# Colab-specific step: exposes Drive files under /content/drive, which is
# where the dataset path used by the loading cell points.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Import Dataset
# Reads the CSV from the mounted Drive folder and previews the first rows.
# NOTE(review): the path is hard-coded to one Drive layout; it has to be
# edited if the file lives elsewhere.
df_fna = pd.read_csv('/content/drive/MyDrive/613 Capstone/Data_MissingValuesHandled.csv')
df_fna.head()

Unnamed: 0.1,Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [None]:
# Build the feature frame that later cells transform.
# Dropped columns: the target ('is_promoted') plus 'Unnamed: 0' and
# 'employee_id', which will not be used as features affecting promotions.
df_features = df_fna.drop(['Unnamed: 0','employee_id','is_promoted'],axis=1)
# The target is kept as its own Series.
target = df_fna.is_promoted
print(df_features.head())
print(target.head())

          department     region         education gender recruitment_channel  \
0  Sales & Marketing   region_7  Master's & above      f            sourcing   
1         Operations  region_22        Bachelor's      m               other   
2  Sales & Marketing  region_19        Bachelor's      m            sourcing   
3  Sales & Marketing  region_23        Bachelor's      m               other   
4         Technology  region_26        Bachelor's      m               other   

   no_of_trainings  age  previous_year_rating  length_of_service  \
0                1   35                   5.0                  8   
1                1   30                   5.0                  4   
2                1   34                   3.0                  7   
3                2   39                   1.0                 10   
4                1   45                   3.0                  2   

   KPIs_met >80%  awards_won?  avg_training_score  
0              1            0                  49  
1     

Split data into train and test sets

In [None]:
# Hold out 30% of the rows as the test set; random_state is fixed so the split is repeatable.
# NOTE(review): the split is not stratified on `target` - confirm whether stratify=target is wanted.
x_train,x_test,y_train, y_test = train_test_split(df_features,target,test_size=0.3,random_state=43)

## Write function to drop unwanted features. This function takes the feature dataframe and a list of features as parameters and returns the updated feature dataframe with the specified features dropped (carry this out with both train and test feature sets).

In [None]:
def drop_features(df, features):
    """Return ``df`` without the columns named in ``features``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame to trim. It is left unchanged; ``DataFrame.drop``
        produces a new frame.
    features : list of str
        Column labels to remove.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every column of ``df`` except ``features``.
    """
    return df.drop(columns=features)


example how to call/use function

In [None]:
#df_features = drop_features(df_features,['region','age','awards_won?'])
#df_features.head()

## Write function to use label encoder on features. This function will take the feature dataframe and a list of features as parameters, and return the updated feature dataframe with those specified features label encoded

In [None]:
def encode_labels_train(df, features):
  """Label-encode the given columns of a single feature frame.

  Each listed column is replaced by the integer codes produced by a
  ``LabelEncoder`` fitted on that same column.

  Parameters
  ----------
  df : pandas.DataFrame
      Feature frame. It is not modified; the encoding is applied to a copy.
  features : list of str
      Columns to label-encode.

  Returns
  -------
  pandas.DataFrame
      Copy of ``df`` with ``features`` replaced by integer codes.
  """
  from sklearn.preprocessing import LabelEncoder
  # Work on a copy: assigning into the caller's frame would change the input
  # seen by every later cell that reuses the same variable.
  encoded = df.copy()
  for feature in features:
    # One encoder per column, so no fitted state is carried between columns.
    encoded[feature] = LabelEncoder().fit_transform(encoded[feature])
  return encoded

## Adjusted function to take train and test feature dataframes, as well as list of features and return the transformed label encoded feature dataframes for both train and test data

In [None]:
def encode_labels(df_train, df_test, features):
  """Label-encode columns of the train and test frames with a train-fitted encoder.

  For every column in ``features`` a ``LabelEncoder`` is fitted on the
  training column and then applied to the test column, so both splits share
  the same label-to-integer mapping.

  Parameters
  ----------
  df_train : pandas.DataFrame
      Training features. Not modified.
  df_test : pandas.DataFrame
      Test features. Not modified.
  features : list of str
      Columns to label-encode.

  Returns
  -------
  tuple of pandas.DataFrame
      ``(train, test)`` copies with ``features`` replaced by integer codes.

  Raises
  ------
  ValueError
      Raised by ``LabelEncoder.transform`` when the test column contains a
      label that was not present in the training column.
  """
  from sklearn.preprocessing import LabelEncoder
  # Copy first: the same x_train/x_test objects are passed to several
  # helpers, so writing into them would leak state between cells.
  train_enc, test_enc = df_train.copy(), df_test.copy()
  for feature in features:
    le = LabelEncoder()
    # Fit on train only, then reuse the learned mapping for test.
    train_enc[feature] = le.fit_transform(train_enc[feature])
    test_enc[feature] = le.transform(test_enc[feature])
  return train_enc, test_enc

In [None]:
# Label-encode the four categorical columns (mapping learned from the training split), then preview the training result.
train_features, test_features = encode_labels(x_train,x_test,['department','region','gender','recruitment_channel'])
train_features.head()

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
5057,7,17,Bachelor's,1,2,1,42,3.0,14,0,0,48
37770,0,11,Master's & above,1,2,1,43,5.0,14,1,0,87
49117,7,25,Bachelor's,0,0,1,27,3.0,5,0,0,49
48004,4,28,Bachelor's,1,2,1,45,4.0,2,1,0,61
5959,2,11,Bachelor's,1,2,1,34,3.0,4,1,0,56


In [None]:
# Preview the test split returned by the encode_labels call above.
test_features.head()

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
34597,0,14,Bachelor's,0,0,1,28,3.0,1,1,0,86
29983,0,14,Bachelor's,1,1,1,30,3.0,1,0,0,86
49488,0,20,Master's & above,1,2,1,32,3.0,2,0,0,82
16047,7,31,Bachelor's,1,0,1,41,1.0,13,0,0,44
27879,7,14,Bachelor's,0,2,2,30,4.0,2,1,0,50


Example of how to call/use the single-dataframe function `encode_labels_train`:

In [None]:
# encode_labels_train returns only the encoded DataFrame, so bind a single
# name; unpacking it into two names would iterate over the column labels.
train_features = encode_labels_train(x_train,['department','region','gender','recruitment_channel'])
train_features.head()

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
5057,7,17,Bachelor's,1,2,1,42,3.0,14,0,0,48
37770,0,11,Master's & above,1,2,1,43,5.0,14,1,0,87
49117,7,25,Bachelor's,0,0,1,27,3.0,5,0,0,49
48004,4,28,Bachelor's,1,2,1,45,4.0,2,1,0,61
5959,2,11,Bachelor's,1,2,1,34,3.0,4,1,0,56


## Write function to use one hot encoder on features. This function will take the feature dataframe and a list of features as parameters, and return the updated feature dataframe with those features one hot encoded.

In [None]:
def encode_one_hot(df, features):
  """One-hot encode ``features`` of a single frame with ``pandas.get_dummies``.

  The first level of every encoded column is dropped (``drop_first=True``).

  NOTE: a later cell defines another ``encode_one_hot`` that takes
  ``(train, test, features)``; once that cell has run, this single-frame
  version is no longer reachable under this name.

  Parameters
  ----------
  df : pandas.DataFrame
      Feature frame.
  features : list of str
      Columns to expand into indicator columns.

  Returns
  -------
  pandas.DataFrame
      New frame in which ``features`` are replaced by indicator columns.
  """
  return pd.get_dummies(df, columns=features, drop_first=True)

example how to call/use function

In [None]:
#df_features = encode_one_hot(df_features, ['education','gender','recruitment_channel'])
#df_features.head()

## Edit function to take both train and test feature dataframes and a list of features as parameters, and return the updated train and test feature dataframes with those features one hot encoded.

In [None]:
def encode_one_hot(train,test,features):
  """One-hot encode ``features`` in the train and test frames with a train-fitted encoder.

  A ``OneHotEncoder`` is fitted on ``train[features]`` and applied to both
  splits, so both results have the same indicator columns. The indicator
  columns are appended after the remaining original columns.

  The encoded columns are built directly on each frame's own index, so rows
  stay aligned whatever the index looks like (non-consecutive labels, a
  named index, or an existing column called ``index``). The original index,
  including its name, is preserved on the returned frames.

  Parameters
  ----------
  train : pandas.DataFrame
      Training features. Not modified.
  test : pandas.DataFrame
      Test features. Not modified.
  features : list of str
      Columns to one-hot encode.

  Returns
  -------
  tuple of pandas.DataFrame
      ``(train, test)`` with ``features`` replaced by indicator columns.

  Notes
  -----
  The encoder is created with ``handle_unknown='error'``, so a category that
  appears only in ``test`` makes ``transform`` raise.
  """
  from sklearn.preprocessing import OneHotEncoder
  ohe = OneHotEncoder(sparse_output=False, drop=None, categories='auto', handle_unknown='error')
  # Learn the categories from the training split only.
  train_ohe = ohe.fit_transform(train[features])
  test_ohe = ohe.transform(test[features])
  cols_ohe = ohe.get_feature_names_out()

  def _attach(frame, encoded):
    # Give the encoded block the frame's own index so concat aligns row by
    # row instead of by position-vs-label.
    encoded = pd.DataFrame(encoded, columns=cols_ohe, index=frame.index)
    return pd.concat([frame.drop(features, axis=1), encoded], axis=1)

  return _attach(train, train_ohe), _attach(test, test_ohe)

In [None]:
# One-hot encode 'education' and 'gender' in both splits, then preview the training result.
train_features, test_features = encode_one_hot(x_train,x_test,['education','gender'])
train_features.head()

Unnamed: 0,department,region,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,education_Bachelor's,education_Below Secondary,education_Master's & above,gender_f,gender_m
5057,Sales & Marketing,region_25,sourcing,1.0,42.0,3.0,14.0,0.0,0.0,48.0,1.0,0.0,0.0,0.0,1.0
37770,Analytics,region_2,sourcing,1.0,43.0,5.0,14.0,1.0,0.0,87.0,0.0,0.0,1.0,0.0,1.0
49117,Sales & Marketing,region_32,other,1.0,27.0,3.0,5.0,0.0,0.0,49.0,,,,,
48004,Operations,region_4,sourcing,1.0,45.0,4.0,2.0,1.0,0.0,61.0,,,,,
5959,HR,region_2,sourcing,1.0,34.0,3.0,4.0,1.0,0.0,56.0,0.0,0.0,1.0,0.0,1.0


## Write function to bin feature data using KBinsDiscretizer. This function will take the feature dataframe, a list of the features to be binned, number of bins, encode type, and strategy as parameters, and return the updated feature dataframe with those features binned accordingly.

In [None]:
def bin_features(df, features, bins, encode_type, strategy):
  """Discretise ``features`` of a single frame with ``KBinsDiscretizer``.

  NOTE: a later cell defines another ``bin_features`` that takes
  ``(train, test, ...)``; once that cell has run, this single-frame version
  is no longer reachable under this name.

  Parameters
  ----------
  df : pandas.DataFrame
      Feature frame. Not modified; the binned values are written to a copy.
  features : list of str
      Columns to bin.
  bins : int
      Passed to ``KBinsDiscretizer`` as ``n_bins``.
  encode_type : str
      Passed as ``encode``. The result is written back into the same
      ``features`` columns, so presumably only ``'ordinal'`` (one output
      column per input column) fits - TODO confirm for the one-hot options.
  strategy : str
      Passed as ``strategy``.

  Returns
  -------
  pandas.DataFrame
      Copy of ``df`` with ``features`` replaced by their bin codes.
  """
  from sklearn.preprocessing import KBinsDiscretizer
  est = KBinsDiscretizer(n_bins = bins, encode = encode_type, strategy = strategy)
  est.fit(df[features])
  # Write into a copy so the caller's frame keeps its raw values.
  binned = df.copy()
  binned[features] = est.transform(df[features])
  return binned

example how to call/use function
**Note:** possible choices for `encode_type` are `ordinal`, `onehot`, and `onehot-dense`.

**Note:** possible choices for `strategy` are `uniform`, `quantile`, and `kmeans`.

In [None]:
#df_features = bin_features(df_features, ['age','length_of_service'],10, 'ordinal', 'kmeans')
#df_features.head()

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,Sales & Marketing,region_7,Master's & above,f,sourcing,1,3.0,5.0,2.0,1,0,49
1,Operations,region_22,Bachelor's,m,other,1,2.0,5.0,0.0,0,0,60
2,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,3.0,3.0,1.0,0,0,50
3,Sales & Marketing,region_23,Bachelor's,m,other,2,4.0,1.0,2.0,0,0,50
4,Technology,region_26,Bachelor's,m,other,1,6.0,3.0,0.0,0,0,73


## Edit function to take both train and test feature dataframes, a list of the features to be binned, number of bins, encode type, and strategy as parameters, and return the updated train and test feature dataframes with those features binned accordingly.

In [None]:
def bin_features(train, test, features, bins, encode_type, strategy):
  """Discretise ``features`` in both splits using bin edges learned from ``train``.

  Parameters
  ----------
  train : pandas.DataFrame
      Training features; the discretizer is fitted on these. Not modified.
  test : pandas.DataFrame
      Test features; binned with the train-fitted discretizer. Not modified.
  features : list of str
      Columns to bin.
  bins : int
      Passed to ``KBinsDiscretizer`` as ``n_bins``.
  encode_type : str
      Passed as ``encode``. The result is written back into the same
      ``features`` columns, so presumably only ``'ordinal'`` (one output
      column per input column) fits - TODO confirm for the one-hot options.
  strategy : str
      Passed as ``strategy``.

  Returns
  -------
  tuple of pandas.DataFrame
      ``(train, test)`` copies with ``features`` replaced by their bin codes.
  """
  from sklearn.preprocessing import KBinsDiscretizer
  est = KBinsDiscretizer(n_bins = bins, encode = encode_type, strategy = strategy)
  # Bin edges come from the training split only.
  est.fit(train[features])
  # Copies keep x_train/x_test unchanged for the other helpers that reuse them.
  train_binned, test_binned = train.copy(), test.copy()
  train_binned[features] = est.transform(train[features])
  test_binned[features] = est.transform(test[features])
  return train_binned,test_binned

In [None]:
# Bin 'age' and 'length_of_service' into 10 ordinal k-means bins (edges learned from the training split), then preview the test result.
train_features,test_features = bin_features(x_train,x_test, ['age','length_of_service'],10, 'ordinal', 'kmeans')
test_features.head()

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
34597,Analytics,region_22,Bachelor's,f,other,1,1.0,3.0,0.0,1,0,86
29983,Analytics,region_22,Bachelor's,m,referred,1,2.0,3.0,0.0,0,0,86
49488,Analytics,region_28,Master's & above,m,sourcing,1,3.0,3.0,0.0,0,0,82
16047,Sales & Marketing,region_7,Bachelor's,m,other,1,5.0,1.0,3.0,0,0,44
27879,Sales & Marketing,region_22,Bachelor's,f,sourcing,2,2.0,4.0,0.0,1,0,50


## Write function to cap outliers at a chosen value

**Note:** choices for `method` are `lower`, `extreme_lower`, `upper`, `extreme_upper`, `upper_value`, and `lower_value`.

In [None]:
def cap_outliers(df, feature, method, value=None):
  """Cap the outliers of one column at a fixed or IQR-derived threshold.

  NOTE: a later cell defines another ``cap_outliers`` that takes
  ``(train, test, ...)``; once that cell has run, this single-frame version
  is no longer reachable under this name.

  Parameters
  ----------
  df : pandas.DataFrame
      Feature frame. Not modified; the capped column is written to a copy.
  feature : str
      Numeric column to cap.
  method : str
      ``'upper_value'`` / ``'lower_value'``: cap at ``value`` from above /
      below. ``'upper'`` / ``'lower'``: cap at Q3 + 1.5*IQR / Q1 - 1.5*IQR.
      ``'extreme_upper'`` / ``'extreme_lower'``: cap at Q3 + 3*IQR /
      Q1 - 3*IQR.
  value : number, optional
      Threshold for the two ``*_value`` methods; ignored otherwise.

  Returns
  -------
  pandas.DataFrame
      Copy of ``df`` with ``feature`` capped. The column may come back as a
      float column when the threshold is not an integer.

  Raises
  ------
  ValueError
      If ``method`` is not one of the six names above, or if a ``*_value``
      method is requested without ``value``.
  """
  # method name -> (side of the distribution to cap, IQR multiplier)
  iqr_rules = {
    'lower': ('lower', 1.5),
    'upper': ('upper', 1.5),
    'extreme_lower': ('lower', 3),
    'extreme_upper': ('upper', 3),
  }
  if method in ('upper_value', 'lower_value'):
    if value is None:
      raise ValueError("method %r requires a numeric `value`" % method)
    side, bound = method.split('_')[0], value
  elif method in iqr_rules:
    side, factor = iqr_rules[method]
    q1 = df[feature].quantile(0.25)
    q3 = df[feature].quantile(0.75)
    reach = (q3 - q1) * factor
    bound = q1 - reach if side == 'lower' else q3 + reach
  else:
    # A misspelt method used to fall through and return the data uncapped.
    raise ValueError("unknown method %r" % method)
  capped = df.copy()
  # clip returns a new Series, so a non-integer bound on an integer column
  # upcasts cleanly instead of writing floats into int storage.
  capped[feature] = df[feature].clip(**{side: bound})
  return capped


example how to call/use function

In [None]:
#df_features['length_of_service'].head()

0     8
1     4
2     7
3    10
4     2
Name: length_of_service, dtype: int64

In [None]:
#df_features['length_of_service'].value_counts().sort_index()

1     4547
2     6684
3     7033
4     6836
5     5832
6     4734
7     5551
8     2883
9     2629
10    2193
11     916
12     794
13     687
14     549
15     593
16     548
17     432
18     392
19     329
20     128
21      78
22      61
23      65
24      70
25      51
26      41
27      36
28      30
29      30
30      12
31      20
32      10
33       9
34       4
37       1
Name: length_of_service, dtype: int64

In [None]:
#df_features = cap_outliers(df_features, 'length_of_service', 'extreme_upper')

In [None]:
#df_features['length_of_service'].value_counts().sort_index()

1     4547
2     6684
3     7033
4     6836
5     5832
6     4734
7     5551
8     2883
9     2629
10    2193
11     916
12     794
13     687
14     549
15     593
16     548
17     432
18     392
19     975
Name: length_of_service, dtype: int64

In [None]:
#df_features['no_of_trainings'].value_counts().sort_index()

1     44378
2      7987
3      1776
4       468
5       128
6        44
7        12
8         5
9         5
10        5
Name: no_of_trainings, dtype: int64

In [None]:
#df_features = cap_outliers(df_features, 'no_of_trainings', 'upper_value', 5)

In [None]:
#df_features['no_of_trainings'].value_counts().sort_index()

1    44378
2     7987
3     1776
4      468
5      199
Name: no_of_trainings, dtype: int64

## Edit function to take train and test dataframes and cap outliers in both, using thresholds computed from the training data

In [None]:
def cap_outliers(train, test, feature, method, value=None):
  """Cap outliers of one column in both splits with a single shared threshold.

  For the IQR-based methods the threshold is computed from ``train`` only
  and then applied to both frames. Each frame is capped with a condition
  built from its own values.

  Parameters
  ----------
  train : pandas.DataFrame
      Training features; source of the quartiles. Not modified.
  test : pandas.DataFrame
      Test features; capped at the same threshold. Not modified.
  feature : str
      Numeric column to cap.
  method : str
      ``'upper_value'`` / ``'lower_value'``: cap at ``value`` from above /
      below. ``'upper'`` / ``'lower'``: cap at Q3 + 1.5*IQR / Q1 - 1.5*IQR.
      ``'extreme_upper'`` / ``'extreme_lower'``: cap at Q3 + 3*IQR /
      Q1 - 3*IQR.
  value : number, optional
      Threshold for the two ``*_value`` methods; ignored otherwise.

  Returns
  -------
  tuple of pandas.DataFrame
      ``(train, test)`` copies with ``feature`` capped. The column may come
      back as a float column when the threshold is not an integer.

  Raises
  ------
  ValueError
      If ``method`` is not one of the six names above, or if a ``*_value``
      method is requested without ``value``.
  """
  # method name -> (side of the distribution to cap, IQR multiplier)
  iqr_rules = {
    'lower': ('lower', 1.5),
    'upper': ('upper', 1.5),
    'extreme_lower': ('lower', 3),
    'extreme_upper': ('upper', 3),
  }
  if method in ('upper_value', 'lower_value'):
    if value is None:
      raise ValueError("method %r requires a numeric `value`" % method)
    side, bound = method.split('_')[0], value
  elif method in iqr_rules:
    side, factor = iqr_rules[method]
    # Quartiles come from the training split only, so nothing is learned from test.
    q1 = train[feature].quantile(0.25)
    q3 = train[feature].quantile(0.75)
    reach = (q3 - q1) * factor
    bound = q1 - reach if side == 'lower' else q3 + reach
  else:
    raise ValueError("unknown method %r" % method)
  train_capped, test_capped = train.copy(), test.copy()
  # Each frame is clipped against its own values. The 'lower_value' and
  # 'upper' branches used to select test rows with a mask built from train,
  # whose index does not match test's.
  train_capped[feature] = train[feature].clip(**{side: bound})
  test_capped[feature] = test[feature].clip(**{side: bound})
  return train_capped, test_capped

In [None]:
# Cap 'no_of_trainings' at 5 in both splits (fixed upper threshold passed as `value`).
train_features, test_features = cap_outliers(x_train, x_test, 'no_of_trainings', 'upper_value', 5)

In [None]:
# Check the capped distribution: counts per value, ordered by value.
train_features['no_of_trainings'].value_counts().sort_index()

1    31083
2     5559
3     1238
4      343
5      142
Name: no_of_trainings, dtype: int64

## Write function to normalize features using StandardScaler. This function takes the feature dataframe and a list of features as parameters, and returns the feature dataframe with the specified features normalized via the standard scaler method.

In [None]:
def norm_standard(df, features):
  """Standardise ``features`` of a single frame with ``StandardScaler``.

  NOTE: a later cell defines another ``norm_standard`` that takes
  ``(train, test, features)``; once that cell has run, this single-frame
  version is no longer reachable under this name.

  Parameters
  ----------
  df : pandas.DataFrame
      Feature frame. Not modified; the scaled values are written to a copy.
  features : list of str
      Numeric columns to scale.

  Returns
  -------
  pandas.DataFrame
      Copy of ``df`` with ``features`` replaced by their scaled values.
  """
  from sklearn.preprocessing import StandardScaler
  scaler = StandardScaler().fit(df[features])
  # Write into a copy so re-running the cell does not scale already-scaled data.
  scaled = df.copy()
  scaled[features] = scaler.transform(df[features])
  return scaled

example how to call/use function

In [None]:
#df_features.head()

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49
1,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60
2,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50
3,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50
4,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73


In [None]:
#df_features = norm_standard(df_features, ['age','length_of_service'])
#df_features.head()

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,Sales & Marketing,region_7,Master's & above,f,sourcing,1,0.025598,5.0,0.545103,1,0,49
1,Operations,region_22,Bachelor's,m,other,1,-0.627135,5.0,-0.448933,0,0,60
2,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,-0.104948,3.0,0.296594,0,0,50
3,Sales & Marketing,region_23,Bachelor's,m,other,2,0.547785,1.0,1.042121,0,0,50
4,Technology,region_26,Bachelor's,m,other,1,1.331064,3.0,-0.945951,0,0,73


## Edit function to take both train and test feature dataframes and a list of features as parameters, and return the train and test feature dataframes with the specified features normalized via the standard scaler method.

In [None]:
def norm_standard(train, test, features):
  """Standardise ``features`` in both splits with a ``StandardScaler`` fitted on ``train``.

  Parameters
  ----------
  train : pandas.DataFrame
      Training features; the scaler is fitted on these. Not modified.
  test : pandas.DataFrame
      Test features; scaled with the train-fitted scaler. Not modified.
  features : list of str
      Numeric columns to scale.

  Returns
  -------
  tuple of pandas.DataFrame
      ``(train, test)`` copies with ``features`` replaced by scaled values.
  """
  from sklearn.preprocessing import StandardScaler
  # Scaling statistics come from the training split only.
  scaler = StandardScaler().fit(train[features])
  # Copies keep x_train/x_test raw for the other helpers that reuse them.
  train_scaled, test_scaled = train.copy(), test.copy()
  train_scaled[features] = scaler.transform(train[features])
  test_scaled[features] = scaler.transform(test[features])
  return train_scaled, test_scaled

In [None]:
# Standardise 'age' and 'length_of_service' in both splits (scaler fitted on the training split), then preview the training result.
train_features, test_features = norm_standard(x_train, x_test, ['age','length_of_service'])
train_features.head()

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
5057,Sales & Marketing,region_25,Bachelor's,m,sourcing,1,0.975474,3.0,1.570861,0,0,48
37770,Analytics,region_2,Master's & above,m,sourcing,1,0.975474,5.0,1.570861,1,0,87
49117,Sales & Marketing,region_32,Bachelor's,f,other,1,-1.119586,3.0,0.005691,0,0,49
48004,Operations,region_4,Bachelor's,m,sourcing,1,1.499239,4.0,-0.776894,1,0,61
5959,HR,region_2,Bachelor's,m,sourcing,1,-0.072056,3.0,-0.776894,1,0,56


## Write function to normalize features using MinMaxScaler. This function takes the feature dataframe and a list of features as parameters, and returns the feature dataframe with the specified features normalized via the min-max scaler method.

In [None]:
def norm_minmax(df, features):
  """Rescale ``features`` of a single frame with ``MinMaxScaler``.

  NOTE: a later cell defines another ``norm_minmax`` that takes
  ``(train, test, features)``; once that cell has run, this single-frame
  version is no longer reachable under this name.

  Parameters
  ----------
  df : pandas.DataFrame
      Feature frame. Not modified; the scaled values are written to a copy.
  features : list of str
      Numeric columns to scale.

  Returns
  -------
  pandas.DataFrame
      Copy of ``df`` with ``features`` replaced by their scaled values.
  """
  from sklearn.preprocessing import MinMaxScaler
  scaler = MinMaxScaler().fit(df[features])
  # Write into a copy so re-running the cell does not rescale already-scaled data.
  scaled = df.copy()
  scaled[features] = scaler.transform(df[features])
  return scaled

example how to call/use function

In [None]:
#df_features.head()

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49
1,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60
2,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50
3,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50
4,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73


In [None]:
#df_features = norm_minmax(df_features, ['age','length_of_service'])
#df_features.head()

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,Sales & Marketing,region_7,Master's & above,f,sourcing,1,0.375,5.0,0.194444,1,0,49
1,Operations,region_22,Bachelor's,m,other,1,0.25,5.0,0.083333,0,0,60
2,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,0.35,3.0,0.166667,0,0,50
3,Sales & Marketing,region_23,Bachelor's,m,other,2,0.475,1.0,0.25,0,0,50
4,Technology,region_26,Bachelor's,m,other,1,0.625,3.0,0.027778,0,0,73


## Edit function to take both train and test feature dataframes and a list of features as parameters, and return the train and test feature dataframes with the specified features normalized via the min-max scaler method.

In [None]:
def norm_minmax(train, test, features):
  """Rescale ``features`` in both splits with a ``MinMaxScaler`` fitted on ``train``.

  Because the scaler is fitted on the training columns only, test values
  outside the training range are not confined to the training output range.

  Parameters
  ----------
  train : pandas.DataFrame
      Training features; the scaler is fitted on these. Not modified.
  test : pandas.DataFrame
      Test features; scaled with the train-fitted scaler. Not modified.
  features : list of str
      Numeric columns to scale.

  Returns
  -------
  tuple of pandas.DataFrame
      ``(train, test)`` copies with ``features`` replaced by scaled values.
  """
  from sklearn.preprocessing import MinMaxScaler
  # Minimum and range come from the training split only.
  scaler = MinMaxScaler().fit(train[features])
  # Copies keep x_train/x_test raw for the other helpers that reuse them.
  train_scaled, test_scaled = train.copy(), test.copy()
  train_scaled[features] = scaler.transform(train[features])
  test_scaled[features] = scaler.transform(test[features])
  return train_scaled, test_scaled

In [None]:
# Min-max scale 'age' and 'length_of_service' in both splits (scaler fitted on the training split), then preview the training result.
train_features, test_features = norm_minmax(x_train, x_test, ['age','length_of_service'])
train_features.head()

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
5057,Sales & Marketing,region_25,Bachelor's,m,sourcing,1,0.555556,3.0,0.333333,0,0,48
37770,Analytics,region_2,Master's & above,m,sourcing,1,0.555556,5.0,0.333333,1,0,87
49117,Sales & Marketing,region_32,Bachelor's,f,other,1,0.111111,3.0,0.111111,0,0,49
48004,Operations,region_4,Bachelor's,m,sourcing,1,0.666667,4.0,0.0,1,0,61
5959,HR,region_2,Bachelor's,m,sourcing,1,0.333333,3.0,0.0,1,0,56
