In [1]:
import numpy as np 
import pandas as pd 


import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from sklearn.model_selection import train_test_split


# Import and suppress warnings
import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'imblearn'

# 1. Exploratory Data Analysis

Let us load the dataset with the Pandas package into a dataframe object, which we call **attrition**, and have a quick look at the first few rows.

In [None]:
# Read the HR employee-attrition CSV into `attrition`
# (the path is relative to the current working directory).
attrition = pd.read_csv(filepath_or_buffer='WA_Fn-UseC_-HR-Employee-Attrition.csv')
# Cell output: preview of the leading rows.
attrition.head(n=5)

In [None]:
# Per-column flag: True when the column holds at least one missing value
attrition.isna().any(axis=0)

In [None]:
# attrition.Age.fillna('')

### Correlation of Features


In [None]:
# Pairwise Pearson correlation between the numeric features only.
# The non-numeric (object) columns are excluded explicitly: pandas >= 2.0 no
# longer drops them silently inside corr() and raises on string values.
attrition.select_dtypes(include=['number', 'bool']).corr()

#  Feature Engineering & Categorical Encoding

In this section we perform feature engineering and numerically encode the categorical values in our dataset.

In [None]:
# attrition.shape

In [None]:
# Cell output: the dtype of every column in the raw frame
attrition.dtypes

In [None]:
# Names of the categorical (object-dtype) columns, kept in their original order.
# `DataFrame.iteritems()` was removed in pandas 2.0, so the dtypes are read
# through `attrition.dtypes.items()`, which yields (column name, dtype) pairs.
categorical = [col for col, dtype in attrition.dtypes.items() if dtype == 'object']

# Every remaining column is treated as numerical. `Index.difference` returns a
# sorted pandas Index (not a list), so the order differs from the CSV order.
numerical = attrition.columns.difference(categorical)

In [None]:
# Cell output: names of the columns treated as numerical
numerical

In [None]:
# Cell output: names of the object-dtype (categorical) columns
categorical

In [None]:
# Categorical feature frame: every object-dtype column except the target
# column 'Attrition', which must not leak into the features.
attrition_cat = attrition[categorical].drop(columns=['Attrition'])

In [None]:
# Preview only the first rows rather than rendering the whole frame
attrition_cat.head()

Applying the **get_dummies** method

In [None]:
# How can you convert categorical (string / object) data into a numerical format?

# The process of converting categorical data into a numerical format is called encoding.

# Encoding techniques (there are 15 or more), for example:

# Label Encoding

# One Hot Encoding (OHE)

# Example column Cat_A:

# Male
# Female
# Male
# Female
# Prefer_not_to_say
# Male

# OHE

#                        Cat_A_Male   Cat_A_Female   Cat_A_Prefer_not_to_say
# 1  Male                    1             0                0
# 2  Female                  0             1                0
# 3  Male                    1             0                0
# 4  Female                  0             1                0
# 5  Prefer_not_to_say       0             0                1
# 6  Male                    1             0                0



# Label Encoding

# Cat_A

# Male               2
# Female             1
# Male               2
# Female             1
# Prefer_not_to_say  3
# Male               2

# Target Encoding
# Mean Encoding































In [None]:
# Label encoding targets object-dtype (string) values; toy example below.
from sklearn import preprocessing

# Learn the distinct labels (fit returns the encoder itself).
le = preprocessing.LabelEncoder().fit(["paris", "paris", "tokyo", "amsterdam"])

# Cell output: the integer code assigned to each of these labels.
le.transform(["tokyo", "tokyo", "paris", "amsterdam"])

# list(le.classes_) lists the learned labels; their codes are 0, 1, 2.

In [None]:
# One-hot encode the categorical columns (one indicator column per category value)
attrition_cat = pd.get_dummies(data=attrition_cat)
# Cell output: the first three encoded rows
attrition_cat.head(n=3)

In [None]:
# Numerical feature frame: all rows, restricted to the columns listed in `numerical`
attrition_num = attrition.loc[:, numerical]

Let's concatenate the numerical and categorical dataframes.

In [None]:
# Place the numerical and the one-hot encoded frames side by side (aligned on the row index)
attrition_final = pd.concat(objs=[attrition_num, attrition_cat], axis='columns')

In [None]:
# Cell output: (rows, columns) of the combined feature frame
attrition_final.shape

**Target variable**

The target in this case is given by the column **Attrition**, which contains categorical values and therefore requires numerical encoding. We numerically encode it by creating a dictionary with the mapping Yes : 1 and No : 0.

In [None]:
# Binary encoding of the label: 'Yes' -> 1, 'No' -> 0
target_map = {'Yes': 1, 'No': 0}
# Look each label up in target_map; a label missing from the map raises KeyError
target = attrition["Attrition"].apply(lambda label: target_map[label])
# Cell output: the first three encoded labels
target.head(n=3)


**Splitting Data into Train and Test sets**


In [None]:
# Keep 75% of the rows for training and hold out the rest for testing;
# random_state=0 makes the split repeatable.
# NOTE(review): the split is not stratified on the target - confirm that is intended.
train, test, target_train, target_test = train_test_split(
    attrition_final,
    target,
    train_size=0.75,
    random_state=0,
)

#  Implementing Machine Learning Models


## GBM Classifier



### 1.n_estimators - No of Trees in the Model

### 2.max_features - The number of features to consider while searching for the best split. A rule of thumb is to use the square root of the number of columns

### 3.max_depth - Maximum depth of each tree; can be used to control overfitting

### 4.min_samples_leaf - Minimum samples (or observations) required in a terminal node or leaf. In general we need lower values for it in imbalanced problems

### 5.subsample- The fraction of samples to be used for fitting the individual base learners

### 6.learning_rate - Learning rate shrinks the contribution of each tree by learning_rate. There is a trade-off between learning_rate and n_estimators

In [None]:
# Baseline gradient boosting model: library defaults, with only the seed fixed
gb = GradientBoostingClassifier(random_state=100)
# Cell output: the full parameter dictionary of the estimator
gb.get_params(deep=True)

In [None]:
# Train the model on the training split
gb.fit(X=train, y=target_train)
# Hard class predictions for the held-out rows
gb_predictions = gb.predict(X=test)

In [None]:
# Predicted class probabilities for the held-out rows (one column per class)
gb_predictions_prob = gb.predict_proba(X=test)
# Cell output: the probability array
gb_predictions_prob

In [None]:
# Gradient Boosting Parameters
# gb_params ={
#     'n_estimators': 500,   # no of Trees 
#     'learning_rate' : 0.2,
#     'max_depth': 11,
#     'min_samples_leaf': 2,
#     'subsample': 1,
#     'max_features' : 'sqrt',
#     'random_state' : 100,
#     'verbose': 0
# }

#gb = GradientBoostingClassifier(**gb_params) # After hyperparameter tuning (HPT), we can pass the tuned parameters

In [None]:
# Cell output: fraction of held-out rows whose predicted class matches the true label
accuracy_score(y_true=target_test, y_pred=gb_predictions)

### Feature Importance Gradient Boosting Model


In [None]:
# Cell output: one importance score per feature column of the fitted model
gb.feature_importances_

In [None]:
# Feature-importance scatter plot for the fitted GBM: one marker per feature.
trace = go.Scatter(
    x=attrition_final.columns.values,
    y=gb.feature_importances_,
    text=attrition_final.columns.values,      # hover label = feature name
    mode='markers',
    marker={
        'sizemode': 'diameter',
        'sizeref': 1.3,
        'size': 12,
        'color': gb.feature_importances_,     # colour encodes the importance value
        'colorscale': 'Portland',
        'showscale': True,
    },
)
data = [trace]

# Minimal axis styling: grid and zero lines are hidden on both axes.
layout = go.Layout(
    title='GBM Model Feature Importance',
    autosize=True,
    hovermode='closest',
    showlegend=False,
    xaxis={
        'ticklen': 5,
        'showgrid': False,
        'zeroline': False,
        'showline': False,
    },
    yaxis={
        'title': 'Feature Importance',
        'showgrid': False,
        'zeroline': False,
        'ticklen': 5,
        'gridwidth': 2,
    },
)

# Render the figure inline in the notebook.
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='scatter')