<a href="https://colab.research.google.com/github/MorleyB/assign03-50-bankston/blob/feature%2Fsplit-train-cross-validate/churn_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Machine Learning using scikit-learn.**

## Problem Formulation

Gain experience applying scikit-learn to a machine learning problem: predicting employee attrition (the `Attrition` column) from the IBM HR employee dataset.

In [2]:
#tables and visualizations
import pandas as pd
import numpy as np

#machine learning
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, StandardScaler
from sklearn import config_context

## Load Data

In [3]:
# Read the IBM HR attrition workbook directly from the GitHub repository.
employees = pd.read_excel(
    'https://raw.githubusercontent.com/morleyb/assign03-50-bankston/main/IBM-HR-Data-Employee-Attrition.xlsx'
)

# Preview the first rows, then list each column's dtype and non-null count.
display(employees.head())
employees.info()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

## Split Data into Training and Test Sets

In [7]:
# Target column name and the seed that makes the shuffle repeatable.
class_column = 'Attrition'
random_seed = 2435

# Hold out a quarter of the rows for testing. The target column is removed
# from the feature frame and passed separately as the label vector.
X_train, X_test, y_train, y_test = train_test_split(
    employees.drop(class_column, axis='columns'),
    employees[class_column],
    test_size=0.25,
    random_state=random_seed,
)

Quick sanity check to make sure that everything seems correct:

In [25]:
# Sanity check on the feature splits: print their shapes, preview the rows,
# and assert the invariants that a visual check can easily miss.

# X Train
print('On X train: ')
print('X train dimensions: ', X_train.shape)
display(X_train.head())

# X test
print('\nOn X test: ')
print('X test dimensions: ', X_test.shape)
display(X_test.head())

# The target must never appear among the predictors (data leakage), and the
# two splits together must account for every row of the source frame.
assert class_column not in X_train.columns, 'target column leaked into X_train'
assert class_column not in X_test.columns, 'target column leaked into X_test'
assert len(X_train) + len(X_test) == len(employees), 'rows lost or duplicated by the split'

On X train: 
X train dimensions:  (1102, 35)


Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
474,24,No,Travel_Rarely,691,Research & Development,23,3,Medical,1,639,...,2,80,2,6,3,3,6,5,1,4
496,21,No,Travel_Rarely,1343,Sales,22,1,Technical Degree,1,669,...,3,80,0,3,2,3,3,2,1,2
556,53,No,Travel_Rarely,346,Research & Development,6,3,Life Sciences,1,769,...,4,80,0,19,4,3,2,2,2,2
212,27,No,Travel_Frequently,1242,Sales,20,3,Life Sciences,1,293,...,4,80,0,7,2,3,7,7,0,7
1316,43,No,Travel_Frequently,1422,Sales,2,4,Life Sciences,1,1849,...,3,80,1,7,5,3,7,7,7,7



On X test: 
X test dimensions:  (368, 35)


Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
1400,38,No,Travel_Frequently,1444,Human Resources,1,4,Other,1,1972,...,2,80,1,7,2,3,6,2,1,2
234,33,Yes,Travel_Rarely,813,Research & Development,14,3,Medical,1,325,...,3,80,1,8,2,1,5,4,0,4
1221,45,No,Non-Travel,1238,Research & Development,1,1,Life Sciences,1,1712,...,4,80,1,25,3,2,23,15,14,4
189,51,No,Travel_Rarely,313,Research & Development,3,3,Medical,1,258,...,3,80,0,21,6,3,7,7,1,0
92,30,No,Travel_Rarely,1334,Sales,4,2,Medical,1,121,...,2,80,3,11,4,2,11,8,2,7


In [8]:
# Sanity check on the label splits: print their shapes and preview the values.
# The dimension labels name y (not X) so the printed text matches the object
# whose shape is reported.

# Y Train
print('On y train: ')
print('y train dimensions: ', y_train.shape)
display(y_train.head())

# Y test
print('On y test: ')
print('y test dimensions: ', y_test.shape)
display(y_test.head())

On y train: 
X train dimensions:  (1102,)


474     No
496     No
556     No
212     No
1316    No
Name: Attrition, dtype: object

On y test: 
X test dimensions:  (368,)


1400     No
234     Yes
1221     No
189      No
92       No
Name: Attrition, dtype: object

## Establish the training pipeline

In [9]:
#individual pipelines for differing datatypes

# Categorical columns: fill missing entries with the most frequent value,
# then one-hot encode (two-level categories keep a single indicator column).
cat_pipeline = Pipeline(steps=[
    ('cat_impute', SimpleImputer(strategy='most_frequent', missing_values=np.nan)),
    ('onehot_cat', OneHotEncoder(drop='if_binary')),
])

# Numeric columns: fill missing entries with the column mean, then standardize.
num_pipeline = Pipeline(steps=[
    ('impute_num', SimpleImputer(strategy='mean', missing_values=np.nan)),
    ('scale_num', StandardScaler()),
])

In [10]:
#establish preprocessing pipeline by columns
# Columns are routed by dtype: object columns go through the categorical
# pipeline, numeric columns through the numeric one, and anything matching
# neither selector is passed through unchanged.
preproc = ColumnTransformer(
    transformers=[
        ('cat_pipe', cat_pipeline, make_column_selector(dtype_include=object)),
        ('num_pipe', num_pipeline, make_column_selector(dtype_include=np.number)),
    ],
    remainder='passthrough',
)

In [11]:
#generate the whole modeling pipeline with preprocessing
# The saga solver shuffles the data while fitting, so it is seeded with the
# notebook's random_seed to make the cross-validation scores repeatable from
# run to run. l1_ratio is left unset here; the tuning grid supplies it.
pipe = Pipeline(steps=[
    ('preproc', preproc),
    ('mdl', LogisticRegression(penalty='elasticnet',
                               solver='saga',
                               tol=0.01,
                               random_state=random_seed)),
])

#visualization for steps
with config_context(display='diagram'):
    display(pipe)

## Cross-validation

In [12]:
# Candidate hyperparameters for the 'mdl' pipeline step:
#   l1_ratio - five evenly spaced mixes from pure L2 (0) to pure L1 (1)
#   C        - three inverse regularization strengths, log-spaced from 1e-1 to 1e6
tuning_grid = dict(
    mdl__l1_ratio=np.linspace(start=0, stop=1, num=5),
    mdl__C=np.logspace(start=-1, stop=6, num=3),
)
# Exhaustive 5-fold search over tuning_grid; training-fold scores are kept
# as well so they can be compared against the validation-fold scores.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=tuning_grid,
    cv=5,
    return_train_score=True,
)

In [13]:
# Echo the hyperparameter grid to confirm the candidate values being searched.
tuning_grid

{'mdl__C': array([1.00000000e-01, 3.16227766e+02, 1.00000000e+06]),
 'mdl__l1_ratio': array([0.  , 0.25, 0.5 , 0.75, 1.  ])}

In [14]:
# Run the grid search on the training split only; the test split stays unseen.
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preproc',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('cat_pipe',
                                                                         Pipeline(steps=[('cat_impute',
                                                                                          SimpleImputer(strategy='most_frequent')),
                                                                                         ('onehot_cat',
                                                                                          OneHotEncoder(drop='if_binary'))]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7f692db003d0>),
                                                                        ('num_pipe',
                                                    

In [15]:
# Print the best mean cross-validated score found by the search, then show
# the hyperparameter combination that produced it.
print(grid_search.best_score_)
grid_search.best_params_

0.8711353352529823


{'mdl__C': 0.1, 'mdl__l1_ratio': 0.0}

In [16]:
# Tabulate the search results: one row per candidate, with timings and the
# per-fold and mean train/test scores.
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_mdl__C,param_mdl__l1_ratio,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.104666,0.013982,0.024295,0.01265,0.1,0.0,"{'mdl__C': 0.1, 'mdl__l1_ratio': 0.0}",0.864253,0.886878,0.872727,...,0.871135,0.008517,1,0.879682,0.884222,0.877551,0.884354,0.887755,0.882713,0.003639
1,0.145324,0.092388,0.020305,0.017713,0.1,0.25,"{'mdl__C': 0.1, 'mdl__l1_ratio': 0.25}",0.868778,0.882353,0.854545,...,0.864772,0.01035,2,0.871737,0.879682,0.868481,0.878685,0.876417,0.875,0.004258
2,0.073347,0.00767,0.013554,0.002354,0.1,0.5,"{'mdl__C': 0.1, 'mdl__l1_ratio': 0.5}",0.877828,0.868778,0.85,...,0.861139,0.010468,13,0.868331,0.868331,0.862812,0.875283,0.871882,0.869328,0.004158
3,0.067214,0.002881,0.01329,0.000734,0.1,0.75,"{'mdl__C': 0.1, 'mdl__l1_ratio': 0.75}",0.868778,0.864253,0.840909,...,0.856606,0.009608,14,0.860386,0.861521,0.862812,0.869615,0.867347,0.864336,0.003543
4,0.06468,0.009626,0.012675,0.001249,0.1,1.0,"{'mdl__C': 0.1, 'mdl__l1_ratio': 1.0}",0.855204,0.859729,0.845455,...,0.852986,0.004867,15,0.855846,0.858116,0.857143,0.861678,0.858277,0.858212,0.001937
5,0.077879,0.006796,0.012025,0.000442,316.227766,0.0,"{'mdl__C': 316.22776601683796, 'mdl__l1_ratio'...",0.846154,0.859729,0.881818,...,0.861176,0.011865,6,0.888763,0.888763,0.888889,0.888889,0.891156,0.889292,0.000934
6,0.091146,0.012347,0.013348,0.000939,316.227766,0.25,"{'mdl__C': 316.22776601683796, 'mdl__l1_ratio'...",0.850679,0.859729,0.881818,...,0.862081,0.01081,3,0.888763,0.888763,0.888889,0.888889,0.891156,0.889292,0.000934
7,0.084836,0.010262,0.012384,0.001326,316.227766,0.5,"{'mdl__C': 316.22776601683796, 'mdl__l1_ratio'...",0.850679,0.859729,0.881818,...,0.862081,0.01081,3,0.888763,0.888763,0.888889,0.888889,0.891156,0.889292,0.000934
8,0.091456,0.014639,0.012146,0.001245,316.227766,0.75,"{'mdl__C': 316.22776601683796, 'mdl__l1_ratio'...",0.846154,0.859729,0.881818,...,0.861176,0.011865,6,0.888763,0.888763,0.890023,0.888889,0.891156,0.889519,0.000946
9,0.094151,0.012254,0.012719,0.000793,316.227766,1.0,"{'mdl__C': 316.22776601683796, 'mdl__l1_ratio'...",0.846154,0.859729,0.881818,...,0.861176,0.011865,6,0.888763,0.888763,0.890023,0.888889,0.891156,0.889519,0.000946
