<h3>Solving Classification Problems Using scikit-learn</h3>

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Notebook-wide plot appearance: seaborn dark grid theme, 10 x 6 default figure size
sns.set_style(style='darkgrid')
plt.rcParams.update({'figure.figsize': [10, 6]})

In [2]:
# Reading the customer churn data from the local datasets folder
churn_df = pd.read_csv(filepath_or_buffer='./datasets/customer_churn.csv')

# Previewing the first 30 rows
churn_df.head(n=30)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0
5,6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1
6,7,15592531,Bartlett,822,France,Male,50,7,0.0,2,1,1,10062.8,0
7,8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,1,0,119346.88,1
8,9,15792365,He,501,France,Male,44,4,142051.07,2,0,1,74940.5,0
9,10,15592389,H?,684,France,Male,27,2,134603.88,1,1,1,71725.73,0


In [4]:
# Removing columns that are not used as features (row number, customer id, surname).
# churn_df is reassigned from itself, so without errors='ignore' a second run of
# this cell would raise KeyError on the columns that were already dropped.
# Trade-off: a misspelled column name is skipped silently instead of raising.
churn_df = churn_df.drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors='ignore')

<p>Dividing Data into Features and Labels</p>

In [5]:
# Label set: the 'Exited' column
y = churn_df['Exited']

# Feature set: every column except the label
x = churn_df.drop(columns='Exited')

In [6]:
# Previewing the first 30 feature rows
x.iloc[:30]

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,France,Female,42,2,0.0,1,1,1,101348.88
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
2,502,France,Female,42,8,159660.8,3,1,0,113931.57
3,699,France,Female,39,1,0.0,2,0,0,93826.63
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1
5,645,Spain,Male,44,8,113755.78,2,1,0,149756.71
6,822,France,Male,50,7,0.0,2,1,1,10062.8
7,376,Germany,Female,29,4,115046.74,4,1,0,119346.88
8,501,France,Male,44,4,142051.07,2,0,1,74940.5
9,684,France,Male,27,2,134603.88,1,1,1,71725.73


In [7]:
# Previewing the first 30 labels
y.iloc[:30]

0     1
1     0
2     1
3     0
4     0
5     1
6     0
7     1
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    1
17    0
18    0
19    0
20    0
21    0
22    1
23    0
24    0
25    0
26    0
27    0
28    0
29    0
Name: Exited, dtype: int64

<p>Converting Categorical Data into Numbers</p>

In [8]:
# Keeping only the int64 / float64 feature columns, which leaves out the categorical ones
numerical = x.select_dtypes(
    include=['int64', 'float64'],
)
numerical.head(n=5)

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,42,2,0.0,1,1,1,101348.88
1,608,41,1,83807.86,1,0,1,112542.58
2,502,42,8,159660.8,3,1,0,113931.57
3,699,39,1,0.0,2,0,0,93826.63
4,850,43,2,125510.82,1,1,1,79084.1


In [9]:
# Keeping only the object-dtype feature columns (Geography and Gender in the preview)
categorical = x.select_dtypes(include='object')
categorical.head(n=5)

Unnamed: 0,Geography,Gender
0,France,Female
1,Spain,Female
2,France,Female
3,France,Female
4,Spain,Female


In [11]:
# Converting categorical columns to one hot encoded columns.
# drop_first=True omits the first level of every categorical column, so each
# column with k levels becomes k-1 indicator columns; dtype=int stores the
# indicators as 0/1 integers.
# pandas is already imported as `pd` in the first cell, so it is not
# re-imported here.
cat_numerical = pd.get_dummies(categorical, drop_first=True, dtype=int)
cat_numerical.head()

Unnamed: 0,Geography_Germany,Geography_Spain,Gender_Male
0,0,0,0
1,0,1,0
2,0,0,0
3,0,0,0
4,0,1,0


In [12]:
# Placing the numerical columns and the one hot encoded columns side by side
x = pd.concat(objs=[numerical, cat_numerical], axis='columns')
x.head(n=5)

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_Germany,Geography_Spain,Gender_Male
0,619,42,2,0.0,1,1,1,101348.88,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,1,0
2,502,42,8,159660.8,3,1,0,113931.57,0,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,1,0


<p>Dividing the data into training and test sets</p>

In [13]:
# Holding out 20% of the rows as the test set; the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

<p>Data Scaling and Normalization</p>

In [14]:
# Standardising the features using statistics learned from the training split only
from sklearn.preprocessing import StandardScaler

sc = StandardScaler().fit(x_train)

# The same fitted scaler is applied to both splits
x_train, x_test = sc.transform(x_train), sc.transform(x_test)

<h5>Solving Classification Problems</h5>

<p>Logistic Regression</p>

In [15]:
from sklearn.linear_model import LogisticRegression

# Fitting a logistic regression model with its default settings
log_clf = LogisticRegression()
log_clf.fit(x_train, y_train)

# fit() returns the estimator itself, so `classifier` refers to the fitted model
classifier = log_clf

# Predicting labels for the held-out test set
y_pred = log_clf.predict(x_test)

In [16]:
# Scoring the test-set predictions: confusion matrix, per-class report, overall accuracy
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, y_pred))

[[1543   64]
 [ 314   79]]
              precision    recall  f1-score   support

           0       0.83      0.96      0.89      1607
           1       0.55      0.20      0.29       393

    accuracy                           0.81      2000
   macro avg       0.69      0.58      0.59      2000
weighted avg       0.78      0.81      0.77      2000

0.811


<p>KNN Classifier</p>

In [17]:
# Importing KNN classifier from sklearn
from sklearn.neighbors import KNeighborsClassifier

# Training a KNN classifier that uses the 5 nearest neighbours
knn_clf = KNeighborsClassifier(n_neighbors=5)
classifier = knn_clf.fit(x_train, y_train)

# Making predictions on the test set
y_pred = classifier.predict(x_test)

# Evaluating the algorithm on the test set.
# classification_report, confusion_matrix and accuracy_score are already
# imported by the logistic regression evaluation cell, so the duplicate
# import is not repeated here.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[1514   93]
 [ 247  146]]
              precision    recall  f1-score   support

           0       0.86      0.94      0.90      1607
           1       0.61      0.37      0.46       393

    accuracy                           0.83      2000
   macro avg       0.74      0.66      0.68      2000
weighted avg       0.81      0.83      0.81      2000

0.83


<p>Random Forest Classifier</p>

In [18]:
# Importing random forest classifier from sklearn
from sklearn.ensemble import RandomForestClassifier

# Training a random forest of 500 trees; the fixed seed keeps the fit reproducible
rf_clf = RandomForestClassifier(random_state=42, n_estimators=500)
classifier = rf_clf.fit(x_train, y_train)

# Making predictions on the test set
y_pred = classifier.predict(x_test)

# Evaluating the algorithm on the test set.
# classification_report, confusion_matrix and accuracy_score are already
# imported by the logistic regression evaluation cell, so the duplicate
# import is not repeated here.
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.76      0.47      0.58       393

    accuracy                           0.87      2000
   macro avg       0.82      0.72      0.75      2000
weighted avg       0.86      0.87      0.85      2000

[[1548   59]
 [ 208  185]]
0.8665


<p>K-Fold Cross Validation</p>

In [19]:
# Importing the cross-validation helper from sklearn
from sklearn.model_selection import cross_val_score

# Applying 5 fold cross validation to the random forest.
# `rf_clf` is named explicitly: `classifier` is rebound by the logistic
# regression, KNN and random forest cells, so using it here would validate
# whichever of those cells happened to run last.
# NOTE: `x` is the full, unscaled feature frame (x_train / x_test are the
# scaled versions). If a scale-sensitive model such as the logistic regression
# or KNN classifier is cross-validated here, wrap it together with the scaler
# in a sklearn Pipeline so the scaling is fitted inside each fold.
print(cross_val_score(rf_clf, x, y, cv=5, scoring='accuracy'))

[0.864  0.8725 0.8625 0.863  0.8625]
