<a href="https://colab.research.google.com/github/MohammedMujtaba/Deep-Learning-Projects/blob/main/Assignment2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# M.Mujtaba Cs182020
### Customer Churning Using Deep Learning
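Dataset Link: https://www.kaggle.com/datasets/blastchar/telco-customer-churn
(Note: the code below loads `Churn_Modelling.csv`, whose columns — CreditScore, Geography, Balance, Exited, … — describe bank customers rather than Telco subscribers, so the link above may not point to the file actually used.)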

Churn = customers lost over a period of time
For a detailed exploration of churn data, see this article: https://towardsdatascience.com/churn-prediction-with-machine-learning-ca955d52bd8c

In [2]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
%matplotlib inline

In [5]:
# Location of the CSV on the Colab runtime; change this one constant when running elsewhere.
DATA_PATH = "/content/Churn_Modelling.csv"

df = pd.read_csv(DATA_PATH)
# Seeded so the preview shows the same five rows on every run.
df.sample(5, random_state=5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
9980,9981,15719276,T'ao,741,Spain,Male,35,6,74371.49,1,0,0,99595.67,0
5429,5430,15797905,Walker,682,France,Female,48,7,0.0,2,1,0,65069.03,0
1838,1839,15758813,Campbell,350,Germany,Male,39,0,109733.2,2,0,0,123602.11,1
5575,5576,15782879,Lang,656,France,Male,40,2,0.0,2,1,1,180553.48,0
1362,1363,15683841,Hamilton,555,Germany,Male,41,10,113270.2,2,1,1,185387.14,0


In [6]:
# Drop the identifier columns (row number, customer id, surname) so they are not fed
# to the model. Reassigning instead of using inplace=True, together with
# errors='ignore', keeps the cell safe to re-run once the columns are already gone.
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors='ignore')

In [7]:
df.dtypes

CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

In [8]:
# EstimatedSalary is already reported as float64 in the dtypes listing above;
# look at the raw values to confirm the column really is numeric.
df['EstimatedSalary'].to_numpy()

array([101348.88, 112542.58, 113931.57, ...,  42085.58,  92888.52,
        38190.78])

In [9]:

# Sanity check only: the converted Series is displayed, not assigned back to df.
pd.to_numeric(df['EstimatedSalary'])

0       101348.88
1       112542.58
2       113931.57
3        93826.63
4        79084.10
          ...    
9995     96270.64
9996    101699.77
9997     42085.58
9998     92888.52
9999     38190.78
Name: EstimatedSalary, Length: 10000, dtype: float64

In [10]:
# Helper that prints the unique values of every object-dtype (text/categorical) column:
def print_unique_col_values(df):
  """Print the distinct values of each object-dtype column of ``df``.

  Columns are visited in frame order; every column whose dtype compares equal
  to ``'object'`` produces one line of the form ``<column>: <unique values>``.
  Columns of any other dtype are skipped. Nothing is returned.
  """
  object_columns = [name for name in df if df[name].dtypes == 'object']
  for name in object_columns:
    distinct = df[name].unique()
    print(f'{name}: {distinct}')

In [11]:
# List the labels found in each object-dtype column to see what still needs encoding.
print_unique_col_values(df)

Geography: ['France' 'Spain' 'Germany']
Gender: ['Female' 'Male']


In [12]:
# Encode Gender as 0/1. The result is assigned back to the column: calling
# .replace(..., inplace=True) on df['Gender'] operates on a temporary Series and,
# with pandas Copy-on-Write, never updates df itself.
# .astype('int64') guarantees a numeric column and raises if any label is not in
# the mapping (unmapped values become NaN), rather than letting them through silently.
df['Gender'] = df['Gender'].map({'Female': 1, 'Male': 0}).astype('int64')

In [13]:
# Print the distinct values of every column, numeric ones included.
for name in df.columns:
  distinct = df[name].unique()
  print(f'{name}: {distinct}')

CreditScore: [619 608 502 699 850 645 822 376 501 684 528 497 476 549 635 616 653 587
 726 732 636 510 669 846 577 756 571 574 411 591 533 553 520 722 475 490
 804 582 472 465 556 834 660 776 829 637 550 698 585 788 655 601 656 725
 511 614 742 687 555 603 751 581 735 661 675 738 813 657 604 519 664 678
 757 416 665 777 543 506 493 652 750 729 646 647 808 524 769 730 515 773
 814 710 413 623 670 622 785 605 479 685 538 562 721 628 668 828 674 625
 432 770 758 795 686 789 589 461 584 579 663 682 793 691 485 650 754 535
 716 539 706 586 631 717 800 683 704 615 667 484 480 578 512 606 597 778
 514 525 715 580 807 521 759 516 711 618 643 671 689 620 676 572 695 592
 567 694 547 594 673 610 767 763 712 703 662 659 523 772 545 634 739 771
 681 544 696 766 727 693 557 531 498 651 791 733 811 707 714 782 775 799
 602 744 588 747 583 627 731 629 438 642 806 474 559 429 680 749 734 644
 626 649 805 718 840 630 654 762 568 613 522 737 648 443 640 540 460 593
 801 611 802 745 483 690 492 709 705 5

In [14]:
# One-hot encode Geography into Geography_<country> indicator columns.
# dtype=int keeps the indicators as 0/1 integers: pandas >= 2.0 emits booleans by
# default, and a frame mixing bool with int/float columns converts to an object
# array, which Keras cannot turn into a tensor at fit time.
df1 = pd.get_dummies(data=df, columns=['Geography'], dtype=int)
df1.columns

Index(['CreditScore', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
       'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Exited',
       'Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype='object')

In [15]:
# Neural networks generally train better when all features share a similar range,
# so preview the unscaled values before scaling them.
df1.sample(5)

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
1153,755,1,45,7,135643.0,1,0,0,143619.52,1,0,1,0
7542,507,1,34,4,0.0,2,1,1,60688.38,0,0,0,1
6645,850,0,37,4,126872.6,1,1,0,197266.58,0,1,0,0
3158,643,1,31,3,167949.48,1,1,0,143162.34,0,1,0,0
9976,637,1,33,7,103377.81,1,1,0,84419.78,0,1,0,0


In [16]:
cols_to_scale = ['CreditScore','Balance','EstimatedSalary', 'Age', 'Tenure', 'NumOfProducts']

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training rows only. Fitting on every row lets the test
# set's min/max leak into preprocessing.
# A non-stratified train_test_split picks rows from the row count and random_state
# alone, so reusing the test_size/random_state of the split cell below selects
# exactly the rows that end up in X_train. Keep the two cells in sync.
train_index, _ = train_test_split(df1.index, test_size=0.2, random_state=5)

scaler = MinMaxScaler()
scaler.fit(df1.loc[train_index, cols_to_scale])
# Apply the training-derived scaling to every row; test rows may land slightly
# outside [0, 1], which is expected.
df1[cols_to_scale] = scaler.transform(df1[cols_to_scale])

In [17]:
# Preview the frame again after scaling the numeric columns.
df1.sample(5)

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
2562,0.826,1,0.540541,0.9,0.748956,0.0,0,1,0.179085,0,1,0,0
2973,0.482,1,0.513514,0.9,0.513685,0.0,1,1,0.981246,1,1,0,0
3648,0.39,0,0.391892,0.8,0.421655,0.0,0,1,0.339125,1,1,0,0
4098,0.574,0,0.189189,0.5,0.0,0.0,0,0,0.743859,0,1,0,0
9022,0.724,0,0.067568,0.2,0.0,0.333333,0,1,0.333953,0,1,0,0


In [18]:
# Separate the features from the Exited target, then hold out 20% of the rows
# (fixed random_state) as the test set for the network.
from sklearn.model_selection import train_test_split

y = df1['Exited']
X = df1.drop(columns=['Exited'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=5
)

In [19]:
# Shapes of both partitions, followed by the first ten training rows.
X_train.shape, X_test.shape, X_train.head(10)

((8000, 12),
 (2000, 12),
       CreditScore  Gender       Age  Tenure   Balance  NumOfProducts  \
 7751        0.800       1  0.283784     0.6  0.000000       0.333333   
 4154        0.752       0  0.216216     0.3  0.000000       0.333333   
 3881        0.476       1  0.621622     0.3  0.000000       0.000000   
 9238        0.846       1  0.432432     0.4  0.000000       0.333333   
 5210        0.402       0  0.229730     0.7  0.517012       0.333333   
 7487        0.602       1  0.513514     0.4  0.000000       0.000000   
 7542        0.314       1  0.216216     0.4  0.000000       0.333333   
 7524        0.620       1  0.297297     0.8  0.666330       0.000000   
 9412        0.750       0  0.108108     0.6  0.393324       0.000000   
 6377        0.684       0  0.202703     0.9  0.000000       0.000000   
 
       HasCrCard  IsActiveMember  EstimatedSalary  Geography_France  \
 7751          0               0         0.096273                 0   
 4154          1           

In [20]:
import tensorflow as tf
from tensorflow import keras

# Seed TensorFlow's global RNG so random ops such as weight initialisation are
# repeatable from run to run.
tf.random.set_seed(5)

# Take the input width from the data rather than hard-coding it, so the model
# still builds if the feature set changes.
n_features = X_train.shape[1]

# One ReLU hidden layer followed by a single sigmoid unit for the binary target.
model = keras.Sequential([
    keras.layers.Dense(12, input_shape=(n_features,), activation='relu'),  # hidden layer
    keras.layers.Dense(1, activation='sigmoid')  # output layer
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(X_train, y_train, epochs=150)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

<keras.callbacks.History at 0x7fbfc6747dd0>

In [22]:
# The model ends in a single sigmoid unit, so each test row gets one value in (0, 1).
yp = model.predict(X_test)
yp[:5]

array([[0.03136799],
       [0.07831656],
       [0.05684077],
       [0.07911096],
       [0.07312907]], dtype=float32)

In [24]:
# Turn each predicted probability into a hard label: 1 above 0.5, otherwise 0.
y_pred = [1 if prob > 0.5 else 0 for prob in yp]

In [25]:
# Compare the first five predicted labels with the corresponding true labels.
y_pred[:5], y_test[:5]

([0, 0, 0, 0, 0], 7054    0
 442     0
 3954    0
 2288    0
 3196    0
 Name: Exited, dtype: int64)

In [26]:
# Summarise per-class precision, recall and F1 on the held-out test set.
from sklearn.metrics import confusion_matrix , classification_report

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.87      0.95      0.91      1595
           1       0.71      0.46      0.56       405

    accuracy                           0.85      2000
   macro avg       0.79      0.71      0.74      2000
weighted avg       0.84      0.85      0.84      2000

