### Importing Essential Libraries 

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Data Preprocessing

In [3]:
df = pd.read_csv('customer_churn_large_dataset.csv')
df.head()

Unnamed: 0,CustomerID,Name,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
0,1,Customer_1,63,Male,Los Angeles,17,73.36,236,0
1,2,Customer_2,62,Female,New York,1,48.76,172,0
2,3,Customer_3,24,Female,Los Angeles,5,85.47,460,0
3,4,Customer_4,36,Female,Miami,3,97.94,297,1
4,5,Customer_5,46,Female,Miami,19,58.14,266,0


In [4]:
df.describe()

Unnamed: 0,CustomerID,Age,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,50000.5,44.02702,12.4901,65.053197,274.39365,0.49779
std,28867.657797,15.280283,6.926461,20.230696,130.463063,0.499998
min,1.0,18.0,1.0,30.0,50.0,0.0
25%,25000.75,31.0,6.0,47.54,161.0,0.0
50%,50000.5,44.0,12.0,65.01,274.0,0.0
75%,75000.25,57.0,19.0,82.64,387.0,1.0
max,100000.0,70.0,24.0,100.0,500.0,1.0


In [5]:
df.dtypes

CustomerID                      int64
Name                           object
Age                             int64
Gender                         object
Location                       object
Subscription_Length_Months      int64
Monthly_Bill                  float64
Total_Usage_GB                  int64
Churn                           int64
dtype: object

In [6]:
df.shape

(100000, 9)

In [4]:
df.isna().sum()

CustomerID                    0
Name                          0
Age                           0
Gender                        0
Location                      0
Subscription_Length_Months    0
Monthly_Bill                  0
Total_Usage_GB                0
Churn                         0
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   CustomerID                  100000 non-null  int64  
 1   Name                        100000 non-null  object 
 2   Age                         100000 non-null  int64  
 3   Gender                      100000 non-null  object 
 4   Location                    100000 non-null  object 
 5   Subscription_Length_Months  100000 non-null  int64  
 6   Monthly_Bill                100000 non-null  float64
 7   Total_Usage_GB              100000 non-null  int64  
 8   Churn                       100000 non-null  int64  
dtypes: float64(1), int64(5), object(3)
memory usage: 6.9+ MB


In [6]:
df['Age'].mean()

44.02702

In [7]:
pd.get_dummies(df['Gender'])

Unnamed: 0,Female,Male
0,0,1
1,1,0
2,1,0
3,1,0
4,1,0
...,...,...
99995,0,1
99996,1,0
99997,0,1
99998,1,0


In [8]:
gender = pd.get_dummies(df['Gender'], drop_first= True)
gender

Unnamed: 0,Male
0,1
1,0
2,0
3,0
4,0
...,...
99995,1
99996,0
99997,1
99998,0


In [9]:
df['Gender'] = gender

In [10]:
df.head()

Unnamed: 0,CustomerID,Name,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
0,1,Customer_1,63,1,Los Angeles,17,73.36,236,0
1,2,Customer_2,62,0,New York,1,48.76,172,0
2,3,Customer_3,24,0,Los Angeles,5,85.47,460,0
3,4,Customer_4,36,0,Miami,3,97.94,297,1
4,5,Customer_5,46,0,Miami,19,58.14,266,0


### Feature Selection

In [11]:
# Here we remove catageorical values and use only numerical values.

X = df.drop(['CustomerID', 'Name', 'Churn', 'Location'], axis=1)
X

Unnamed: 0,Age,Gender,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB
0,63,1,17,73.36,236
1,62,0,1,48.76,172
2,24,0,5,85.47,460
3,36,0,3,97.94,297
4,46,0,19,58.14,266
...,...,...,...,...,...
99995,33,1,23,55.13,226
99996,62,0,19,61.65,351
99997,64,1,17,96.11,251
99998,51,0,20,49.25,434


In [12]:
y = df['Churn']
y

0        0
1        0
2        0
3        1
4        0
        ..
99995    1
99996    0
99997    1
99998    1
99999    1
Name: Churn, Length: 100000, dtype: int64

In [13]:
y.info()

<class 'pandas.core.series.Series'>
RangeIndex: 100000 entries, 0 to 99999
Series name: Churn
Non-Null Count   Dtype
--------------   -----
100000 non-null  int64
dtypes: int64(1)
memory usage: 781.4 KB


### Normalization

In [14]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [15]:
X_scaled

array([[0.86538462, 1.        , 0.69565217, 0.61942857, 0.41333333],
       [0.84615385, 0.        , 0.        , 0.268     , 0.27111111],
       [0.11538462, 0.        , 0.17391304, 0.79242857, 0.91111111],
       ...,
       [0.88461538, 1.        , 0.69565217, 0.94442857, 0.44666667],
       [0.63461538, 0.        , 0.82608696, 0.275     , 0.85333333],
       [0.17307692, 0.        , 0.7826087 , 0.66528571, 0.27333333]])

In [16]:
print(scaler.data_max_)
print(scaler.data_min_)

[ 70.   1.  24. 100. 500.]
[18.  0.  1. 30. 50.]


#### Splitting of data into test and train subsets.

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, random_state= 10)

### Model Selection

#### 1. Logistic Regression

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
model_lr = LogisticRegression()

In [19]:
model_lr.fit(X_train, y_train)

In [20]:
predict_lr  =  model_lr.predict(X_test)

In [21]:
print(pd.DataFrame(confusion_matrix(y_test, predict_lr), columns= ['Predicted No', 'Predicted Yes'], index= ['Actual No', 'Actual Yes']))
print(classification_report(y_test, predict_lr))

            Predicted No  Predicted Yes
Actual No           9913           5195
Actual Yes          9801           5091
              precision    recall  f1-score   support

           0       0.50      0.66      0.57     15108
           1       0.49      0.34      0.40     14892

    accuracy                           0.50     30000
   macro avg       0.50      0.50      0.49     30000
weighted avg       0.50      0.50      0.49     30000



#### 2. Random Forest

In [22]:
from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier()

In [23]:
model_rf.fit(X_train, y_train)
predict_rf  =  model_rf.predict(X_test)

In [24]:
predict_rf

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

In [28]:
print("Accuracy :", accuracy_score(y_test, predict_rf)*100)
print(pd.DataFrame(confusion_matrix(y_test, predict_rf), columns= ['Predicted No', 'Predicted Yes'], index= ['Actual No', 'Actual Yes']))
print(classification_report(y_test, predict_rf))

Accuracy : 50.36000000000001
            Predicted No  Predicted Yes
Actual No           7918           7190
Actual Yes          7702           7190
              precision    recall  f1-score   support

           0       0.51      0.52      0.52     15108
           1       0.50      0.48      0.49     14892

    accuracy                           0.50     30000
   macro avg       0.50      0.50      0.50     30000
weighted avg       0.50      0.50      0.50     30000



#### 3. Decision Tree Classifier

In [33]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(criterion= "gini", random_state= 100, max_depth= 6, min_samples_leaf= 8)
dt.fit(X_train, y_train)
predict_dt = dt.predict(X_test)

In [34]:
predict_dt

array([1, 1, 0, ..., 0, 0, 0], dtype=int64)

In [35]:
print("Score: ",dt.score(X_test,y_test))
print("Accuracy :", accuracy_score(y_test, predict_dt)*100)
print(pd.DataFrame(confusion_matrix(y_test, predict_dt), columns= ['Predicted No', 'Predicted Yes'], index= ['Actual No', 'Actual Yes']))
print(classification_report(y_test, predict_dt))

Score:  0.5027
Accuracy : 50.27
            Predicted No  Predicted Yes
Actual No          10974           4134
Actual Yes         10785           4107
              precision    recall  f1-score   support

           0       0.50      0.73      0.60     15108
           1       0.50      0.28      0.36     14892

    accuracy                           0.50     30000
   macro avg       0.50      0.50      0.48     30000
weighted avg       0.50      0.50      0.48     30000



##### Resampling

In [36]:
from imblearn.combine import SMOTEENN
sm = SMOTEENN()
X_resampled1, y_resampled1 = sm.fit_resample(X,y)

In [51]:
Xr_train1,Xr_test1,yr_train1,yr_test1=train_test_split(X_resampled1, y_resampled1,test_size=0.3)

##### Logistic Regression

In [52]:
lr_smot = LogisticRegression()
lr_smot.fit(Xr_train1, yr_train1)


In [53]:
predict_lr_smot = lr_smot.predict(Xr_test1)

In [54]:
predict_lr_smot

array([1, 0, 1, ..., 1, 1, 0], dtype=int64)

In [55]:
print("Accuracy :", accuracy_score(yr_test1, predict_lr_smot)*100)
print("Confusion Matrix:\n", confusion_matrix(yr_test1, predict_lr_smot))
print(classification_report(yr_test1, predict_lr_smot))

Accuracy : 49.24764090793165
Confusion Matrix:
 [[ 632 1292]
 [ 698 1299]]
              precision    recall  f1-score   support

           0       0.48      0.33      0.39      1924
           1       0.50      0.65      0.57      1997

    accuracy                           0.49      3921
   macro avg       0.49      0.49      0.48      3921
weighted avg       0.49      0.49      0.48      3921



##### Random Forest

In [66]:
rf_smot = RandomForestClassifier()
rf_smot.fit(Xr_train1, yr_train1)

In [67]:
predict_rf_smot = rf_smot.predict(Xr_test1)
predict_rf_smot

array([1, 0, 0, ..., 0, 0, 0], dtype=int64)

In [68]:
print("Accuracy :", accuracy_score(yr_test1, predict_rf_smot)*100)
print("Confusion Matrix:\n", confusion_matrix(yr_test1, predict_rf_smot))
print(classification_report(yr_test1, predict_rf_smot))

Accuracy : 68.5539403213466
Confusion Matrix:
 [[1306  618]
 [ 615 1382]]
              precision    recall  f1-score   support

           0       0.68      0.68      0.68      1924
           1       0.69      0.69      0.69      1997

    accuracy                           0.69      3921
   macro avg       0.69      0.69      0.69      3921
weighted avg       0.69      0.69      0.69      3921



##### Decision Tree

In [89]:
dt_smot = DecisionTreeClassifier(criterion= "gini", random_state= 100)
dt_smot.fit(Xr_train1, yr_train1)

In [90]:
predict_dt_smot = dt_smot.predict(Xr_test1)
predict_dt_smot

array([1, 0, 1, ..., 0, 1, 0], dtype=int64)

In [91]:
print("Accuracy :", accuracy_score(yr_test1, predict_dt_smot)*100)
print("Confusion Matrix:\n", confusion_matrix(yr_test1, predict_dt_smot))
print(classification_report(yr_test1, predict_dt_smot))

Accuracy : 61.59143075745983
Confusion Matrix:
 [[1167  757]
 [ 749 1248]]
              precision    recall  f1-score   support

           0       0.61      0.61      0.61      1924
           1       0.62      0.62      0.62      1997

    accuracy                           0.62      3921
   macro avg       0.62      0.62      0.62      3921
weighted avg       0.62      0.62      0.62      3921

