In [9]:
import numpy as np
import pandas as pd

In [10]:
# Load the churn dataset from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer='customer_churn_large_dataset.csv')
df.head(n=5)

Unnamed: 0,CustomerID,Name,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
0,1,Customer_1,63,Male,Los Angeles,17,73.36,236,0
1,2,Customer_2,62,Female,New York,1,48.76,172,0
2,3,Customer_3,24,Female,Los Angeles,5,85.47,460,0
3,4,Customer_4,36,Female,Miami,3,97.94,297,1
4,5,Customer_5,46,Female,Miami,19,58.14,266,0


In [11]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

CustomerID                    0
Name                          0
Age                           0
Gender                        0
Location                      0
Subscription_Length_Months    0
Monthly_Bill                  0
Total_Usage_GB                0
Churn                         0
dtype: int64

In [12]:
# Remove the identifier columns, which carry no predictive signal.
# Rebinding instead of mutating in place, together with errors='ignore',
# makes this cell safe to re-run: a second execution no longer raises
# KeyError because the columns are already gone.
df = df.drop(columns=['CustomerID', 'Name'], errors='ignore')


In [13]:
# Preview the frame again to confirm which columns remain.
df.head(n=5)

Unnamed: 0,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
0,63,Male,Los Angeles,17,73.36,236,0
1,62,Female,New York,1,48.76,172,0
2,24,Female,Los Angeles,5,85.47,460,0
3,36,Female,Miami,3,97.94,297,1
4,46,Female,Miami,19,58.14,266,0


In [14]:
# Categorical features: object-typed columns, plus low-cardinality columns
# (at most 11 distinct values). The target column "Churn" is always excluded.
#
# The dtype is checked explicitly; the previous `col in "O"` was a substring
# test on the column *name* and never matched. The target is excluded with an
# equality test rather than a substring test, and the parentheses make the
# and/or grouping explicit.
categorical_variables = [
    col for col in df.columns
    if col != "Churn"
    and (df[col].dtype == "object" or df[col].nunique() <= 11)
]

categorical_variables

['Gender', 'Location']

In [15]:
# Numeric features: non-object columns with more than 11 distinct values.
# "CustomerID" is excluded by exact name; the previous `col not in "CustomerID"`
# was a substring test and would also have dropped columns such as "ID" or
# "Customer".
numeric_variables = [
    col for col in df.columns
    if df[col].dtype != "object"
    and df[col].nunique() > 11
    and col != "CustomerID"
]
numeric_variables

['Age', 'Subscription_Length_Months', 'Monthly_Bill', 'Total_Usage_GB']

In [16]:
# Module-level list kept for backward compatibility: code that reads
# `outliers` sees the result of the most recent outliers_iqr() call.
outliers = []


def outliers_iqr(col):
    """Return the values of ``col`` that fall outside the 1.5 * IQR fences.

    A value is reported when it is below ``q1 - 1.5 * IQR`` or above
    ``q3 + 1.5 * IQR``, where ``q1``/``q3`` are the 25th/75th percentiles
    computed with ``np.percentile``.

    Parameters
    ----------
    col : iterable of numbers
        The values to inspect (for example a pandas Series or a list).

    Returns
    -------
    list
        The outlying values in ascending order. An empty input yields ``[]``.

    Notes
    -----
    Each call starts from a clean result. Previously the function appended to
    the shared module-level ``outliers`` list without resetting it, so values
    found for one column leaked into the result for every later column.
    The module-level ``outliers`` list is still updated (in place) to mirror
    the latest result.
    """
    values = sorted(col)
    if not values:
        # np.percentile cannot handle an empty sequence; nothing can be an outlier.
        outliers[:] = []
        return []

    q1, q3 = np.percentile(values, [25, 75])
    iqr = q3 - q1
    lwr_bound = q1 - (1.5 * iqr)
    upr_bound = q3 + (1.5 * iqr)

    found = [value for value in values if value < lwr_bound or value > upr_bound]

    # Slice assignment copies `found`, so the returned list is independent of
    # the module-level mirror and is not altered by later calls.
    outliers[:] = found
    return found

In [17]:
# Report IQR outliers for Age. Print the value returned by the call
# (`Outliers`) rather than the unrelated-looking global `outliers`.
Outliers = outliers_iqr(df['Age'])
print("Outliers from IQR method =", Outliers)

Outliers from IQR method = []


In [18]:
# Report IQR outliers for Subscription_Length_Months. Print the value
# returned by the call (`Outliers`) rather than the global `outliers`.
Outliers = outliers_iqr(df['Subscription_Length_Months'])
print("Outliers from IQR method =", Outliers)

Outliers from IQR method = []


In [19]:
# Report IQR outliers for Monthly_Bill. Print the value returned by the
# call (`Outliers`) rather than the global `outliers`.
Outliers = outliers_iqr(df['Monthly_Bill'])
print("Outliers from IQR method =", Outliers)


Outliers from IQR method = []


In [20]:
# Report IQR outliers for Total_Usage_GB. Print the value returned by the
# call (`Outliers`) rather than the global `outliers`.
Outliers = outliers_iqr(df['Total_Usage_GB'])
print("Outliers from IQR method =", Outliers)

Outliers from IQR method = []


In [21]:
from sklearn.preprocessing import LabelEncoder





In [22]:
# Encode the two string columns as integer codes.
# Each column gets its own encoder so that both fitted mappings remain
# available afterwards (e.g. for inverse_transform or for encoding new
# inputs). Refitting a single encoder on Location discarded the Gender
# mapping.
gender_encoder = LabelEncoder()
location_encoder = LabelEncoder()
df['Gender'] = gender_encoder.fit_transform(df['Gender'])
df['Location'] = location_encoder.fit_transform(df['Location'])
# `le` used to end up fitted on Location; keep that binding for any cell
# that still refers to it.
le = location_encoder
df.head()


Unnamed: 0,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
0,63,1,2,17,73.36,236,0
1,62,0,4,1,48.76,172,0
2,24,0,2,5,85.47,460,0
3,36,0,3,3,97.94,297,1
4,46,0,3,19,58.14,266,0


In [23]:
# Feature matrix: every column except the target.
X = df.drop(columns=['Churn'])
X

Unnamed: 0,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB
0,63,1,2,17,73.36,236
1,62,0,4,1,48.76,172
2,24,0,2,5,85.47,460
3,36,0,3,3,97.94,297
4,46,0,3,19,58.14,266
...,...,...,...,...,...,...
99995,33,1,1,23,55.13,226
99996,62,0,4,19,61.65,351
99997,64,1,0,17,96.11,251
99998,51,0,4,20,49.25,434


In [24]:
# Target vector as a NumPy array.
y = df['Churn'].to_numpy()
y

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

In [26]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [29]:
from sklearn.ensemble import RandomForestClassifier

# Random forest baseline: fit on the training split, score on the held-out split.
rfc = RandomForestClassifier(
    n_estimators=20,
    max_depth=10,
    random_state=10,
).fit(X_train, y_train)
y_pred_1 = rfc.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred_1)

0.4967

In [30]:
# Per-class precision / recall / F1 for the random forest predictions.
print(classification_report(y_true=y_test, y_pred=y_pred_1))

              precision    recall  f1-score   support

           0       0.50      0.57      0.53     10076
           1       0.49      0.43      0.46      9924

    accuracy                           0.50     20000
   macro avg       0.50      0.50      0.49     20000
weighted avg       0.50      0.50      0.49     20000



In [31]:
from sklearn.linear_model import LogisticRegression

# Logistic regression baseline: fit on the training split, score on the held-out split.
lr = LogisticRegression(random_state=20).fit(X_train, y_train)
y_pred_2 = lr.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred_2)

0.49795

In [32]:
# Per-class precision / recall / F1 for the logistic regression predictions.
print(classification_report(y_true=y_test, y_pred=y_pred_2))

              precision    recall  f1-score   support

           0       0.50      0.62      0.55     10076
           1       0.49      0.38      0.43      9924

    accuracy                           0.50     20000
   macro avg       0.50      0.50      0.49     20000
weighted avg       0.50      0.50      0.49     20000



In [52]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours baseline with default settings.
knn = KNeighborsClassifier().fit(X_train, y_train)
y_pred_3 = knn.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred_3)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


0.49855

In [53]:
# Per-class precision / recall / F1 for the k-nearest-neighbours predictions.
print(classification_report(y_true=y_test, y_pred=y_pred_3))

              precision    recall  f1-score   support

           0       0.50      0.50      0.50     10076
           1       0.49      0.50      0.50      9924

    accuracy                           0.50     20000
   macro avg       0.50      0.50      0.50     20000
weighted avg       0.50      0.50      0.50     20000



In [54]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree baseline: fit on the training split, score on the held-out split.
dtc = DecisionTreeClassifier(random_state=20).fit(X_train, y_train)
y_pred_4 = dtc.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred_4)

0.49755

In [55]:
# Per-class precision / recall / F1 for the decision tree predictions.
print(classification_report(y_true=y_test, y_pred=y_pred_4))

              precision    recall  f1-score   support

           0       0.50      0.49      0.50     10076
           1       0.49      0.51      0.50      9924

    accuracy                           0.50     20000
   macro avg       0.50      0.50      0.50     20000
weighted avg       0.50      0.50      0.50     20000



In [33]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting baseline: fit on the training split, score on the held-out split.
gbc = GradientBoostingClassifier(random_state=20).fit(X_train, y_train)
y_pred = gbc.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.4986

In [34]:
# Per-class precision / recall / F1 for the gradient boosting predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.50      0.58      0.54     10076
           1       0.49      0.42      0.45      9924

    accuracy                           0.50     20000
   macro avg       0.50      0.50      0.50     20000
weighted avg       0.50      0.50      0.50     20000



In [35]:
from sklearn.model_selection import cross_val_score

In [50]:
# KFold is not imported by any other visible cell, so import it here;
# without this the cell raises NameError on a fresh kernel.
from sklearn.model_selection import KFold

# 20-fold cross-validation of the gradient boosting model on the training split.
k_folds = KFold(n_splits = 20)

scores = cross_val_score(gbc, X_train, y_train, cv = k_folds)

print("Cross Validation Scores: ", scores)
print("Average CV Score: ", scores.mean())
print("Number of CV Scores used in Average: ", len(scores))

Cross Validation Scores:  [0.50175 0.51075 0.49525 0.496   0.50125 0.49525 0.4895  0.49575 0.4905
 0.509   0.504   0.5005  0.49875 0.51875 0.51475 0.50875 0.4965  0.51475
 0.504   0.496  ]
Average CV Score:  0.5020874999999999
Number of CV Scores used in Average:  20


In [36]:
# Confusion matrix for the gradient boosting predictions (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[5812, 4264],
       [5764, 4160]], dtype=int64)

In [46]:
# Prompt for one customer's feature values, in the same order as the
# training columns, and predict churn with the fitted gradient boosting model.
Age =  float(input("Enter age  :: (Range is 10 to 100 ) : "))
Gender = float(input("Enter Gender :: (1 for Male , 0 for Female ) : "))
Location = float(input("Enter Location :: (Range is 0 to 4) : "))
Sub = float(input("Enter Subscription Length Moths :: (Range is 0 to 25) : "))
Month = float(input("Enter Monthly Bill :: (Range is 0 to 100) "))
GB = float(input("Enter Total usage of GB :: (Range is 0 to 500) : "))
data = (Age, Gender, Location, Sub, Month, GB)
input_Data = np.asarray(data)
input_data_reshape = input_Data.reshape(1,-1)
# gbc is fitted on the raw, unscaled X_train, and no `scaler` is defined in
# any visible cell. Passing the input through scaler.transform either raised
# NameError or fed the model values on a different scale than it was trained
# on. Predict on the raw values instead, labelled with the training column
# names so the feature order is explicit.
input_frame = pd.DataFrame(input_data_reshape, columns=X_train.columns)
ans = gbc.predict(input_frame)
if ans[0] == 0:
    print('Customer will not leave')
else:
    print('Customer will leave')

Enter age  :: (Range is 10 to 100 ) : 33
Enter Gender :: (1 for Male , 0 for Female ) : 1
Enter Location :: (Range is 0 to 4) : 3
Enter Subscription Length Moths :: (Range is 0 to 25) : 3
Enter Monthly Bill :: (Range is 0 to 100) 44
Enter Total usage of GB :: (Range is 0 to 500) : 55
Customer will not leave


