# Loading the Data

In [1]:
import pandas as pd

In [2]:
# Absolute, machine-specific Windows path to the raw churn CSV.
# NOTE(review): this only resolves on the original author's machine; consider a
# path relative to a configurable data directory.
file_path = "C:\\Users\\mh183\\OneDrive\\Documents\\customer_churn_large_dataset.csv"
# Read the whole CSV into a DataFrame; every later cell derives from `data`.
data = pd.read_csv(file_path)

In [3]:
# Print column names, non-null counts and dtypes to sanity-check the load.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   CustomerID                  100000 non-null  int64  
 1   Name                        100000 non-null  object 
 2   Age                         100000 non-null  int64  
 3   Gender                      100000 non-null  object 
 4   Location                    100000 non-null  object 
 5   Subscription_Length_Months  100000 non-null  int64  
 6   Monthly_Bill                100000 non-null  float64
 7   Total_Usage_GB              100000 non-null  int64  
 8   Churn                       100000 non-null  int64  
dtypes: float64(1), int64(5), object(3)
memory usage: 6.9+ MB


In [4]:
# Drop the identifier columns so they are not used as model features.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer raises KeyError once the
# columns are already gone.
data = data.drop(columns=['CustomerID', 'Name'], errors='ignore')

In [5]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [6]:
# Fit the encoder once on the Gender column and keep it around so the same
# mapping can be reused later on new records.
label_encoder = LabelEncoder().fit(data['Gender'])
data['Gender'] = label_encoder.transform(data['Gender'])

# One-hot encode Location; drop_first=True leaves the first category as the
# implicit baseline (all Location_* columns equal to zero/False).
data = pd.get_dummies(data, columns=['Location'], drop_first=True)

# Checking for missing data

First, we check whether any values are missing.

In [7]:
# Per-column count of missing entries (isna is the same check as isnull).
missing_data = data.isna().sum()

In [8]:
# Display the per-column missing-value counts computed in the previous cell.
missing_data

Age                           0
Gender                        0
Subscription_Length_Months    0
Monthly_Bill                  0
Total_Usage_GB                0
Churn                         0
Location_Houston              0
Location_Los Angeles          0
Location_Miami                0
Location_New York             0
dtype: int64

# Handling missing data

In [9]:
from sklearn.impute import SimpleImputer

In [10]:
# Numeric feature columns that receive median imputation.
columns_to_impute = ['Age', 'Subscription_Length_Months', 'Monthly_Bill', 'Total_Usage_GB']

# Replace any missing value with the column median. The missing-value check
# above reports zero gaps, so on this dataset the visible effect is only that
# the imputed columns come back as floats (e.g. Age 54 -> 54.0).
imputer = SimpleImputer(strategy='median')
data[columns_to_impute] = imputer.fit_transform(data[columns_to_impute])

# Handling the outliers

Here we remove the outliers rather than imputing them, because we do not want to train the model on erroneous data.

In [11]:
import numpy as np

In [12]:
# Numeric columns screened for outliers with the 1.5 * IQR rule.
outlier_columns = ['Age', 'Subscription_Length_Months', 'Monthly_Bill', 'Total_Usage_GB']

# Per-column quartiles and interquartile range.
Q1 = data[outlier_columns].quantile(0.25)
Q3 = data[outlier_columns].quantile(0.75)
IQR = Q3 - Q1

# Define a mask for outliers
# Cell-wise boolean frame: True where a value lies outside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] for its column.
outlier_mask = (data[outlier_columns] < (Q1 - 1.5 * IQR)) | (data[outlier_columns] > (Q3 + 1.5 * IQR))

# Remove outliers from the dataset
# Collapse the mask to one flag per row before filtering. Indexing with the
# 2-D frame itself (data[~outlier_mask]) behaves like DataFrame.where: it keeps
# every row, blanks flagged cells to NaN, and turns every column that is absent
# from the mask (Gender, Churn, Location_*) into NaN as well.
data_no_outliers = data[~outlier_mask.any(axis=1)]

In [13]:
# if you want to save the data  

# data_no_outliers.to_csv('clean_data.csv', index=False)

# Train and Test

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): this splits `data`, not `data_no_outliers`, so the outlier
# filtering above does not influence the model - confirm that is intended.
train_set, test_set = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Display the training split (pandas truncates the rendering of large frames).
train_set

Unnamed: 0,Age,Gender,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn,Location_Houston,Location_Los Angeles,Location_Miami,Location_New York
75220,54.0,0,5.0,84.50,205.0,1,False,False,False,True
48955,28.0,1,24.0,82.06,239.0,1,False,False,False,True
44966,57.0,1,12.0,52.29,62.0,1,False,False,False,False
13568,19.0,1,19.0,32.57,173.0,1,True,False,False,False
92727,56.0,0,8.0,33.52,314.0,1,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...
6265,35.0,1,21.0,67.33,235.0,0,False,False,True,False
54886,56.0,1,13.0,85.40,347.0,0,False,False,False,False
76820,69.0,1,2.0,76.24,321.0,1,True,False,False,False
860,55.0,1,12.0,89.19,315.0,1,False,False,False,False


In [17]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [18]:
# Separate the predictors from the Churn target for each split.
# The target is copied so it is independent of the source frame.
x_train, y_train = train_set.drop(columns="Churn"), train_set["Churn"].copy()
x_test, y_test = test_set.drop(columns="Churn"), test_set["Churn"].copy()

In [19]:
# Learn the scaling statistics from the training features only, then apply the
# same transformation to the test features.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [20]:
# Show both shapes. Only the last expression of a cell is rendered, so two bare
# expressions on separate lines would silently discard the first one.
x_test.shape, y_test.shape

(20000,)

In [21]:
from sklearn.ensemble import HistGradientBoostingClassifier

In [22]:
# Gradient-boosted tree classifier with default hyper-parameters.
# A fixed random_state makes repeated fits reproducible; per the scikit-learn
# documentation it controls the binning subsample and the train/validation
# split used when early stopping is enabled. 42 matches the seed used for
# train_test_split.
clf = HistGradientBoostingClassifier(random_state=42)

In [23]:
# Train on the unscaled training features (x_train, not x_train_scaled).
clf.fit(X=x_train, y=y_train)

found 0 physical cores < 1
  File "C:\ProgramData\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 217, in _count_physical_cores
    raise ValueError(


In [24]:
# Predicted churn labels for the held-out test features.
y_pred = clf.predict(X=x_test)
y_pred

array([0, 1, 0, ..., 0, 1, 1], dtype=int64)

In [25]:
from sklearn.model_selection import cross_val_score

In [26]:
# 30-fold cross-validation on the training split.
# With 0/1 labels the squared error of a prediction is 1 exactly when it is
# wrong, so the (negated) MSE per fold is the misclassification rate and the
# printed values are its square root.
# NOTE(review): a classification scorer such as "accuracy" would be more direct.
score = cross_val_score(
    estimator=clf,
    X=x_train,
    y=y_train,
    scoring="neg_mean_squared_error",
    cv=30,
)
# The scorer returns negated errors; flip the sign before taking the root.
qrt_score = np.sqrt(np.negative(score))
print(qrt_score)

[0.70591268 0.70750437 0.70325186 0.70111594 0.70191767 0.70988521
 0.70617821 0.70882804 0.69950971 0.71331017 0.70511549 0.70882804
 0.71225809 0.70538132 0.71776431 0.7069742  0.7109408  0.70084849
 0.70458353 0.70511549 0.71291799 0.71212835 0.70231639 0.70657612
 0.70737196 0.69964089 0.71133782 0.71423213 0.71396949 0.71081032]


In [27]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [28]:
# Confirm that the test features and test labels have matching row counts.
print(f"X_test shape: {x_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_test shape: (20000, 9)
y_test shape: (20000,)


In [29]:
# Binarise the Churn column: values above the threshold become 1, the rest 0.
threshold_value = 0.5
y = data['Churn']
y_binary = y.gt(threshold_value).astype(int)

In [30]:
# Score the test-set predictions with the four standard classification metrics,
# evaluated in the same order as the names on the left-hand side.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# using joblib to dump the file

In [31]:
# from joblib import dump

In [32]:
# dump(clf, 'Customer Churn Prediction.py')

In [41]:
def predict_churn(new_data):
    """Predict churn labels for raw (un-encoded) customer records.

    The records are encoded with the artefacts fitted during training and then
    aligned to the training feature layout before being passed to ``clf``.

    Parameters
    ----------
    new_data : pandas.DataFrame
        One row per customer, using the raw column names ('Age', 'Gender',
        'Location', 'Subscription_Length_Months', 'Monthly_Bill',
        'Total_Usage_GB'). The frame is not modified.

    Returns
    -------
    The value returned by ``clf.predict`` (one label per input row).

    Raises
    ------
    ValueError
        Propagated from ``label_encoder.transform`` when 'Gender' holds a
        label the encoder was not fitted on.
    """
    # Work on a copy so the caller's frame keeps its raw values.
    features = new_data.copy()

    # Reuse the mapping learned on the training data. Re-fitting here would
    # rebuild the mapping from the new rows alone (a single row always maps to 0).
    if 'Gender' in features.columns:
        features['Gender'] = label_encoder.transform(features['Gender'])

    # Expand Location without drop_first: on a small batch drop_first would
    # discard whichever category happens to sort first, not the training baseline.
    if 'Location' in features.columns:
        features = pd.get_dummies(features, columns=['Location'])

    # Align to the training layout: same columns, same order. Dummy columns not
    # present in this batch (and any other missing feature) are filled with 0;
    # columns unknown to the model, including the baseline dummy, are dropped.
    features = features.reindex(columns=x_train.columns, fill_value=0)

    # clf was fitted on the unscaled x_train, so the features are passed
    # unscaled too. Re-fitting the scaler on new rows would also zero out a
    # single-row input.
    churn_predictions = clf.predict(features)
    return churn_predictions

In [42]:
    # Interactively collect one customer's raw attributes from the user.
    # int()/float() raise ValueError if the typed text is not numeric.
    age = int(input("Enter customer age: "))
    gender = input("Enter customer gender (Male/Female): ")
    location = input("Enter customer location: ")
    subscription_length = int(input("Enter subscription length in months: "))
    monthly_bill = float(input("Enter monthly bill amount: "))
    total_usage_gb = float(input("Enter total usage in GB: "))

Enter customer age: 34
Enter customer gender (Male/Female): Female
Enter customer location: Miami
Enter subscription length in months: 21
Enter monthly bill amount: 50.21
Enter total usage in GB: 299


In [43]:
import pandas as pd
# Assemble the collected answers into a one-row frame that uses the raw
# (un-encoded) column names expected by predict_churn.
new_customer_data = pd.DataFrame([
    {
        'Age': age,
        'Gender': gender,
        'Location': location,
        'Subscription_Length_Months': subscription_length,
        'Monthly_Bill': monthly_bill,
        'Total_Usage_GB': total_usage_gb,
    }
])

In [44]:
# Run the single-customer record through the prediction helper and report it.
churn_prediction = predict_churn(new_customer_data)
print(f"Churn Prediction: {churn_prediction}")

Churn Prediction: [0]


