In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

df = pd.read_csv("Customer Churn.csv")
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [2]:
# Define the columns to keep
columns_to_keep = ['tenure', 'MonthlyCharges','TotalCharges', 'Contract', 'PaymentMethod', 'InternetService', 'Churn']

# Select only the specified columns
df = df.loc[:,columns_to_keep]

In [3]:
df.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,Contract,PaymentMethod,InternetService,Churn
0,1,29.85,29.85,Month-to-month,Electronic check,DSL,No
1,34,56.95,1889.5,One year,Mailed check,DSL,No
2,2,53.85,108.15,Month-to-month,Mailed check,DSL,Yes
3,45,42.3,1840.75,One year,Bank transfer (automatic),DSL,No
4,2,70.7,151.65,Month-to-month,Electronic check,Fiber optic,Yes


In [4]:
df['Contract'].value_counts()

Contract
Month-to-month    3875
Two year          1695
One year          1473
Name: count, dtype: int64

In [5]:
df['PaymentMethod'].value_counts()

PaymentMethod
Electronic check             2365
Mailed check                 1612
Bank transfer (automatic)    1544
Credit card (automatic)      1522
Name: count, dtype: int64

In [6]:
df.shape

(7043, 7)

In [7]:
# Convert 'TotalCharges' column to float, and handle errors='coerce' to replace non-numeric values with NaN
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(0, inplace=True)


In [8]:
#use label encoder
from sklearn.preprocessing import LabelEncoder
# Initialize the LabelEncoder
label_encoder = LabelEncoder()
# List of columns to label encode
categorical_cols = ['InternetService', 'PaymentMethod','Contract', 'Churn']
# Apply label encoding to each column
for col in categorical_cols:
    df[col] = label_encoder.fit_transform(df[col])

In [9]:
df.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,Contract,PaymentMethod,InternetService,Churn
0,1,29.85,29.85,0,2,0,0
1,34,56.95,1889.5,1,3,0,0
2,2,53.85,108.15,0,3,0,1
3,45,42.3,1840.75,1,0,0,0
4,2,70.7,151.65,0,2,1,1


In [10]:
x = df.drop('Churn',axis = 1)
y = df['Churn']

In [11]:
# Split data into training and testing sets
X_train,X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [12]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Logistic regression

In [13]:
model = RandomForestClassifier(n_estimators=100,random_state=42)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

In [14]:
y_pred

array([1, 0, 0, ..., 0, 0, 0])

In [15]:
from sklearn.metrics import accuracy_score

In [16]:
accuracy_score(y_test,y_pred)

0.772888573456352

In [17]:
from sklearn.model_selection import cross_val_score
cross_val_score(model,x,y,cv=5, scoring='accuracy')

array([0.76437189, 0.79347055, 0.76011356, 0.77414773, 0.78125   ])

In [18]:
import pickle
import gzip

# Save with gzip compression
with gzip.open("model.pkl.gz", "wb") as f:
    pickle.dump(model, f)

print("✅ Model saved successfully!")

✅ Model saved successfully!


In [19]:
df.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,Contract,PaymentMethod,InternetService,Churn
0,1,29.85,29.85,0,2,0,0
1,34,56.95,1889.5,1,3,0,0
2,2,53.85,108.15,0,3,0,1
3,45,42.3,1840.75,1,0,0,0
4,2,70.7,151.65,0,2,1,1


In [20]:
def prediction(tenure,TotalCharges, MonthlyCharges, InternetService, Contract, PaymentMethod):
    data = {
    'tenure': [tenure],
    'TotalCharges': [TotalCharges],
    'MonthlyCharges': [MonthlyCharges],
    'InternetService': [InternetService],
    'Contract': [Contract],
    'PaymentMethod': [PaymentMethod]
    }
    # Create a DataFrame from the dictionary
    df = pd.DataFrame(data)


    # Encode the categorical columns
    categorical_columns = ['InternetService', 'Contract']
    for column in categorical_columns:
        df[column] = label_encoder.fit_transform(df[column])
    df = scaler.fit_transform(df)

    result = model.predict(df).reshape(1,-1)
    return result[0]