In [3]:
import pandas as  pd 
import numpy as np 
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from xgboost import  XGBClassifier
from sklearn.metrics import f1_score


In [4]:
# Read the churn dataset from a CSV located in the current working directory.
data = pd.read_csv("churn_data.csv")
# Display (rows, columns) as a quick sanity check of the load.
data.shape


(7043, 28)

In [5]:
# Remove the identifier and the columns that are dropped before the feature
# matrix is built. Reassigning (instead of inplace=True) together with
# errors="ignore" keeps this cell safe to re-run: executing it a second time no
# longer raises KeyError for columns that are already gone.
data = data.drop(
    columns=['customerID', 'gender', 'PhoneService', 'MultipleLines',
             'StreamingTV', 'StreamingMovies', 'Partner', 'Dependents'],
    errors="ignore",
)


In [6]:
# Encode the target column as integers: "Yes" -> 1, "No" -> 0.
data["Churn"] = data["Churn"].map({"Yes" : 1 , "No" : 0}).astype(int)

In [7]:

# Yes/No style flags stored as text; encode them as 0/1 integers.
# "No internet service" is folded into 0 alongside "No".
binary_cols = ["OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "PaperlessBilling"]
yes_no_codes = {"Yes": 1, "No": 0, "No internet service": 0}

# Only touch the flags that actually exist in the frame.
for flag_col in (name for name in binary_cols if name in data.columns):
    data[flag_col] = data[flag_col].map(yes_no_codes).astype(int)


In [8]:
multiclass_cols = ["InternetService", "Contract", "PaymentMethod", "Tenure_bucket"]

# One-hot encode whichever multi-category columns are present, dropping the
# first level of each so the dummies are not redundant.
present_multiclass = [name for name in multiclass_cols if name in data.columns]
data = pd.get_dummies(data, columns=present_multiclass, drop_first=True)

In [9]:
# Find any boolean-typed columns (presumably the dummies created above — the
# dtype depends on the pandas version) and store them as 0/1 integers.
bool_cols = data.select_dtypes(include="bool").columns
data[bool_cols] = data[bool_cols].astype(int)

In [10]:
# Force every column to a numeric dtype; values that cannot be parsed become NaN.
data = data.apply(pd.to_numeric, errors="coerce")

# Fill the resulting gaps with each column's median.
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split further down, so test rows influence the imputed values.
column_medians = data.median(numeric_only=True)
data = data.fillna(column_medians)

In [11]:
# Inspect dtypes and non-null counts after encoding and imputation.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 26 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   SeniorCitizen                          7043 non-null   int64  
 1   tenure                                 7043 non-null   int64  
 2   OnlineSecurity                         7043 non-null   int32  
 3   OnlineBackup                           7043 non-null   int32  
 4   DeviceProtection                       7043 non-null   int32  
 5   TechSupport                            7043 non-null   int32  
 6   PaperlessBilling                       7043 non-null   int32  
 7   MonthlyCharges                         7043 non-null   float64
 8   TotalCharges                           7043 non-null   float64
 9   Churn                                  7043 non-null   int32  
 10  avg_monthly_spend                      7043 non-null   float64
 11  Earl

In [12]:
# Separate the target (Churn) from the predictor columns.
y = data["Churn"]
x = data.drop(columns=["Churn"])

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the churn ratio
# the same in both splits; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)


In [14]:
# Three-step pipeline: standardise the features, oversample with SMOTE, then
# fit an XGBoost classifier. All stochastic steps share the same seed.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("smote", SMOTE(random_state=42)),
        (
            "model",
            XGBClassifier(
                n_estimators=300,
                max_depth=4,
                learning_rate=0.05,
                subsample=0.8,
                colsample_bytree=0.8,
                objective="binary:logistic",
                eval_metric="auc",
                random_state=42,
            ),
        ),
    ]
)


In [15]:
# Fit every pipeline step (scaler, SMOTE, classifier) on the training split only.
pipeline.fit(x_train, y_train)

0,1,2
,steps,"[('scaler', ...), ('smote', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,sampling_strategy,'auto'
,random_state,42
,k_neighbors,5

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [16]:
# Keep column 1 of predict_proba for each test row — the probability of the
# positive class (Churn == 1, given the 0/1 encoding of the target).
y_proba = pipeline.predict_proba(x_test)[:, 1]

In [17]:
# Sweep 100 evenly spaced cut-offs between 0.2 and 0.6 and keep the one that
# yields the highest F1 score.
# NOTE(review): the sweep is scored on the test split, so the chosen threshold
# (and any metric reported with it) is optimistic — consider tuning on a
# separate validation split or on out-of-fold predictions instead.
thresholds = np.linspace(0.2, 0.6, 100)
f1s = [f1_score(y_test, (y_proba > cutoff).astype(int)) for cutoff in thresholds]

best_threshold = thresholds[np.argmax(f1s)]
best_threshold

0.3939393939393939

In [18]:
# Turn the test probabilities into hard 0/1 predictions using the selected threshold.
y_pred_opt = (y_proba > best_threshold).astype(int)

In [19]:
# Record the feature names in training order; predict_churn uses this list to
# align the columns of a new customer record before scoring.
feature_columns = x.columns.tolist()
feature_columns

['SeniorCitizen',
 'tenure',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'PaperlessBilling',
 'MonthlyCharges',
 'TotalCharges',
 'avg_monthly_spend',
 'Early_high_risk',
 'life_time_risk_score',
 'Service_count',
 'Price_shook',
 'customer_value',
 'InternetService_Fiber optic',
 'InternetService_No',
 'Contract_One year',
 'Contract_Two year',
 'PaymentMethod_Credit card (automatic)',
 'PaymentMethod_Electronic check',
 'PaymentMethod_Mailed check',
 'Tenure_bucket_long',
 'Tenure_bucket_loyal',
 'Tenure_bucket_new']

In [43]:
# Example customer record used to try out predict_churn by hand.
# Only a subset of feature_columns is supplied here: the engineered fields
# (e.g. 'Early_high_risk', 'Service_count'), the payment-method dummies and the
# tenure-bucket dummies are absent, so predict_churn fills them with 0.
# NOTE(review): tenure=65 with MonthlyCharges=500 and TotalCharges=1000 does not
# look internally consistent — confirm these values are intentional.
customer_input = {
    'SeniorCitizen': 0,
    'tenure': 65,

    # Add-on service flags (1 = yes, 0 = no).
    'OnlineSecurity': 0,
    'OnlineBackup': 1,
    'DeviceProtection': 0,
    'TechSupport': 0,

    'PaperlessBilling': 0,

    # Billing amounts.
    'MonthlyCharges': 500,
    'TotalCharges': 1000,
    'avg_monthly_spend': 300,

    # One-hot dummies for InternetService.
    'InternetService_Fiber optic': 1,
    'InternetService_No': 0,

    # One-hot dummies for Contract (both 0 = the dropped first level).
    'Contract_One year': 0,
    'Contract_Two year': 0,

}


In [46]:
def predict_churn(pipeline, customer_input, feature_columns, threshold=None):
    """Score a single customer with the fitted churn pipeline.

    Parameters
    ----------
    pipeline : object
        Fitted estimator exposing ``predict`` and ``predict_proba``.
    customer_input : dict
        Mapping of feature name -> value for one customer. Features listed in
        ``feature_columns`` but missing here are filled with 0; keys that are
        not in ``feature_columns`` are ignored.
    feature_columns : list of str
        Column names in the order the pipeline was trained on.
    threshold : float, optional
        Probability cut-off (on the 0-1 scale) above which the customer is
        labelled "Yes". When omitted, the label comes from ``pipeline.predict``.

    Returns
    -------
    dict
        ``{"prediction": "Yes" or "No", "probability": churn probability in
        percent, rounded to 2 decimals, as a plain float}``.
    """
    # reindex selects the training columns in training order and creates any
    # missing ones filled with 0, in a single step.
    df = pd.DataFrame([customer_input]).reindex(columns=feature_columns, fill_value=0)

    # Column 1 of predict_proba is the positive (churn) class; cast to a plain
    # float so the result is JSON-serialisable and prints cleanly.
    churn_proba = float(pipeline.predict_proba(df)[0][1])

    # The previous version had a trailing comma here, which turned `pred` into a
    # 1-tuple; `pred == 1` was then always False and every customer got "No".
    if threshold is None:
        pred = int(pipeline.predict(df)[0])
    else:
        pred = int(churn_proba > threshold)

    return {
        "prediction": "Yes" if pred == 1 else "No",
        "probability": round(churn_proba * 100, 2),
    }


In [47]:
# Smoke-test the helper on the example customer defined above.
print(predict_churn(pipeline, customer_input, feature_columns))

{'prediction': 'No', 'probability': 23.8}


### Creating a directory to save the model

In [35]:
import os
# Create the output folder for saved models; exist_ok=True makes re-running safe.
# NOTE(review): this is an absolute, machine-specific path — consider a
# configurable, project-relative directory.
os.makedirs("D:/Machine_learning_projects/Models", exist_ok=True)

In [36]:
import joblib

In [37]:
# Version tag embedded in the artifact file name.
model_version = 1

# Full path of the versioned pipeline file inside the models folder.
model_path = os.path.join("D:/Machine_learning_projects/Models", f"churn_pipeline_v{model_version}.joblib")

In [38]:
# Persist the fitted pipeline (all steps) to the versioned path.
joblib.dump(pipeline ,model_path)

['D:/Machine_learning_projects/Models\\churn_pipeline_v1.joblib']

## Saving the model with metadata

In [39]:
# Bump the version for the artifact that is meant to carry metadata.
# NOTE(review): this v2 model_path is not passed to any joblib.dump call visible
# in this notebook — confirm whether the packaged model was meant to be written here.
model_version = 2
model_path = os.path.join("D:/Machine_learning_projects/Models", f"churn_pipeline_v{model_version}.joblib")

In [40]:
# Bundle the fitted pipeline with the metadata needed to reproduce its predictions.
model_package = {
    "model": pipeline,
    "features": feature_columns,
    # Store the tuned cut-off instead of a hand-rounded literal (was 0.4) so the
    # saved value cannot drift from best_threshold when the model is re-trained.
    "threshold": float(best_threshold),
    "version": model_version,
    "trained_on": str(pd.Timestamp.now()),
}


In [42]:
import joblib

# Extend the metadata package assembled in the previous cell (threshold,
# version, trained_on) rather than replacing it, so the saved artifact keeps
# the information needed at inference time. The file name is unchanged.
model_package = {
    **model_package,
    "model": pipeline,
    "features": x_train.columns.tolist(),
}

joblib.dump(model_package, "churn_model.joblib")


['churn_model.joblib']