In [22]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler,LabelEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
import joblib

In [31]:
# Load the churn table and discard the "Unnamed: 0" column
# (presumably a leftover row index from the CSV export -- confirm against the file).
df = pd.read_csv("churn.csv").drop(columns="Unnamed: 0")

# Coerce these two columns to numbers: cells that cannot be parsed become NaN,
# and every NaN is then replaced by the column's median.
for name in ("day.charge", "eve.mins"):
    parsed = pd.to_numeric(df[name], errors="coerce")
    df[name] = parsed.fillna(parsed.median())

# Partition the column labels by dtype: object-typed vs. everything else.
is_object = df.dtypes == 'O'
cat_col = df.columns[is_object]
num_col = df.columns[~is_object]

# Function to remove outliers using IQR
def remove_outliers_iqr(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies within the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1 and Q3 are the 25th and 75th percentiles of
    ``df[column]`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    column : label
        Name of the column the fences are computed on.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences. The default
        reproduces the previously hard-coded 1.5 rule; larger values keep
        more rows.

    Returns
    -------
    pandas.DataFrame
        The rows that fall inside the fences, with their original index
        labels. Rows whose ``column`` value is NaN are not returned, because
        NaN fails both comparisons.

    Raises
    ------
    ValueError
        If ``factor`` is negative (the fences would cross).
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # Boolean mask of the rows to keep; both fences are inclusive.
    inside = (values >= lower_bound) & (values <= upper_bound)
    return df[inside]

# Filter on every numeric column in turn. Because `df` is rebound on each pass,
# the fences for a column are computed on the rows that survived the earlier passes.
for col in num_col:
    df = remove_outliers_iqr(df, col)

# Apply log(1 + x) to these two columns.
for log_col in ('voice.messages', 'intl.calls'):
    df[log_col] = np.log1p(df[log_col])


def find_multicollinear_features(dataset, threshold=0.8):
    """Collect the columns that are strongly correlated with an earlier column.

    A column is reported when the absolute value of its correlation (as
    computed by ``dataset.corr()``) with at least one column positioned
    before it is strictly greater than ``threshold``. For each correlated
    pair, only the later column can be reported through that pair.

    Returns a set of column labels.
    """
    abs_corr = dataset.corr().abs()
    flagged = set()

    for position, label in enumerate(abs_corr.columns):
        # Entries left of the diagonal: correlations with the preceding columns.
        with_earlier = abs_corr.iloc[position, :position]
        if (with_earlier > threshold).any():
            flagged.add(label)

    return flagged

# Columns whose absolute correlation with an earlier numeric column exceeds 0.7.
# Computed for inspection only: the drop list on the next line is hard-coded,
# not derived from this set.
high_corr_features = find_multicollinear_features(df[num_col], 0.7)
df = df.drop(columns=['day.charge', 'intl.charge', 'night.charge', 'eve.charge'])

# Non-object columns that remain after the drop. Captured BEFORE one-hot encoding
# so the dummy columns created below are never standardised. The target is
# excluded explicitly so it cannot be scaled even if it is loaded as a
# non-object dtype.
num_col_af = df.columns[df.dtypes != "O"].drop('churn', errors='ignore')

# One-hot encode 'state' and 'area.code'; the first level of each is dropped.
df = pd.get_dummies(df, columns=['state'], drop_first=True)
df = pd.get_dummies(df, columns=['area.code'], drop_first=True)

scal = StandardScaler()
lab = LabelEncoder()

# Label-encode the categorical flag columns and the target column.
# The same encoder is refit for every column, so after the loop `lab` only
# holds the mapping learned from 'churn'.
for col in ['voice.plan', 'intl.plan', 'churn']:
    df[col] = lab.fit_transform(df[col])

training_columns = df.drop(columns=['churn']).columns
x = df.drop(columns="churn")
y = df["churn"]
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, test_size=0.2)

# Standardise AFTER splitting. The scaler learns its mean/variance from the
# training rows only and the test rows are transformed with those statistics;
# fitting on the full frame would leak test-set information into the scaler
# that is later persisted. `df` and `x` are left in their original units.
x_train = x_train.copy()  # explicit copies so the column assignments below
x_test = x_test.copy()    # never write through to `x`
x_train[num_col_af] = scal.fit_transform(x_train[num_col_af])
x_test[num_col_af] = scal.transform(x_test[num_col_af])


# Fit the XGBoost classifier on the training split with fixed hyperparameters.
xgb_params = {
    "n_estimators": 100,
    "max_depth": 7,
    "learning_rate": 0.1,
    "subsample": 0.9,
    "colsample_bytree": 0.8,
}
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(x_train, y_train)

# Persist, in this order: the fitted model, the scaler, and the ordered list
# of training column names.
artifacts = (
    (xgb_model, "xgb_model.pkl"),
    (scal, "scaler.pkl"),
    (list(x_train.columns), "feature_names.pkl"),
)
for artifact, target_path in artifacts:
    joblib.dump(artifact, target_path)

print("✅ Model, Scaler & Feature Names Saved!")



✅ Model, Scaler & Feature Names Saved!
