In [None]:
# Hyperparameter search for an XGBoost churn classifier.
# Loads the data, one-hot encodes the categorical columns, splits/scales,
# runs a randomized search scored by ROC AUC and evaluates the best model
# on the held-out test set.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier


df = pd.read_csv('insurance_churn_data (1).csv')
categorical_cols = ['gender', 'region', 'marital_status', 'policy_type']
# The list must be passed as `columns=`: the second positional parameter of
# get_dummies is `prefix`, so a positional list only relabels prefixes.
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
x = df_encoded.drop('churn', axis=1)
y = df_encoded['churn']

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only, then reuse it for the test split.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Candidate values sampled by the randomized search.
param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3],
    'reg_alpha': [0, 0.01, 0.1],
    'reg_lambda': [1, 1.5, 2]
}

xgb_model = XGBClassifier(random_state=42, eval_metric='logloss')

random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=50,             # number of random combinations
    scoring='roc_auc',     # maximize AUC
    cv=3,                  # 3-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1
)

random_search.fit(x_train_scaled, y_train)

# best_score_ is the mean cross-validated score of the best parameter set.
print("Best Hyperparameters:", random_search.best_params_)
print("Best AUC:", random_search.best_score_)

# Evaluate the refitted best estimator on the untouched test split.
best_xgb = random_search.best_estimator_
y_pred_best = best_xgb.predict(x_test_scaled)
y_proba_best = best_xgb.predict_proba(x_test_scaled)[:, 1]

print("Test Accuracy:", accuracy_score(y_test, y_pred_best))
print("Test AUC:", roc_auc_score(y_test, y_proba_best))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_best))
print("\nClassification Report:\n", classification_report(y_test, y_pred_best))



In [None]:
# XGBoost churn model with hand-picked hyperparameters, evaluated on the test split.
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier


df = pd.read_csv('insurance_churn_data (1).csv')
categorical_cols = ['gender', 'region', 'marital_status', 'policy_type']
# Use `columns=`; passed positionally the list would be taken as `prefix`.
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
x = df_encoded.drop('churn', axis=1)
y = df_encoded['churn']

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Scaler statistics come from the training split only.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Train XGBoost model
xgb = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric="logloss"
)

xgb.fit(x_train_scaled, y_train)

# Predictions: hard labels and the probability of the positive class (column 1)
xgb_pred = xgb.predict(x_test_scaled)
xgb_proba = xgb.predict_proba(x_test_scaled)[:, 1]

# Evaluation
xgb_auc = roc_auc_score(y_test, xgb_proba)
xgb_acc = accuracy_score(y_test, xgb_pred)
xgb_cm = confusion_matrix(y_test, xgb_pred)

print("AUC:", xgb_auc)
print("Accuracy:", xgb_acc)
print("Confusion Matrix:\n", xgb_cm)
print("\nClassification Report:\n", classification_report(y_test, xgb_pred))


In [None]:
# Random Forest feature importance: fit the forest and rank/plot the features.
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


df = pd.read_csv('insurance_churn_data (1).csv')
categorical_cols = ['gender', 'region', 'marital_status', 'policy_type']
# Use `columns=`; passed positionally the list would be taken as `prefix`.
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
x = df_encoded.drop('churn', axis=1)
y = df_encoded['churn']

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)


rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    random_state=42
)

rf_model.fit(x_train_scaled, y_train)
y_pred_rf = rf_model.predict(x_test_scaled)
y_pred_proba_rf = rf_model.predict_proba(x_test_scaled)[:, 1]

# Feature importance: the scaled array keeps the column order of `x`,
# so x.columns lines up with feature_importances_.
feature_importances = pd.DataFrame({
    'Feature': x.columns,
    'Importance': rf_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

print(feature_importances)

# Plot top 10 features
feature_importances.head(10).plot(kind='barh', x='Feature', y='Importance', legend=False, color='skyblue')
plt.title("Top 10 Important Features for Churn")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.show()

In [None]:
# Evaluation of RF: accuracy, confusion matrix, classification report and ROC curve.
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


df = pd.read_csv('insurance_churn_data (1).csv')
categorical_cols = ['gender', 'region', 'marital_status', 'policy_type']
# Use `columns=`; passed positionally the list would be taken as `prefix`.
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
x = df_encoded.drop('churn', axis=1)
y = df_encoded['churn']

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)


rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    random_state=42
)

rf_model.fit(x_train_scaled, y_train)
y_pred_rf = rf_model.predict(x_test_scaled)
y_pred_proba_rf = rf_model.predict_proba(x_test_scaled)[:, 1]


# Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred_rf))

# Confusion Matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

# Classification Report
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))

# ROC Curve & AUC (both computed from the positive-class probabilities)
auc_rf = roc_auc_score(y_test, y_pred_proba_rf)
fpr, tpr, _ = roc_curve(y_test, y_pred_proba_rf)

plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], linestyle='--')  # diagonal reference line
plt.title(f"Random Forest ROC Curve (AUC = {auc_rf:.2f})")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()

In [None]:
# Random Forest churn model: train and print the predicted churn probabilities.
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier   # random forest used to test the model again
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


df = pd.read_csv('insurance_churn_data (1).csv')
categorical_cols = ['gender', 'region', 'marital_status', 'policy_type']
# Use `columns=`; passed positionally the list would be taken as `prefix`.
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
x = df_encoded.drop('churn', axis=1)
y = df_encoded['churn']

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)


rf_model = RandomForestClassifier(
    n_estimators=100,       # number of decision trees in the forest
    max_depth=None,         # no depth limit on the trees
    random_state=42
)

# train the model
rf_model.fit(x_train_scaled, y_train)

# make predictions
y_pred_rf = rf_model.predict(x_test_scaled)
y_pred_proba_rf = rf_model.predict_proba(x_test_scaled)[:, 1]  # churn probability

print(y_pred_proba_rf)

In [None]:
# Logistic regression churn model: ROC curve and AUC on the test split.
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


df = pd.read_csv('insurance_churn_data (1).csv')
categorical_cols = ['gender', 'region', 'marital_status', 'policy_type']
# Use `columns=`; passed positionally the list would be taken as `prefix`.
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
x = df_encoded.drop('churn', axis=1)
y = df_encoded['churn']

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

model = LogisticRegression(max_iter=1000)
model.fit(x_train_scaled, y_train)

y_pred = model.predict(x_test_scaled)
y_pred_prob = model.predict_proba(x_test_scaled)[:, 1]

# The ROC curve plots the true positive rate against the false positive rate
# across decision thresholds; AUC summarises it in a single number.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
auc = roc_auc_score(y_test, y_pred_prob)

plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], linestyle='--')  # diagonal reference line
plt.title(f"ROC Curve (AUC = {auc:.2f})")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()




In [None]:
# Logistic regression churn model: classification report on the test split.
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


df = pd.read_csv('insurance_churn_data (1).csv')
categorical_cols = ['gender', 'region', 'marital_status', 'policy_type']
# Use `columns=`; passed positionally the list would be taken as `prefix`.
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
x = df_encoded.drop('churn', axis=1)
y = df_encoded['churn']

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

model = LogisticRegression(max_iter=1000)
model.fit(x_train_scaled, y_train)

y_pred = model.predict(x_test_scaled)
y_pred_prob = model.predict_proba(x_test_scaled)[:, 1]

# The classification report summarises how the model did on the test set:
# - precision: of all customers predicted to churn, how many actually churned?
# - recall (sensitivity): of all customers who actually churned, how many did
#   the model catch? Treated here as the most important metric.
# - F1 score: a balance between precision and recall; useful when both false
#   alarms and misses matter.
# - support: the number of true samples of each class in y_test.
print(classification_report(y_test, y_pred))

In [None]:
# Logistic regression churn model: confusion matrix on the test split.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

df = pd.read_csv('insurance_churn_data (1).csv')
categorical_cols = ['gender', 'region', 'marital_status', 'policy_type']
# Use `columns=`; passed positionally the list would be taken as `prefix`.
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
x = df_encoded.drop('churn', axis=1)
y = df_encoded['churn']

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

model = LogisticRegression(max_iter=1000)
model.fit(x_train_scaled, y_train)

y_pred = model.predict(x_test_scaled)
y_pred_prob = model.predict_proba(x_test_scaled)[:, 1]

# Rows are true labels, columns are predicted labels. Assuming churn is coded
# 0/1, the layout is [[TN, FP], [FN, TP]]; false negatives (missed churners)
# are the errors we most want to reduce.
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
# Logistic regression churn model: print the predicted churn probabilities.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

df = pd.read_csv('insurance_churn_data (1).csv')
categorical_cols = ['gender', 'region', 'marital_status', 'policy_type']
# Use `columns=`; passed positionally the list would be taken as `prefix`.
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
x = df_encoded.drop('churn', axis=1)
y = df_encoded['churn']

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

model = LogisticRegression(max_iter=1000)
model.fit(x_train_scaled, y_train)


# Make predictions
y_pred = model.predict(x_test_scaled)
# Column 1 of predict_proba is the probability (0-1) of the second class in
# model.classes_, i.e. churn when the target is coded 0/1.
y_pred_prob = model.predict_proba(x_test_scaled)[:, 1]
print(y_pred_prob)



In [None]:
# Logistic regression churn model: inspect the learned coefficients per feature.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

df = pd.read_csv('insurance_churn_data (1).csv')
categorical_cols = ['gender', 'region', 'marital_status', 'policy_type']
# Use `columns=`; passed positionally the list would be taken as `prefix`,
# which can mislabel the dummy columns shown in the coefficient table.
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
x = df_encoded.drop('churn', axis=1)
y = df_encoded['churn']

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

model = LogisticRegression(max_iter=1000)
model.fit(x_train_scaled, y_train)

y_pred = model.predict(x_test_scaled)

# Interpretation of the model using the logistic regression coefficients
coeffs = pd.DataFrame({
    'Feature': x.columns,            # all input feature columns, in training order
    'Coefficient': model.coef_[0]    # one learned weight per (standardised) feature
})

# Sorted from most positive to most negative; features with a coefficient
# close to 0 have very little impact on the prediction.
coeffs.sort_values(by='Coefficient', ascending=False)




In [None]:
# Logistic regression churn model: accuracy, confusion matrix and classification report.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

df = pd.read_csv('insurance_churn_data (1).csv')
categorical_cols = ['gender', 'region', 'marital_status', 'policy_type']
# Use `columns=`; passed positionally the list would be taken as `prefix`.
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
x = df_encoded.drop('churn', axis=1)
y = df_encoded['churn']

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

model = LogisticRegression(max_iter=1000)
model.fit(x_train_scaled, y_train)

y_pred = model.predict(x_test_scaled)

# Evaluate how good the model is. All sklearn metrics take (y_true, y_pred)
# in that order, so the same order is used for every call.
print('accuracy:', accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

In [None]:
# Logistic regression churn model: fit and predict labels for the test split.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('insurance_churn_data (1).csv')
categorical_cols = ['gender', 'region', 'marital_status', 'policy_type']
# Use `columns=`; passed positionally the list would be taken as `prefix`.
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
x = df_encoded.drop('churn', axis=1)
y = df_encoded['churn']

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)


scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

model = LogisticRegression(max_iter=1000)
model.fit(x_train_scaled, y_train)

y_pred = model.predict(x_test_scaled)  # predicted class labels for the test split


In [None]:
# Logistic regression churn model: prepare the data and fit the model.
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('insurance_churn_data (1).csv')
categorical_cols = ['gender', 'region', 'marital_status', 'policy_type']
# Use `columns=`; passed positionally the list would be taken as `prefix`.
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
x = df_encoded.drop('churn', axis=1)
y = df_encoded['churn']

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)


scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)


# Logistic regression estimates the probability that an observation belongs
# to a particular class.
# max_iter=1000 caps the number of solver iterations allowed to converge.
model = LogisticRegression(max_iter=1000)
# .fit() learns the model from x_train_scaled (input features) and y_train (target).
model.fit(x_train_scaled, y_train)

In [None]:
# Encode, split and standardise the features.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('insurance_churn_data (1).csv')
categorical_cols = ['gender', 'region', 'marital_status', 'policy_type']
# Use `columns=`; passed positionally the list would be taken as `prefix`.
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
x = df_encoded.drop('churn', axis=1)
y = df_encoded['churn']

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Numeric features (e.g. policy_tenure) may vary widely in range; scaling
# helps models converge faster. The scaler is fitted on the training split
# only and then applied unchanged to the test split.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)



In [None]:
# Encode the data and split it into training and testing sets.
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('insurance_churn_data (1).csv')
categorical_cols = ['gender', 'region', 'marital_status', 'policy_type']
# Use `columns=`; passed positionally the list would be taken as `prefix`.
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
x = df_encoded.drop('churn', axis=1)
y = df_encoded['churn']


# train_test_split splits the dataset into a training set and a testing set.
# test_size=0.2 means 20% of the data is used for testing and 80% for training.
# random_state=42 makes the split reproducible: running it again gives the same split.
# x_train / x_test = features for training (80%) / testing (20%)
# y_train / y_test = target values for training / testing
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)



In [None]:
# Encode the categorical columns and separate the features from the target.
import pandas as pd

df = pd.read_csv('insurance_churn_data (1).csv')
categorical_cols = ['gender', 'region', 'marital_status', 'policy_type']
# Use `columns=`; passed positionally the list would be taken as `prefix`.
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Split into Features and Target
# x: every column except 'churn' - everything used to predict the churn.
x = df_encoded.drop('churn', axis=1)
# y: the target variable - the actual churn the model learns to predict from x.
y = df_encoded['churn']



In [None]:
# One-hot encode the categorical columns.
import pandas as pd

df = pd.read_csv('insurance_churn_data (1).csv')
# Machine learning models only understand numbers, so the categories are
# converted into numeric indicator columns.
categorical_cols = ['gender', 'region', 'marital_status', 'policy_type']
# One-hot encoding of the listed columns into a new DataFrame; drop_first=True
# drops the first level of each column. The list is passed as `columns=`
# because the second positional parameter of get_dummies is `prefix`.
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
df_encoded.head()

In [None]:
# Bar plots of churn against satisfaction score and complaints filed.
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt


df = pd.read_csv('insurance_churn_data (1).csv')

# One figure per feature, in order: Satisfaction vs Churn, then Complaints vs Churn.
for feature in ('satisfaction_score', 'complaints_filed'):
    sns.barplot(x=feature, y='churn', data=df)
    plt.show()


In [None]:
# Mean of the churn column for each policy type.
import pandas as pd

df = pd.read_csv('insurance_churn_data (1).csv')
df['churn'].groupby(df['policy_type']).mean()


In [None]:
# Mean of the churn column for each policy tenure value.
import pandas as pd

df = pd.read_csv('insurance_churn_data (1).csv')
df['churn'].groupby(df['policy_tenure']).mean()

In [None]:
# Mean of the churn column for each number of complaints filed.
import pandas as pd

df = pd.read_csv('insurance_churn_data (1).csv')
df['churn'].groupby(df['complaints_filed']).mean()


In [None]:
# Mean of the churn column for each satisfaction score.
import pandas as pd

df = pd.read_csv('insurance_churn_data (1).csv')
# Group the rows by satisfaction_score and average 'churn' within each group.
df.groupby('satisfaction_score')['churn'].agg('mean')

In [None]:
# List the column labels of the dataset.
import pandas as pd

df = pd.read_csv(filepath_or_buffer='insurance_churn_data (1).csv')
df.columns

In [None]:
# Bar chart of how many customers fall into each churn value.
import matplotlib.pyplot as plt
import pandas as pd

df = pd.read_csv('insurance_churn_data (1).csv')

# Count the rows per churn value, then draw the counts as bars.
churn_counts = df['churn'].value_counts()
ax = churn_counts.plot(kind='bar', color=['skyblue', 'orange'])
ax.set_title('Churn Distribution')
ax.set_xlabel('churn')
ax.set_ylabel('Number of customers')
plt.show()

In [None]:
# Cast the churn column to integers.
import pandas as pd

df = pd.read_csv('insurance_churn_data (1).csv')

churn_as_int = df['churn'].astype(int)
df['churn'] = churn_as_int

In [None]:
# Churn class balance: absolute counts and percentages side by side.
import pandas as pd

df = pd.read_csv('insurance_churn_data (1).csv')

# Only the last expression of a cell is displayed, so both views are combined
# into one table instead of leaving the counts as a discarded expression.
churn_counts = df['churn'].value_counts()  # number of customers per churn value
# normalize=True returns proportions; multiplying by 100 turns them into percentages.
churn_percent = df['churn'].value_counts(normalize=True) * 100
pd.DataFrame({'count': churn_counts, 'percent': churn_percent})

In [None]:
# Count the missing values in each column.
import pandas as pd

df = pd.read_csv('insurance_churn_data (1).csv')

# Flag the missing cells, then sum the flags column by column.
df.isna().sum()

In [None]:
# Show the size of the dataset as (rows, columns).
import pandas as pd

df = pd.read_csv('insurance_churn_data (1).csv')
(len(df.index), len(df.columns))

In [None]:
# First look at the data: a preview of the rows and a statistical summary.
import pandas as pd

df = pd.read_csv('insurance_churn_data (1).csv')
# Only the last expression of a cell is rendered automatically, so the preview
# is shown explicitly. `display` is provided by the IPython/Jupyter kernel.
display(df.head())   # first 5 rows
df.describe()        # statistical summary of the numeric columns

In [None]:
# Core libraries used throughout the notebook.
import pandas as pd
# The plotting API lives in matplotlib.pyplot; binding the top-level package
# to `plt` would leave plt.plot / plt.show undefined.
import matplotlib.pyplot as plt
import numpy as np