<a href="https://colab.research.google.com/github/MohamadHusseinIsmail/Data-science-portfolio/blob/main/churn_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

In [None]:
# Location of the raw churn CSV (Colab's default upload directory).
DATA_PATH = '/content/Churn_Modelling.csv'

# Load the raw customer table; later cells derive the cleaned frame from it.
churn_data = pd.read_csv(DATA_PATH)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Identifier-like columns that are removed before modelling.
ID_COLUMNS = ['RowNumber', 'CustomerId', 'Surname']

# Build the cleaned frame in one chain:
#   1. drop the identifier columns,
#   2. replace Gender with integer codes from LabelEncoder,
#   3. one-hot encode Geography, dropping the first level.
label_encoder = LabelEncoder()
churn_data_cleaned = (
    churn_data
    .drop(columns=ID_COLUMNS)
    .assign(Gender=lambda frame: label_encoder.fit_transform(frame['Gender']))
    .pipe(pd.get_dummies, columns=['Geography'], drop_first=True)
)

# Target is the Exited column; every other column is a feature.
y = churn_data_cleaned['Exited']
X = churn_data_cleaned.drop(columns='Exited')

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply them
# to both splits. After this, X_train / X_test are NumPy arrays.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score

# Three baseline classifiers, all seeded identically.
logistic_model = LogisticRegression(random_state=42)
random_forest_model = RandomForestClassifier(random_state=42)
gradient_boosting_model = GradientBoostingClassifier(random_state=42)

# scikit-learn's fit() returns the estimator itself, so training and
# test-set prediction can be chained per model.
logistic_preds = logistic_model.fit(X_train, y_train).predict(X_test)
rf_preds = random_forest_model.fit(X_train, y_train).predict(X_test)
gb_preds = gradient_boosting_model.fit(X_train, y_train).predict(X_test)

In [None]:
# Print test-set accuracy and the per-class report for each baseline model,
# in the order: logistic regression, random forest, gradient boosting.
for _heading, _preds in (
    ("Logistic Regression Accuracy:", logistic_preds),
    ("Random Forest Accuracy:", rf_preds),
    ("Gradient Boosting Accuracy:", gb_preds),
):
    print(_heading, accuracy_score(y_test, _preds))
    print("Classification Report:\n", classification_report(y_test, _preds))
    print()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Apply the seaborn grid theme before any axes are created.
sns.set(style="whitegrid")

# Bar chart of how many customers fall in each value of Exited.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=churn_data_cleaned, x='Exited', palette='viridis', ax=ax)
ax.set_title("Churn Distribution")
ax.set_xlabel("Churn (Exited)")
ax.set_ylabel("Count")
# Replace the numeric tick labels with readable names.
ax.set_xticks([0, 1])
ax.set_xticklabels(['Retained', 'Churned'])
plt.show()


In [None]:
# Train the Random Forest model to extract feature importance
from sklearn.ensemble import RandomForestClassifier

# Fit a fresh, seeded random forest on the current training split.
random_forest_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Pair each importance with its feature name (taken from X's columns) and
# order from most to least important.
importance_values = random_forest_model.feature_importances_
feature_importances = (
    pd.Series(importance_values, index=X.columns)
    .sort_values(ascending=False)
)

# Horizontal bar chart, one bar per feature.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=feature_importances,
    y=feature_importances.index,
    palette='viridis',
    ax=ax,
)
ax.set(
    title="Feature Importance - Random Forest",
    xlabel="Importance",
    ylabel="Feature",
)
plt.show()



In [None]:
# Stacked histogram of Age, split by the Exited flag (30 bins).
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=churn_data_cleaned,
    x='Age',
    hue='Exited',
    multiple="stack",
    palette="viridis",
    bins=30,
    ax=ax,
)
ax.set(title="Churn by Age", xlabel="Age", ylabel="Count")
plt.show()


In [None]:
# Stacked histogram of Balance, split by the Exited flag (30 bins).
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=churn_data_cleaned,
    x='Balance',
    hue='Exited',
    multiple="stack",
    palette="viridis",
    bins=30,
    ax=ax,
)
ax.set(title="Churn by Balance", xlabel="Balance", ylabel="Count")
plt.show()



In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Re-create the 80/20 split from the unscaled feature frame X.
# NOTE: this rebinds X_train / X_test, replacing the standardised arrays
# produced by the earlier scaling cell with unscaled data.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Oversample the training split only; the test split is left untouched.
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)


In [None]:
# Example engineered feature: Tenure multiplied by the IsActiveMember flag.
# NOTE: X was built from churn_data_cleaned in an earlier cell, so this new
# column is not part of X or of any split derived from it.
churn_data_cleaned['LoyaltyScore'] = churn_data_cleaned['Tenure'].mul(
    churn_data_cleaned['IsActiveMember']
)


In [None]:
# Pairwise correlations across every column currently in the cleaned frame.
correlations = churn_data_cleaned.corr()

# Annotated heatmap with two-decimal cell labels.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlations, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Feature Correlation Matrix")
plt.show()


In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
# Aliased so it does not shadow sklearn's Pipeline, which is imported
# under the plain name elsewhere in this notebook.
from imblearn.pipeline import Pipeline as ImbPipeline

# --- XGBoost baseline -------------------------------------------------------
# Train on the SMOTE-balanced training split and score on the untouched test
# split.
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train_sm, y_train_sm)

# Make predictions and report them (previously computed but never evaluated).
xgb_preds = xgb_model.predict(X_test)
print("XGBoost Accuracy:", accuracy_score(y_test, xgb_preds))

# --- Random Forest hyper-parameter search -----------------------------------
# Keys carry the 'model__' prefix because the forest is the 'model' step of
# the pipeline built below.
param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [10, 20],
    'model__min_samples_split': [5, 10]
}

# SMOTE lives inside the pipeline so that, for every CV split, oversampling is
# fitted on the training folds only. Searching over data that was oversampled
# *before* splitting lets synthetic points interpolated from validation-fold
# rows leak into the training folds, which inflates the cross-validated score.
rf_model = RandomForestClassifier(random_state=42)
rf_search_pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('model', rf_model),
])
grid_search = GridSearchCV(
    estimator=rf_search_pipeline,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
)

# Fit on the original (un-resampled) training split; the pipeline applies
# SMOTE during fitting and skips it at prediction time.
grid_search.fit(X_train, y_train)

# Best parameters and mean cross-validated accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)



In [None]:
from sklearn.metrics import roc_auc_score, roc_curve

# Positive-class probabilities from the tuned estimator, and their AUC.
best_rf = grid_search.best_estimator_
rf_probs = best_rf.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, rf_probs)

# ROC curve points (thresholds are not needed for the plot).
fpr, tpr, _ = roc_curve(y_test, rf_probs)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color="darkorange", label=f"ROC Curve (AUC = {roc_auc:.2f})")
# Diagonal reference line for a no-skill classifier.
ax.plot([0, 1], [0, 1], color="navy", linestyle="--")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC-AUC Curve")
ax.legend()
plt.show()




In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# KMeans uses Euclidean distance, so features must be on a comparable scale;
# otherwise columns with large numeric ranges (presumably Balance and similar
# monetary columns — verify against the data) dominate the cluster assignment.
X_scaled_for_clustering = StandardScaler().fit_transform(X)

# Apply KMeans clustering. n_init is pinned explicitly because its default
# differs between scikit-learn versions.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
churn_data_cleaned['Cluster'] = kmeans.fit_predict(X_scaled_for_clustering)

# Plot clusters by age and balance for visualization (original units are kept
# on the axes; only the clustering input was standardised).
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Age', y='Balance', hue='Cluster', data=churn_data_cleaned, palette="viridis")
plt.title("Customer Segmentation by Age and Balance")
plt.xlabel("Age")
plt.ylabel("Balance")
plt.legend(title="Cluster")
plt.show()


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Two-step pipeline: standardise the features, then fit a seeded random forest.
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(random_state=42)),
]
pipeline = Pipeline(pipeline_steps)

# Fit on the training split and predict the held-out split in one chain
# (fit() returns the fitted pipeline).
pipeline_preds = pipeline.fit(X_train, y_train).predict(X_test)

# Report test-set accuracy.
print("Accuracy:", accuracy_score(y_test, pipeline_preds))
