In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import joblib  # For saving the model

# Step 2: Load the dataset
# Point this path at your own copy of the churn dataset if it lives elsewhere
data = pd.read_csv('/content/Churn_Modelling.csv')

# Step 3: Basic data inspection
print("Dataset Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
data.info()

print("\nSummary Statistics:")
print(data.describe())

print("\nSample Data:")
print(data.head())

# List the column names so the target column can be identified
print("\nColumn Names:")
print(data.columns)

# Step 4: Impute missing values
# Numeric (float64/int64) columns: replace gaps with each column's median
numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns
column_medians = data[numeric_cols].median()
data[numeric_cols] = data[numeric_cols].fillna(column_medians)

# Categorical (object-dtype) columns: replace gaps with the most frequent value
categorical_cols = data.select_dtypes(include=['object']).columns
for name in categorical_cols:
    most_frequent = data[name].mode()[0]
    data[name] = data[name].fillna(most_frequent)

# Step 5: Data preprocessing
# Replace every object-dtype column with integer codes. One encoder instance
# is refit per column, so afterwards it only holds the classes of the last
# column it processed.
label_encoder = LabelEncoder()
object_columns = data.select_dtypes(include=['object']).columns
for name in object_columns:
    encoded_values = label_encoder.fit_transform(data[name])
    data[name] = encoded_values

# Verify that the target column exists.
# Churn_Modelling.csv stores the churn label in the 'Exited' column (there is
# no 'Churn' column), so that is the name used here.
target_column = 'Exited'  # Adjust this if your dataset names the label differently

# Identifier-like columns (row counter, customer id, surname) are excluded
# from the features so the model does not fit on per-row identifiers.
identifier_columns = ['RowNumber', 'CustomerId', 'Surname']

if target_column not in data.columns:
    print(f"Error: Column '{target_column}' not found. Available columns are: {data.columns}")
else:
    # Separate features (X) and target (y); only drop identifier columns that
    # are actually present so other datasets still work.
    columns_to_drop = [target_column] + [c for c in identifier_columns if c in data.columns]
    X = data.drop(columns=columns_to_drop)
    y = data[target_column]

    # Split data into training and testing sets. Stratifying on y keeps the
    # class proportions the same in both splits.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Standardize features: fit on the training split only, then apply the
    # same transformation to the test split to avoid leaking test statistics.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Step 6: Train a model
    # Using RandomForestClassifier as an example
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Step 7: Evaluate the model
    # Predict and calculate metrics
    y_pred = model.predict(X_test)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

    # ROC-AUC Score (uses the predicted probability of the positive class)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    print(f"ROC-AUC Score: {roc_auc}")

    # Plot ROC Curve
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # chance-level diagonal
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend()
    plt.show()

    # Step 8: Feature importance
    # X.columns is in the same order as the columns of the scaled arrays the
    # model was trained on, so names and importances line up.
    importances = model.feature_importances_
    feature_names = X.columns
    plt.figure(figsize=(10, 6))
    sns.barplot(x=importances, y=feature_names)
    plt.title("Feature Importance")
    plt.show()

    # Step 9: Hyperparameter Tuning (Optional)
    # Uncomment to run GridSearch for RandomForest hyperparameters
    # param_grid = {
    #     'n_estimators': [100, 200, 300],
    #     'max_depth': [None, 10, 20, 30],
    #     'min_samples_split': [2, 5, 10]
    # }
    # grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, scoring='roc_auc', n_jobs=-1)
    # grid_search.fit(X_train, y_train)
    # best_model = grid_search.best_estimator_
    # print("Best Parameters:", grid_search.best_params_)

    # Step 10: Save the model
    # NOTE: the model was trained on standardized inputs; anything passed to
    # the reloaded model must first go through the same fitted `scaler`,
    # which is not persisted here.
    joblib.dump(model, 'churn_model.pkl')
    print("\nModel saved as churn_model.pkl")

    # Step 11: Load the saved model and make predictions (example usage)
    loaded_model = joblib.load('churn_model.pkl')
    # X_test is already scaled, so its first row can be fed to the model as-is
    sample_data = X_test[0].reshape(1, -1)  # Replace with your own sample data
    sample_prediction = loaded_model.predict(sample_data)
    print("\nSample Prediction:", sample_prediction)


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB
None

Summary Statistics:
         RowNumber    CustomerId   CreditScore 