# CUSTOMER CHURN PREDICTION
## Develop a model to predict customer churn for a subscription-based service or business. Use historical customer data, including features like usage behavior and customer demographics, and try algorithms like Logistic Regression, Random Forests, or Gradient Boosting to predict churn.

In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE

# Name of the binary label column to predict.
target_variable = 'Exited'

# Load the customer churn dataset
churn_data = pd.read_csv("C:\\Users\\Jenisha Rebello\\Downloads\\Churn_Modelling.csv")

# Explore the initial records of the dataset
print("Initial records of the dataset:")
print(churn_data.head())

# Data Preprocessing
# 'RowNumber', 'CustomerId' and 'Surname' are row/customer identifiers rather
# than model features, so they are removed in a single step.
churn_data = churn_data.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)

# Convert categorical variables to numerical using one-hot encoding.
# drop_first=True drops one dummy per category to avoid redundant columns.
churn_data = pd.get_dummies(churn_data, columns=['Geography', 'Gender'], drop_first=True)

# Split the data into features (X) and the target variable (y)
X = churn_data.drop(target_variable, axis=1)
y = churn_data[target_variable]

# Split BEFORE any resampling so that the test set contains only real,
# untouched customers. Oversampling the full dataset first would place
# synthetic points (interpolated from training rows) into the test set and
# inflate the reported metrics. stratify=y keeps the original class ratio in
# both partitions.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Handle imbalanced data using Synthetic Minority Over-sampling Technique
# (SMOTE), applied to the training partition only.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_raw, y_train_raw)

# Keep the names used for model fitting: X_train / y_train are the balanced
# training set from here on.
X_train, y_train = X_resampled, y_resampled

# Standardize features; the scaler is fitted on training data only and then
# applied unchanged to the test data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Gradient Boosting Classifier tuned with an exhaustive grid search
# (3 x 3 x 3 parameter combinations, 5-fold cross-validation each).
gradient_boost_model = GradientBoostingClassifier(random_state=42)
gradient_boost_params = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
}
grid_search = GridSearchCV(
    estimator=gradient_boost_model,
    param_grid=gradient_boost_params,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train_scaled, y_train)
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test_scaled)

# Display results for the best model, evaluated on the untouched test set.
# The matrix and the report are multi-line, so each is printed on its own
# line below its label to keep the columns aligned.
print("Best Gradient Boosting Model:")
print("Best Parameters:", grid_search.best_params_)
print("Accuracy:", accuracy_score(y_test, predictions))
print("Confusion Matrix:", confusion_matrix(y_test, predictions), sep="\n")
print("Classification Report:", classification_report(y_test, predictions), sep="\n")


Initial records of the dataset:
   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1  
3         