In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib

# Step 1: Data Preprocessing
# Load the raw customer table from the Excel workbook.
data = pd.read_excel('customer_churn_large_dataset.xlsx')

# Handle missing data: drop every row that contains at least one missing value.
data = data.dropna()

# Handle outliers: keep only rows whose 'Monthly_Bill' lies within
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] (both bounds inclusive).
Q1 = data['Monthly_Bill'].quantile(0.25)
Q3 = data['Monthly_Bill'].quantile(0.75)
IQR = Q3 - Q1
data = data[data['Monthly_Bill'].between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)]

# Encode categorical variables: one-hot encode 'Gender' and 'Location',
# dropping the first level of each to avoid a redundant column.
data = pd.get_dummies(data, columns=['Gender', 'Location'], drop_first=True)

# Split the data into training and testing sets (80% / 20%, fixed seed).
X = data.drop('Churn', axis=1)
y = data['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Drop the 'CustomerID' and 'Name' columns: they identify a customer and are
# not used as features for prediction.
X_train = X_train.drop(columns=['CustomerID', 'Name'])
X_test = X_test.drop(columns=['CustomerID', 'Name'])

# Step 2: Feature Scaling
# Fit the scaler on the training split only, then apply the same fitted
# transformation to the test split so no test statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 3: Model Building (Random Forest)
# Baseline forest with default hyper-parameters, kept for comparison.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Step 4: Model Optimization (Random Forest) - Example
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# 81 candidates x 5 folds = 405 fits; n_jobs=-1 spreads them over all cores.
# The fixed random_state keeps the selected model the same as a serial run.
grid_search_rf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid_rf,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search_rf.fit(X_train_scaled, y_train)
best_rf_model = grid_search_rf.best_estimator_

# Step 5: Model Building (Logistic Regression)
# max_iter is raised above the default so the solver is not stopped before it
# converges; random_state makes the stochastic solvers reproducible.
logistic_model = LogisticRegression(max_iter=1000, random_state=42)
logistic_model.fit(X_train_scaled, y_train)

# Step 6: Model Optimization (Logistic Regression) - Example
param_grid_logistic = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs', 'saga'],
}

# Same estimator settings as the baseline above, so every grid candidate is
# seeded ('saga' and 'liblinear' use randomness) and given room to converge.
grid_search_logistic = GridSearchCV(
    LogisticRegression(max_iter=1000, random_state=42),
    param_grid_logistic,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search_logistic.fit(X_train_scaled, y_train)
best_logistic_model = grid_search_logistic.best_estimator_

# Step 7: Model Evaluation
def evaluate_model(model, X, y):
    """Score a fitted classifier on one labelled data set.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix passed to ``model.predict``.
        y: True labels compared against the predictions.

    Returns:
        A 4-tuple ``(accuracy, precision, recall, f1)``.
    """
    predicted = model.predict(X)
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(y, predicted) for scorer in scorers)

# Evaluate Random Forest Model on the held-out, scaled test split
rf_accuracy, rf_precision, rf_recall, rf_f1 = evaluate_model(best_rf_model, X_test_scaled, y_test)

# Evaluate Logistic Regression Model on the same split
logistic_accuracy, logistic_precision, logistic_recall, logistic_f1 = evaluate_model(best_logistic_model, X_test_scaled, y_test)

# Print evaluation metrics for both models (two decimals each)
for report_title, report_scores in (
    ("Random Forest Model Metrics:", (rf_accuracy, rf_precision, rf_recall, rf_f1)),
    (
        "\nLogistic Regression Model Metrics:",
        (logistic_accuracy, logistic_precision, logistic_recall, logistic_f1),
    ),
):
    print(report_title)
    for metric_label, metric_value in zip(
        ("Accuracy", "Precision", "Recall", "F1-Score"), report_scores
    ):
        print(f"{metric_label}: {metric_value:.2f}")

# Step 8: Save the Best Model
# Save the tuned Random Forest model to a file. To ship the Logistic
# Regression model instead, dump best_logistic_model to the same path.
joblib.dump(best_rf_model, 'churn_model.pkl')

# The model was fitted on standardized features, so the fitted scaler is part
# of the deployable artifact: without it, raw inputs cannot be transformed the
# way the model expects. Persist it next to the model.
joblib.dump(scaler, 'churn_scaler.pkl')


In [None]:
# Load the model in a production-like environment (simulated deployment)
# NOTE: joblib files are pickles and can execute code on load; only load
# model files that come from a trusted source.
loaded_model = joblib.load('churn_model.pkl')


# Make predictions on new data
new_data = pd.DataFrame({
    'Age': [30],
    'Subscription_Length_Months': [12],
    'Monthly_Bill': [50],
    'Total_Usage_GB': [100],
    'Gender_Male': [0],
    'Location_Houston': [0],  # Include all one-hot encoded location columns with appropriate values
    'Location_Los Angeles': [0],
    'Location_Miami': [0],
    'Location_New York': [0],  # Set the relevant location to 1 if needed
})

# The model was trained on standardized features, so new rows must go through
# the same fitted scaler before prediction. Selecting X_train.columns puts the
# features in the training order and raises KeyError if one is missing.
new_data_scaled = scaler.transform(new_data[X_train.columns])
prediction = loaded_model.predict(new_data_scaled)

# Print the churn prediction
if prediction[0] == 1:
    print("This customer is likely to churn.")
else:
    print("This customer is likely to stay.")


# Evaluate the loaded model's performance on the test set, using the scaled
# test features (the same representation the model was trained on).
y_pred = loaded_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print evaluation metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

This customer is likely to stay.
Accuracy: 0.50
Precision: 0.49
Recall: 0.47
F1-Score: 0.48
