In [1]:
# Install the necessary library
import subprocess
import sys

def install(package):
    """Install *package* with pip, using the interpreter running this code.

    Runs ``<sys.executable> -m pip install <package>`` as a subprocess so the
    package lands in the same environment as the current interpreter.

    Args:
        package: Requirement string handed to ``pip install``.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

# Install fpdf only when the module is not already importable, so that
# re-running the notebook does not invoke pip (and require network access)
# every time.
import importlib.util

if importlib.util.find_spec("fpdf") is None:
    install("fpdf")

# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib
from fpdf import FPDF

# Create project directories: one sub-folder per kind of artefact.
# exist_ok=True makes this a no-op for folders that are already present.
for subfolder in ('data', 'reports', 'models', 'visualizations'):
    os.makedirs(f'project_18_churn_analysis/{subfolder}', exist_ok=True)

# 1. Simulate churn data
# Seed the global NumPy RNG so every run produces the same synthetic table.
np.random.seed(42)
n_clients = 1000

# Columns are drawn one after another from the seeded RNG; the order of the
# draws below determines the generated values, so it must not be changed.
# Note that every column (including the Churn label) is sampled independently
# of the others.
simulated_columns = {}
simulated_columns['CustomerID'] = range(1, n_clients + 1)
# randint's upper bound is exclusive: Tenure takes values 1..23.
simulated_columns['Tenure'] = np.random.randint(1, 24, size=n_clients)
simulated_columns['MonthlyCharges'] = np.random.uniform(10, 100, size=n_clients)
simulated_columns['TotalCharges'] = np.random.uniform(50, 2000, size=n_clients)
simulated_columns['SupportInteractions'] = np.random.randint(0, 10, size=n_clients)
simulated_columns['PaymentHistoryIssues'] = np.random.randint(0, 5, size=n_clients)
# Binary label: 0 with probability 0.8, 1 with probability 0.2.
simulated_columns['Churn'] = np.random.choice([0, 1], size=n_clients, p=[0.8, 0.2])

data = pd.DataFrame(simulated_columns)

# Save data as CSV (the DataFrame's row index is not written to the file)
churn_csv_path = 'project_18_churn_analysis/data/churn_analysis_data.csv'
data.to_csv(churn_csv_path, index=False)

# 2. Split data into training and testing sets
# Features are every column except the identifier and the target label.
X = data.drop(columns=['CustomerID', 'Churn'])
y = data['Churn']
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# 3. Train RandomForest model
# 100 trees, with a fixed seed so the fitted forest is reproducible.
forest_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)

# Save the trained model
model_path = 'project_18_churn_analysis/models/churn_model.pkl'
joblib.dump(model, model_path)

# 4. Make predictions and evaluate the model
# Hard class predictions for the held-out rows.
y_pred = model.predict(X_test)
# Column 1 of predict_proba is used as the ranking score for the ROC metrics
# (presumably the probability of label 1, i.e. churn - this relies on the
# model's class ordering).
class_probabilities = model.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]

# output_dict=True returns the report as a dict rather than formatted text,
# which lets it be turned into a DataFrame later.
classification_rep = classification_report(y_test, y_pred, output_dict=True)
conf_matrix = confusion_matrix(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
# The third value returned by roc_curve (the thresholds) is not needed.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)

# 5. Save classification report as CSV
# Transposed so each report entry becomes a row; the index holding the entry
# names is written to the file as well.
classification_df = pd.DataFrame(classification_rep).T
report_csv_path = 'project_18_churn_analysis/reports/classification_report.csv'
classification_df.to_csv(report_csv_path, index=True)

# 6. Generate visualizations
# Confusion Matrix: annotated heatmap of integer counts, saved as a PNG.
class_labels = ['No Churn', 'Churn']
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=cm_ax,
)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
cm_ax.set_title('Confusion Matrix - Churn Prediction')
cm_fig.tight_layout()
cm_fig.savefig('project_18_churn_analysis/visualizations/confusion_matrix.png')
# Close the figure once it has been written to disk.
plt.close(cm_fig)

# ROC Curve: model curve plus the diagonal reference line, saved as a PNG.
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, color='blue', label='ROC Curve (area = %0.2f)' % roc_auc)
# Dashed red diagonal from (0, 0) to (1, 1) for visual comparison.
roc_ax.plot([0, 1], [0, 1], color='red', linestyle='--')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve')
roc_ax.legend(loc='lower right')
roc_fig.tight_layout()
roc_fig.savefig('project_18_churn_analysis/visualizations/roc_curve.png')
# Close the figure once it has been written to disk.
plt.close(roc_fig)

# 7. Create README.md
# The README text is assembled section by section and concatenated without
# any separator; every section carries its own trailing newlines.
readme_sections = [
    # Title
    "# Project 18: Churn Analysis in a Subscription-Based Company\n\n",
    # Overview
    (
        "## Project Overview\n"
        "This project simulates a churn analysis for a subscription-based service company. The goal is to identify factors "
        "leading to customer churn and develop predictive models to forecast potential churn, enabling the company to take proactive retention measures.\n\n"
    ),
    # Objectives
    (
        "## Objectives\n"
        "- Data collection and preprocessing to simulate real-world churn data.\n"
        "- Exploratory Data Analysis (EDA) to find patterns related to churn.\n"
        "- Development of machine learning models to predict churn.\n"
        "- Visualization of churn trends, customer segmentation, and risk factors.\n\n"
    ),
    # Methods
    (
        "## Methods\n"
        "- Data Cleaning: Handling missing values and outliers.\n"
        "- Exploratory Data Analysis (EDA): Understanding customer behavior and churn patterns.\n"
        "- Machine Learning Models: Random Forest was used for churn prediction.\n"
        "- Visualizations: Confusion matrix, ROC curve, and classification report for model evaluation.\n\n"
    ),
    # Usage instructions
    (
        "## How to Run\n"
        "1. Install the required packages:\n"
        "   ```\n"
        "   pip install pandas numpy scikit-learn matplotlib seaborn joblib fpdf\n"
        "   ```\n"
        "2. Run the Jupyter Notebook `churn_analysis.ipynb` to follow the full workflow.\n"
        "3. Alternatively, execute the Python script `churn_analysis.py` for a complete run of the project.\n\n"
    ),
    # Results
    (
        "## Results\n"
        "The Random Forest model achieved satisfactory performance in predicting churn, identifying key factors contributing to churn risk.\n\n"
    ),
    # Folder layout
    (
        "## Folders\n"
        "- **data**: Contains the dataset used for analysis.\n"
        "- **reports**: Contains the analysis report and visualizations.\n"
        "- **models**: Stores the trained model.\n"
        "- **visualizations**: Contains images of the visualizations generated.\n"
    ),
]
readme_content = "".join(readme_sections)
# Write the README text to the project root, replacing any existing file.
readme_path = 'project_18_churn_analysis/README.md'
with open(readme_path, 'w') as readme_file:
    readme_file.write(readme_content)

# 8. Create PDF Report
# Builds a one-page summary with three sections (introduction, methodology,
# results) and writes it to the reports folder.
pdf = FPDF()
pdf.add_page()

# Every heading cell uses width 0, which makes the cell extend to the right
# margin. A fixed width of 200 starting at the left margin does not fit the
# printable width of the default page (A4 with 10 mm margins, per the PyFPDF
# documentation), which pushed the centred title off-centre.

# Introduction
pdf.set_font("Arial", style='B', size=14)
pdf.cell(0, 10, 'Churn Analysis in a Subscription-Based Company', ln=True, align='C')
pdf.set_font("Arial", size=12)
pdf.ln(10)
pdf.multi_cell(0, 10, "This report presents an analysis of customer churn in a subscription-based company...")
pdf.ln(5)

# Methodology
pdf.set_font("Arial", style='B', size=14)
pdf.cell(0, 10, 'Methodology', ln=True)
pdf.set_font("Arial", size=12)
pdf.multi_cell(0, 10, "The analysis includes data preprocessing, exploratory data analysis, and model training...")
# Same vertical gap as after the introduction, so sections are evenly spaced.
pdf.ln(5)

# Results and Conclusions
pdf.set_font("Arial", style='B', size=14)
pdf.cell(0, 10, 'Results and Conclusions', ln=True)
pdf.set_font("Arial", size=12)
# The ROC-AUC computed during evaluation is reported to two decimal places.
pdf.multi_cell(0, 10, f"Model achieved an ROC-AUC score of {roc_auc:.2f}...")

pdf.output('project_18_churn_analysis/reports/churn_analysis_report.pdf')


''

In [2]:
import shutil
from google.colab import files

# Archive only the project folder into 18.zip. Zipping "." itself would sweep
# in every other file in the working directory, and can also pick up the
# 18.zip file that is being written there.
shutil.make_archive(
    "18",
    'zip',
    root_dir=".",
    base_dir="project_18_churn_analysis",
)

# Trigger a browser download of the archive from the Colab runtime.
files.download("18.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>