# Complete Data Science Project

## Learning Objectives
By the end of this lesson, you will be able to:
- Execute a complete data science project
- Apply all skills learned: data cleaning, analysis, visualization
- Build a simple prediction model
- Present findings and recommendations
- Follow a structured project workflow

## Core Concepts
- **Project Workflow**: Problem → Data → Analysis → Model → Insights
- **Business Problem**: Real-world question to solve with data
- **Model**: Algorithm that makes predictions
- **Validation**: Checking if our model works well
- **Recommendations**: Actionable steps based on findings

## 1. Project Setup and Data Preparation

In [None]:
# PROJECT: Predicting Customer Churn (Will customers leave?)
# Business goal: Identify customers likely to cancel so we can keep them

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

print("🎯 PROJECT GOAL: Predict which customers will leave")
print("📊 BUSINESS VALUE: Save money by keeping customers")

# Create realistic customer data.
# A fixed seed makes every run draw the same synthetic customers.
np.random.seed(42)
n_customers = 1000

# NOTE: the columns below are drawn from the global NumPy random stream in
# the order they are listed; reordering them changes which random numbers
# each column receives and therefore the whole dataset.
customers = pd.DataFrame({
    'customer_id': range(1, n_customers + 1),
    # randint's upper bound is exclusive, so tenure is 1-47 months
    'tenure_months': np.random.randint(1, 48, n_customers),
    # Normal draws, clipped so no charge falls outside the stated range
    'monthly_charges': np.random.normal(65, 20, n_customers).clip(20, 150),
    'total_charges': np.random.normal(1500, 800, n_customers).clip(100, 5000),
    # p=... gives the share of customers in each category
    'contract_type': np.random.choice(['Month-to-month', 'One year', 'Two year'], n_customers, p=[0.5, 0.3, 0.2]),
    'payment_method': np.random.choice(['Credit card', 'Bank transfer', 'Electronic check'], n_customers),
    'internet_service': np.random.choice(['DSL', 'Fiber', 'No'], n_customers, p=[0.4, 0.5, 0.1]),
    'tech_support': np.random.choice(['Yes', 'No'], n_customers, p=[0.3, 0.7]),
    'senior_citizen': np.random.choice([0, 1], n_customers, p=[0.8, 0.2]),
    # 0-9 support calls per month (upper bound exclusive)
    'monthly_calls': np.random.randint(0, 10, n_customers),
})

# Create realistic churn based on business logic
def calculate_churn(row):
    """Return the simulated churn probability for one customer record.

    Starts from a 10% baseline and adds a fixed increment for every risk
    signal found in ``row``; the total is capped at 0.8.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing
            'tenure_months', 'contract_type', 'monthly_charges',
            'tech_support' and 'monthly_calls'.

    Returns:
        float: The churn probability, between 0.1 and 0.8.
    """
    months = row['tenure_months']
    is_brand_new = months < 6

    # (signal present?, probability added) -- applied from top to bottom.
    # The two tenure entries are mutually exclusive: under 6 months, or
    # 6 to under 12 months.
    risk_signals = (
        (is_brand_new, 0.4),
        (not is_brand_new and months < 12, 0.2),
        (row['contract_type'] == 'Month-to-month', 0.3),
        (row['monthly_charges'] > 80, 0.15),
        (row['tech_support'] == 'No', 0.1),
        (row['monthly_calls'] > 5, 0.15),
    )

    probability = 0.1  # baseline before any signal is considered
    for present, increment in risk_signals:
        if present:
            probability += increment

    # Never report more than an 80% chance of churn
    return min(probability, 0.8)

# Attach a churn probability to every customer, then draw the actual outcome
# (1 = churned, 0 = stayed) as a single yes/no trial with that probability.
customers['churn_probability'] = customers.apply(calculate_churn, axis=1)
customers['churned'] = np.random.binomial(1, customers['churn_probability'], n_customers)

# Columns that are not model inputs: the identifier, the simulated
# probability, and the target we want to predict.
non_feature_columns = ['customer_id', 'churn_probability', 'churned']

print("\n📈 DATASET CREATED:")
print(f"Customers: {len(customers):,}")
print(f"Churn rate: {customers['churned'].mean():.1%}")
print(f"Features: {customers.shape[1] - len(non_feature_columns)}")  # Exclude ID, probability and target

# Initial data exploration
print("\n🔍 FIRST LOOK AT DATA:")
print(customers.head())

print(f"\nMissing values: {customers.isnull().sum().sum()}")
print(f"Data types: {customers.dtypes.value_counts().to_dict()}")

# Quick churn analysis: the mean of the 0/1 'churned' column within a group
# is that group's churn rate.
print("\n📊 CHURN BY KEY FACTORS:")
churn_by_contract = customers.groupby('contract_type')['churned'].mean()
print("Churn by contract:")
for contract, rate in churn_by_contract.items():
    print(f"  {contract}: {rate:.1%}")

churn_by_support = customers.groupby('tech_support')['churned'].mean()
print("\nChurn by tech support:")
for support, rate in churn_by_support.items():
    print(f"  {support}: {rate:.1%}")

# Visualize key patterns in a 2x2 grid of charts
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# Churn by tenure: split tenure into five equal-width bands and plot the
# churn rate of each band.
tenure_bins = pd.cut(customers['tenure_months'], bins=5)
# observed=False is stated explicitly so every band is kept regardless of
# the pandas version's default for categorical groupers.
churn_by_tenure = customers.groupby(tenure_bins, observed=False)['churned'].mean()
churn_by_tenure.plot(kind='bar', ax=axes[0,0], color='orange')
axes[0,0].set_title('Churn Rate by Tenure')
axes[0,0].set_xlabel('Tenure (months)')
axes[0,0].tick_params(axis='x', rotation=45)

# Charges distribution: side-by-side histograms for customers who stayed
# and customers who churned.
churned = customers[customers['churned'] == 1]['monthly_charges']
stayed = customers[customers['churned'] == 0]['monthly_charges']
axes[0,1].hist([stayed, churned], bins=20, alpha=0.7, label=['Stayed', 'Churned'], color=['green', 'red'])
axes[0,1].set_title('Monthly Charges Distribution')
axes[0,1].legend()

# Contract type impact
contract_churn = customers.groupby('contract_type')['churned'].mean()
axes[1,0].bar(contract_churn.index, contract_churn.values, color='lightblue')
axes[1,0].set_title('Churn Rate by Contract Type')
axes[1,0].tick_params(axis='x', rotation=45)

# Support calls vs churn (each point is one customer; y is 0 or 1)
axes[1,1].scatter(customers['monthly_calls'], customers['churned'], alpha=0.6)
axes[1,1].set_title('Support Calls vs Churn')
axes[1,1].set_xlabel('Monthly Support Calls')
axes[1,1].set_ylabel('Churned (1=Yes, 0=No)')

plt.tight_layout()
plt.show()

print("✅ Data preparation complete - patterns are emerging!")

## 2. Building and Testing Our Model

In [None]:
# Prepare data for machine learning model
print("🤖 BUILDING PREDICTION MODEL")

# Work on a copy so the encoded helper columns do not end up in `customers`
model_data = customers.copy()

# Convert text to numbers (models need numbers): each new column is 1 when
# the customer has that value and 0 otherwise.
# 'Two year' needs no column of its own: it is the case where both contract
# flags are 0.
model_data['contract_month_to_month'] = (model_data['contract_type'] == 'Month-to-month').astype(int)
model_data['contract_one_year'] = (model_data['contract_type'] == 'One year').astype(int)
model_data['payment_electronic'] = (model_data['payment_method'] == 'Electronic check').astype(int)
model_data['internet_fiber'] = (model_data['internet_service'] == 'Fiber').astype(int)
model_data['has_tech_support'] = (model_data['tech_support'] == 'Yes').astype(int)

# Select features for our model
features = [
    'tenure_months', 'monthly_charges', 'total_charges',
    'contract_month_to_month', 'contract_one_year',
    'payment_electronic', 'internet_fiber', 'has_tech_support',
    'senior_citizen', 'monthly_calls'
]

X = model_data[features]  # Input features
y = model_data['churned']  # What we want to predict

print(f"Features used: {len(features)}")
# X is the whole dataset here; the training subset is reported after the split
print(f"Total samples: {len(X)}")

# Split data: train the model on 80% of customers, test on the other 20%.
# random_state fixes the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set: {len(X_train)} customers")
print(f"Test set: {len(X_test)} customers")

# Build the model (Random Forest - good for beginners)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Test the model on customers it has not seen during training
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

print("\n🎯 MODEL PERFORMANCE:")
print(f"Accuracy: {accuracy:.1%}")
print("(How often our model is correct)")

# Detailed performance (precision / recall per class)
print("\nDetailed results:")
print(classification_report(y_test, predictions, target_names=['Will Stay', 'Will Churn']))

# Feature importance - what matters most?
# Pair each feature name with the model's importance score, largest first.
feature_importance = pd.DataFrame({
    'feature': features,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("\n🔑 MOST IMPORTANT FACTORS:")
for factor in feature_importance.head(5).itertuples(index=False):
    print(f"{factor.feature}: {factor.importance:.3f}")

# Visualize model performance in three side-by-side charts
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Feature importance (horizontal bars, most important at the top)
top_features = feature_importance.head(8)
axes[0].barh(range(len(top_features)), top_features['importance'], color='skyblue')
axes[0].set_yticks(range(len(top_features)))
axes[0].set_yticklabels(top_features['feature'])
axes[0].set_title('Most Important Factors')
axes[0].invert_yaxis()

# Prediction accuracy: rows are what really happened, columns are what the
# model predicted.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, predictions)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[1],
            xticklabels=['Will Stay', 'Will Churn'],
            yticklabels=['Actually Stayed', 'Actually Churned'])
axes[1].set_title('Model Accuracy')

# Risk distribution
risk_labels = ['Low Risk', 'Medium Risk', 'High Risk']
prediction_probs = model.predict_proba(X_test)[:, 1]  # Probability of churn
# include_lowest=True puts a probability of exactly 0.0 into 'Low Risk';
# without it the first bin excludes its left edge and those customers
# would be left uncategorised and missing from the chart.
risk_levels = pd.cut(prediction_probs, bins=[0, 0.3, 0.7, 1.0],
                     labels=risk_labels, include_lowest=True)
# Fix the order to Low/Medium/High so each slice gets its intended colour
risk_counts = risk_levels.value_counts().reindex(risk_labels, fill_value=0)
axes[2].pie(risk_counts.values, labels=risk_labels, autopct='%1.1f%%', colors=['green', 'orange', 'red'])
axes[2].set_title('Customer Risk Distribution')

plt.tight_layout()
plt.show()

# Test on some example customers (the first five of the test set, or fewer
# if the test set is smaller)
print("\n🔮 EXAMPLE PREDICTIONS:")
sample_customers = X_test.head(5)
sample_predictions = model.predict(sample_customers)
sample_probs = model.predict_proba(sample_customers)[:, 1]  # Probability of churn

# Iterate over the predictions themselves rather than a fixed range(5), so
# a short test set cannot cause an out-of-range index.
for number, (predicted, risk) in enumerate(zip(sample_predictions, sample_probs), start=1):
    print(f"Customer {number}: {predicted} ({'Will Churn' if predicted else 'Will Stay'}) - {risk:.1%} risk")

print("✅ Model built and tested - ready for business insights!")

## 3. Business Insights and Recommendations

In [None]:
# Apply model to all customers for business insights
print("? BUSINESS INSIGHTS AND ACTION PLAN")

# Get risk scores for all customers.
# NOTE: X also contains the customers the model was trained on, so their
# scores are more confident than scores for genuinely new customers would be.
all_predictions = model.predict_proba(X)[:, 1]
customers['churn_risk_score'] = all_predictions

# Create risk categories.
# include_lowest=True puts a score of exactly 0.0 into 'Low Risk'; without
# it the first bin excludes its left edge and those customers would get no
# category and silently drop out of every segment below.
customers['risk_category'] = pd.cut(customers['churn_risk_score'],
                                   bins=[0, 0.3, 0.7, 1.0],
                                   labels=['Low Risk', 'Medium Risk', 'High Risk'],
                                   include_lowest=True)

# Analyze customer segments: size, average bill, average tenure and average
# risk score per category. observed=False keeps a row for every category,
# even an empty one, regardless of the pandas version's default.
risk_summary = customers.groupby('risk_category', observed=False).agg({
    'customer_id': 'count',
    'monthly_charges': 'mean',
    'tenure_months': 'mean',
    'churn_risk_score': 'mean'
}).round(2)

print("📊 CUSTOMER RISK SEGMENTS:")
print(risk_summary)

# Focus on high-risk customers
high_risk_customers = customers[customers['risk_category'] == 'High Risk']
print(f"\n🚨 HIGH-RISK CUSTOMERS: {len(high_risk_customers)} customers")
print(f"Potential monthly revenue loss: ${high_risk_customers['monthly_charges'].sum():,.0f}")

# Analyze what makes customers high-risk
print("\nHigh-risk customer profile:")
print(f"Average tenure: {high_risk_customers['tenure_months'].mean():.1f} months")
print(f"Average monthly charge: ${high_risk_customers['monthly_charges'].mean():.0f}")
print(f"Month-to-month contracts: {(high_risk_customers['contract_type'] == 'Month-to-month').mean():.1%}")
print(f"No tech support: {(high_risk_customers['tech_support'] == 'No').mean():.1%}")

# Calculate business impact
total_customers = len(customers)
current_churn_rate = customers['churned'].mean()
# Twelve months of the high-risk group's monthly charges
potential_lost_revenue = high_risk_customers['monthly_charges'].sum() * 12  # Annual

print("\n? BUSINESS IMPACT:")
print(f"Total customers: {total_customers:,}")
print(f"Current churn rate: {current_churn_rate:.1%}")
print(f"High-risk customers: {len(high_risk_customers)} ({len(high_risk_customers)/total_customers:.1%})")
print(f"Potential annual revenue at risk: ${potential_lost_revenue:,.0f}")

# Specific recommendations for different risk groups, with the size of
# each group taken from the risk categories assigned to `customers`.
print("\n🎯 ACTION PLAN BY RISK LEVEL:")

print(f"\n1. HIGH RISK ({len(high_risk_customers)} customers):")
print("   • Immediate personal outreach")
print("   • Offer contract upgrades with discounts")
print("   • Provide free tech support")
print("   • Priority customer service")

medium_risk = customers[customers['risk_category'] == 'Medium Risk']
print(f"\n2. MEDIUM RISK ({len(medium_risk)} customers):")
print("   • Proactive engagement programs")
print("   • Service improvement initiatives")
print("   • Loyalty rewards program")

low_risk = customers[customers['risk_category'] == 'Low Risk']
print(f"\n3. LOW RISK ({len(low_risk)} customers):")
print("   • Referral programs")
print("   • Upselling opportunities")
print("   • Maintain current service level")

# Create business dashboard (2x2 grid of charts)
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# One fixed order for the segments so that green/orange/red always mean
# Low/Medium/High in every chart below.
risk_order = ['Low Risk', 'Medium Risk', 'High Risk']
colors = ['green', 'orange', 'red']

# Risk distribution.
# value_counts() sorts by frequency, so the result is re-ordered to
# risk_order; otherwise the largest segment would always be drawn green.
risk_counts = customers['risk_category'].value_counts().reindex(risk_order, fill_value=0)
axes[0,0].pie(risk_counts.values, labels=risk_order, autopct='%1.1f%%', colors=colors)
axes[0,0].set_title('Customer Risk Distribution')

# Revenue at risk by segment.
# observed=False plus the reindex guarantee exactly one bar per segment, in
# risk_order, even when a segment has no customers.
revenue_by_risk = (
    customers.groupby('risk_category', observed=False)['monthly_charges']
    .sum()
    .reindex(risk_order, fill_value=0)
)
axes[0,1].bar(risk_order, revenue_by_risk.values, color=colors)
axes[0,1].set_title('Monthly Revenue by Risk Level')
axes[0,1].set_ylabel('Monthly Revenue ($)')

# Tenure vs risk score (each point is one customer)
axes[1,0].scatter(customers['tenure_months'], customers['churn_risk_score'], alpha=0.6)
axes[1,0].set_xlabel('Tenure (months)')
axes[1,0].set_ylabel('Churn Risk Score')
axes[1,0].set_title('Tenure vs Churn Risk')

# Top factors importance (from our model), most important at the top
top_5_features = feature_importance.head(5)
axes[1,1].barh(range(len(top_5_features)), top_5_features['importance'], color='lightblue')
axes[1,1].set_yticks(range(len(top_5_features)))
axes[1,1].set_yticklabels(top_5_features['feature'])
axes[1,1].set_title('Top 5 Churn Factors')
axes[1,1].invert_yaxis()

plt.tight_layout()
plt.show()

# Next steps
print("\n📋 IMPLEMENTATION TIMELINE:")
print("Week 1: Contact all high-risk customers")
print("Week 2: Launch retention campaigns")
print("Week 3: Monitor early results")
print("Month 2: Expand to medium-risk customers")
print("Month 3: Evaluate program success")

# Wrap-up: headline numbers computed earlier in the project (test-set
# accuracy, size of the high-risk group, annual revenue of that group).
print("\n🏆 PROJECT COMPLETE!")
print(f"✅ Built a {accuracy:.1%} accurate churn prediction model")
print(f"✅ Identified {len(high_risk_customers)} high-risk customers")
print("✅ Created actionable business recommendations")
print(f"✅ Potential to save ${potential_lost_revenue:,.0f} in annual revenue")

# Simple practice exercises
print("\n\n📚 PRACTICE EXERCISES:")
print("1. Try changing the risk thresholds (0.3, 0.7) to see how it affects segments")
print("2. Add new features like 'years_as_customer' and retrain the model")
print("3. Calculate ROI if retention campaigns cost $50 per customer")
print("4. Create a monthly monitoring dashboard")
print("5. Build a similar model for a different business problem")