## 1. Import Libraries

Import all necessary libraries for data analysis, visualization, and modeling.

In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve

# Fix NumPy's global random seed so that repeated runs give the same results
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Apply seaborn's 'whitegrid' theme to subsequent plots
sns.set_style('whitegrid')
%matplotlib inline

## 2. Load Data

Load the customer churn dataset from the raw data folder.

In [None]:
# Read the raw customer churn CSV into a DataFrame
raw_data_path = '../data/raw/dataset1.csv'
df = pd.read_csv(raw_data_path)

# Report the (rows, columns) shape, then preview the first few rows
print(f"Dataset shape: {df.shape}")
df.head()

## 3. Exploratory Data Analysis (EDA)

### 3.1 Basic Information

Examine the structure and basic statistics of the dataset.

In [None]:
# Display data info
# DataFrame.info() writes its summary (columns, non-null counts, dtypes)
# directly to stdout and returns None, so it is called on its own here;
# wrapping it in print() would emit a stray "None" after the summary.
print("Dataset Info:")
df.info()

# describe() returns a DataFrame of summary statistics, so it does need print()
print("\nBasic Statistics:")
print(df.describe())

### 3.2 Missing Values

Check for missing values in the dataset.

In [None]:
# TODO: Check for missing values and visualize them
# Hint: Use df.isnull().sum() and consider creating a visualization

### 3.3 Target Variable Distribution

Analyze the distribution of the target variable (churn).

In [None]:
# TODO: Analyze the churn distribution
# - Check value counts and percentages
# - Create a visualization (bar plot or pie chart)

### 3.4 Feature Exploration

Create visualizations to explore relationships between features and churn.

In [None]:
# TODO: Create exploratory visualizations
# Ideas:
# - Distribution of numerical features (histograms, box plots)
# - Churn rate by categorical features (contract type, payment method, etc.)
# - Correlation heatmap for numerical features
# - Scatter plots or pair plots for key features

# Example: Churn rate by contract type
# churn_by_contract = df.groupby('contract_type')['churn'].mean()
# plt.figure(figsize=(10, 6))
# churn_by_contract.plot(kind='bar')
# plt.title('Churn Rate by Contract Type')
# plt.ylabel('Churn Rate')
# plt.show()

## 4. Data Cleaning and Preprocessing

### 4.1 Handle Missing Values

Decide how to handle missing values. Options include:
- Drop rows with missing values (if few)
- Impute with mean/median for numerical features
- Impute with mode for categorical features
- Use more advanced imputation techniques

In [None]:
# TODO: Handle missing values
# Your preprocessing logic here

### 4.2 Handle Duplicates

Check for and remove duplicate rows.

In [None]:
# TODO: Check for duplicates and remove if necessary
# Hint: df.duplicated().sum() and df.drop_duplicates()

### 4.3 Handle Categorical Variables

Notice that some categorical variables have inconsistent values (e.g., 'Male' vs 'male').
Also, you need to encode categorical variables for modeling.

In [None]:
# TODO: Standardize categorical values and encode them
# Ideas:
# - Convert to lowercase or standardize values
# - Use Label Encoding or One-Hot Encoding
# - Consider which encoding method is appropriate for each feature

### 4.4 Handle Outliers

Identify and decide how to handle outliers in numerical features.

In [None]:
# TODO: Identify outliers (box plots, z-scores, IQR method)
# Decide whether to remove, cap, or transform them

### 4.5 Feature Scaling

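Consider scaling numerical features, especially if using distance-based or gradient-based algorithms (e.g., k-NN, SVM, logistic regression); tree-based models generally do not need it. To avoid data leakage, fit the scaler on the training split only (created in Section 5) and reuse that fitted scaler to transform the test split.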

In [None]:
# TODO: Scale numerical features if needed
# Hint: StandardScaler (already imported above), or MinMaxScaler / RobustScaler (import them from sklearn.preprocessing first)

## 5. Train/Validation Split

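Split the data into training and held-out test sets. The held-out set serves as the validation set named in the heading; it is the `X_test`/`y_test` pair used for evaluation in Section 7.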

In [None]:
# TODO: Prepare features (X) and target (y)
# Remove non-predictive columns (e.g., customer_id)

# TODO: Split into train and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

## 6. Model Training

Train at least one classification model. Consider trying:
- Logistic Regression (baseline)
- Random Forest Classifier
- Gradient Boosting Classifier
- XGBoost (if installed)

In [None]:
# TODO: Import and train your chosen model(s)
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.linear_model import LogisticRegression

# Example:
# model = RandomForestClassifier(n_estimators=100, random_state=42)
# model.fit(X_train, y_train)

## 7. Model Evaluation

Evaluate your model using appropriate metrics for classification:
- Accuracy
- Precision, Recall, F1-Score
- Confusion Matrix
- ROC AUC Score and ROC Curve

In [None]:
# TODO: Make predictions on the test set
# y_pred = model.predict(X_test)
# y_pred_proba = model.predict_proba(X_test)[:, 1]  # For ROC curve

In [None]:
# TODO: Calculate and display evaluation metrics
# print(classification_report(y_test, y_pred))
# print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")

## 8. Visualizations

Create visualizations to communicate your findings and model performance.

### 8.1 EDA Visualization

Create at least one compelling visualization from your exploratory analysis.

In [None]:
# TODO: Create an EDA visualization
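# Save it to ../figures/dataset1/eda_plot.png
# (create the ../figures/dataset1/ folder first if it does not exist; plt.savefig does not create missing folders)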

# Example:
# fig, ax = plt.subplots(figsize=(12, 6))
# # Your visualization code here
# plt.tight_layout()
# plt.savefig('../figures/dataset1/eda_plot.png', dpi=300, bbox_inches='tight')
# plt.show()

### 8.2 Model Performance Visualization

Create at least one visualization showing model performance (e.g., confusion matrix, ROC curve, feature importance).

In [None]:
# TODO: Create a model performance visualization
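# Save it to ../figures/dataset1/model_performance.png
# (create the ../figures/dataset1/ folder first if it does not exist; plt.savefig does not create missing folders)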

# Ideas:
# - Confusion Matrix
# - ROC Curve
# - Feature Importance (for tree-based models)

# Example:
# fig, ax = plt.subplots(figsize=(10, 8))
# # Your visualization code here
# plt.tight_layout()
# plt.savefig('../figures/dataset1/model_performance.png', dpi=300, bbox_inches='tight')
# plt.show()

## 9. Summary and Next Steps

Write a brief summary of:
- Key findings from EDA
- Model performance
- Potential improvements or next steps

**Your Summary Here:**

- Key findings: ...
- Model performance: ...
- Next steps: ...