In [27]:
!pip install xgboost

import pandas as pd

# Raw data GitHub link
url = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"

# Load the dataset
df = pd.read_csv(url)

# Preview the data
df.head()

# Check shape
print("Dataset shape:", df.shape)

# Check for missing values
print("\nMissing values per column:")
print(df.isnull().sum())

# See data types and basic info
df.info()

# How many customers churned vs stayed
print("\nChurn distribution:")
print(df['Churn'].value_counts())

# Remove spaces and convert to numeric
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Check if there are now any missing values
print("Missing values in TotalCharges:", df['TotalCharges'].isnull().sum())

# Drop rows with missing TotalCharges
df.dropna(subset=['TotalCharges'], inplace=True)

# Confirm new shape
print("New dataset shape:", df.shape)

# Drop customerID — it's just an identifier
df.drop('customerID', axis=1, inplace=True)

# Convert target 'Churn' to binary: Yes → 1, No → 0
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

# Use one-hot encoding for all categorical features
df_encoded = pd.get_dummies(df, drop_first=True)

# Show shape after encoding
print("Shape after encoding:", df_encoded.shape)

from sklearn.model_selection import train_test_split

# Split features and target
X = df_encoded.drop('Churn', axis=1)
y = df_encoded['Churn']

# Train-test split (stratified to preserve churn distribution)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training shape:", X_train.shape)
print("Testing shape:", X_test.shape)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Initialize and train the model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Evaluation
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_prob))


from sklearn.ensemble import RandomForestClassifier

# Initialize and train model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict
y_pred_rf = rf_model.predict(X_test)
y_prob_rf = rf_model.predict_proba(X_test)[:, 1]

# Evaluate
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))
print("ROC-AUC Score:", roc_auc_score(y_test, y_prob_rf))

import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Create XGBoost classifier
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Fit the model
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred_xgb = xgb_model.predict(X_test)
y_prob_xgb = xgb_model.predict_proba(X_test)[:, 1]

# Evaluate
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))
print("\nClassification Report:\n", classification_report(y_test, y_pred_xgb))
print("ROC-AUC Score:", roc_auc_score(y_test, y_prob_xgb))


readme_text = """
# Customer Churn Prediction

This project uses machine learning to predict whether a customer is likely to churn based on their account details and service usage. The goal is to help businesses retain customers by identifying churn risk early.

---

## Dataset
- Source: [IBM Telco Customer Churn Dataset](https://www.kaggle.com/datasets/blastchar/telco-customer-churn)
- Size: 7,032 rows × 21 columns
- Target: `Churn` (Yes/No)

---

##  Objective
To build and compare classification models that can predict customer churn with good accuracy, recall, and AUC score.

---

## Tools & Libraries
- Python
- pandas, numpy
- matplotlib, seaborn
- scikit-learn
- xgboost

---

## Models Tested

| Model                | Accuracy | Recall (Churn) | F1 (Churn) | ROC-AUC |
|---------------------|----------|----------------|------------|---------|
| Logistic Regression | 80%      | **57%**        | **61%**    | **0.836** ✅ |
| Random Forest        | 79%      | 52%            | 57%        | 0.816   |
| XGBoost              | 77%      | 52%            | 54%        | 0.814   |

 **Logistic Regression** performed best overall.

---

## Results Summary
- Churn rate: ~26%
- Most important features: `Contract`, `Tenure`, `MonthlyCharges`, `PaymentMethod`
- Final model achieves **balanced recall and precision**, with strong ROC-AUC

---

## Key Skills Demonstrated
- Data cleaning and feature engineering
- Classification modeling
- Handling imbalanced datasets
- Evaluation metrics (Confusion Matrix, ROC-AUC, F1)
- Business application of data science

---

## How to Run

```bash
pip install -r requirements.txt
## Author

Vanessa Chinhengo
"""

# Save it as a file
with open("README.md", "w") as f:
    f.write(readme_text)

# Download it
from google.colab import files
files.download("README.md")

#create requirements.txt
requirements = """

"""

#save it as a file
with open("requirements.txt", "w") as f:
    f.write("pandas\nnumpy\nscikit-learn\nmatplotlib\nseaborn\nxgboost")

# Download the file
from google.colab import files
files.download("requirements.txt")

Dataset shape: (7043, 21)

Missing values per column:
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-nul

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[[916 117]
 [160 214]]
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1033
           1       0.65      0.57      0.61       374

    accuracy                           0.80      1407
   macro avg       0.75      0.73      0.74      1407
weighted avg       0.80      0.80      0.80      1407

ROC-AUC Score: 0.8363897790040947
Confusion Matrix:
 [[917 116]
 [180 194]]

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.89      0.86      1033
           1       0.63      0.52      0.57       374

    accuracy                           0.79      1407
   macro avg       0.73      0.70      0.71      1407
weighted avg       0.78      0.79      0.78      1407

ROC-AUC Score: 0.8164903116927489


Parameters: { "use_label_encoder" } are not used.



Confusion Matrix:
 [[886 147]
 [181 193]]

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.86      0.84      1033
           1       0.57      0.52      0.54       374

    accuracy                           0.77      1407
   macro avg       0.70      0.69      0.69      1407
weighted avg       0.76      0.77      0.76      1407

ROC-AUC Score: 0.8141801823255044


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>