In [1]:
# 📌 Import Required Libraries
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Configuration shared by the steps below.
DATA_PATH = "data_for_predictions.csv"
TARGET_COLUMN = "churn"
TEST_SIZE = 0.2
RANDOM_STATE = 42

# 📂 Step 1: Check Current Directory for Data File
print("Files in current directory:")
print(os.listdir())

# 📥 Step 2: Load the Final Dataset
df = pd.read_csv(DATA_PATH)

# 🧹 Step 3: Drop Irrelevant Columns
# ID column is typically unique to each row and not useful for prediction.
# errors="ignore" makes this a no-op when the column is absent.
df = df.drop(columns="id", errors="ignore")

# 🎯 Step 4: Split Features and Target
X = df.drop(columns=TARGET_COLUMN)  # Features
y = df[TARGET_COLUMN]               # Target label

# 🔄 Step 5: Encode Categorical Variables (if any)
# One-hot encodes object/category columns; numeric columns pass through.
X = pd.get_dummies(X)

# ✂️ Step 6: Train-Test Split
# Stratify on the target so the train and test sets keep the same
# churn / non-churn proportions; without it a random split can leave the
# test set with a different positive rate, which distorts precision,
# recall and F1.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)

# 🌲 Step 7: Train Random Forest Classifier
# n_jobs=-1 fits the trees on all available cores; the fixed random_state
# keeps the fitted model reproducible.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
model.fit(X_train, y_train)

# 🔍 Step 8: Make Predictions
y_pred = model.predict(X_test)

# 📊 Step 9: Evaluate Model Performance
# zero_division=0 reports 0.0 (instead of warning) when a metric's
# denominator is zero, e.g. when the model predicts no positives.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

# 🧾 Step 10: Print Evaluation Metrics
print("\nModel Evaluation Metrics:")
print(f"Accuracy  : {accuracy:.4f}")
print(f"Precision : {precision:.4f}")
print(f"Recall    : {recall:.4f}")
print(f"F1 Score  : {f1:.4f}")

# 🧠 Metric Justification (move this into a Markdown cell if using Jupyter
# Notebook). Kept as comments rather than a bare string literal so the
# notebook does not echo it back as the cell's output.
#
# - Accuracy: Measures overall correct predictions.
# - Precision: Important if we want to reduce false positives
#   (predicting churn when they won't).
# - Recall: Important if we want to catch as many churners as possible.
# - F1 Score: A balanced measure of Precision and Recall, ideal when
#   classes are imbalanced.


Files in current directory:
['.ipynb_checkpoints', '.ipython', '.jupyter', '.matplotlib', '.ms-ad', '.vscode', '3D Objects', 'age.csv', 'AppData', 'Application Data', 'BigData.ipynb', 'clean_data_after_eda.csv', 'client_data (1).csv', 'Contacts', 'Cookies', 'data_for_predictions.csv', 'Desktop', 'Documents', 'Downloads', 'engineered_data.csv', 'Favorites', 'h.txt', 'heart.csv', 'heart_correlation_matrix.csv', 'heart_covariance_matrix.csc', 'heart_variance.csv', 'height-weight.csv', 'IntelGraphicsProfiles', 'iris_correlation.csv', 'iris_correlation_matrix.csv', 'iris_corvariance.csv', 'iris_covariance.csv', 'iris_covariance_matrix.csc', 'iris_variance.csv', 'Links', 'Local Settings', 'Microsoft', 'Music', 'My Documents', 'NetHood', 'NTUSER.DAT', 'ntuser.dat.LOG1', 'ntuser.dat.LOG2', 'NTUSER.DAT{53b39e88-18c4-11ea-a811-000d3aa4692b}.TM.blf', 'NTUSER.DAT{53b39e88-18c4-11ea-a811-000d3aa4692b}.TMContainer00000000000000000001.regtrans-ms', 'NTUSER.DAT{53b39e88-18c4-11ea-a811-000d3aa4692b}.TM

"\n- Accuracy: Measures overall correct predictions.\n- Precision: Important if we want to reduce false positives (predicting churn when they won't).\n- Recall: Important if we want to catch as many churners as possible.\n- F1 Score: A balanced measure of Precision and Recall, ideal when classes are imbalanced.\n"