<a href="https://colab.research.google.com/github/aqsabrekhna/Machine-learning-Training/blob/main/decision_tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

# Seed NumPy's global generator so the synthetic customers are reproducible.
np.random.seed(42)

n = 300

# Each draw below advances the shared global random state, so the statements
# must stay in this exact order to keep producing the same dataset.
usage_hours = np.random.normal(60, 20, n).clip(5, 120).round(1)
bill_pkr = np.random.normal(2500, 800, n).clip(500, 6000).round(0)
complaints = np.random.poisson(1.2, n).clip(0, 6)
tenure_months = np.random.randint(1, 60, n)
contract_type = np.random.choice(["Monthly", "Yearly"], n, p=[0.7, 0.3])

df = pd.DataFrame({
    "MonthlyUsageHours": usage_hours,
    "MonthlyBillPKR": bill_pkr,
    "Complaints": complaints,
    "TenureMonths": tenure_months,
    "ContractType": contract_type,
})

# Hidden churn rule: a linear score over the features, one named term each.
bill_term = 0.04 * (df["MonthlyBillPKR"] - 2500)
complaint_term = 0.8 * df["Complaints"]
tenure_term = 0.03 * df["TenureMonths"]
contract_term = 1.2 * (df["ContractType"] == "Monthly").astype(int)
usage_term = 0.02 * (df["MonthlyUsageHours"] - 60)

score = bill_term + complaint_term - tenure_term + contract_term - usage_term

# Squash the (down-scaled) score through a sigmoid to get a churn probability,
# then add noise by comparing it against a uniform draw per customer.
scaled_logit = -score / 10
prob = 1 / (1 + np.exp(scaled_logit))
uniform_draws = np.random.rand(n)
df["Churn"] = (uniform_draws < prob).astype(int)  # 1=Yes, 0=No

df.head()



Unnamed: 0,MonthlyUsageHours,MonthlyBillPKR,Complaints,TenureMonths,ContractType,Churn
0,69.9,1837.0,0,47,Monthly,0
1,57.2,2052.0,3,43,Monthly,0
2,73.0,3098.0,1,50,Yearly,1
3,90.5,2988.0,1,23,Monthly,1
4,55.3,2483.0,2,44,Monthly,0


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Target is the churn flag; every other column is a predictor.
y = df["Churn"]
X = df.drop(columns="Churn")

cat_cols = ["ContractType"]
num_cols = ["MonthlyUsageHours", "MonthlyBillPKR", "Complaints", "TenureMonths"]

# One-hot encode the categorical column (dropping the first level) and pass
# the numeric columns through untouched.
column_steps = [
    ("cat", OneHotEncoder(drop="first"), cat_cols),
    ("num", "passthrough", num_cols),
]
preprocess = ColumnTransformer(transformers=column_steps)

# Hold out 25% of the rows for testing, keeping the churn ratio in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)



In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Preprocessing followed by a depth-limited decision tree.
tree_steps = [
    ("prep", preprocess),
    ("clf", DecisionTreeClassifier(max_depth=4, random_state=42)),
]
tree_model = Pipeline(steps=tree_steps)

# fit() returns the fitted pipeline, so the prediction can be chained onto it.
pred_tree = tree_model.fit(X_train, y_train).predict(X_test)

# Report held-out performance: accuracy, confusion matrix, per-class metrics.
tree_accuracy = accuracy_score(y_test, pred_tree)
print("Decision Tree Accuracy:", tree_accuracy)
print(confusion_matrix(y_test, pred_tree))
print(classification_report(y_test, pred_tree))


Decision Tree Accuracy: 0.88
[[36  3]
 [ 6 30]]
              precision    recall  f1-score   support

           0       0.86      0.92      0.89        39
           1       0.91      0.83      0.87        36

    accuracy                           0.88        75
   macro avg       0.88      0.88      0.88        75
weighted avg       0.88      0.88      0.88        75



In [4]:
# Pull the fitted steps back out of the pipeline.
fitted_steps = tree_model.named_steps
feature_names = fitted_steps["prep"].get_feature_names_out()
clf = fitted_steps["clf"]

# Pair each transformed feature name with its importance, largest first.
imp = pd.Series(data=clf.feature_importances_, index=feature_names)
imp = imp.sort_values(ascending=False)
top_features = imp.head(8)
print("Top Important Features:\n", top_features)


Top Important Features:
 num__MonthlyBillPKR         0.857158
num__MonthlyUsageHours      0.092685
num__TenureMonths           0.026623
num__Complaints             0.020834
cat__ContractType_Yearly    0.002700
dtype: float64


In [5]:
from sklearn.tree import _tree

def explain_tree_decision(pipeline, single_row):
    """Print the split rules a fitted tree pipeline applied to one customer.

    Parameters
    ----------
    pipeline
        Fitted pipeline exposing ``named_steps`` with a ``"prep"`` transformer
        and a ``"clf"`` decision tree, plus ``predict`` and ``predict_proba``.
    single_row : pandas.DataFrame
        The customer to explain. Intended to hold exactly one row; if more
        rows are passed, the rules, prediction and probabilities reported are
        those of the first row only.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    prep = pipeline.named_steps["prep"]
    clf = pipeline.named_steps["clf"]
    X_trans = prep.transform(single_row)

    node_indicator = clf.decision_path(X_trans)
    leaf_id = clf.apply(X_trans)[0]

    # decision_path returns a CSR matrix with one row per sample. Slice out
    # row 0 through indptr so the printed path always belongs to the same row
    # as the values, leaf and prediction below (which are all taken from row 0).
    row_start = node_indicator.indptr[0]
    row_end = node_indicator.indptr[1]
    path_nodes = node_indicator.indices[row_start:row_end]

    feature_names = prep.get_feature_names_out()
    tree = clf.tree_

    print("Customer input:\n", single_row.to_string(index=False), "\n")
    print("Decision path (rules):")
    for node_id in path_nodes:
        # The leaf carries no split rule, only the final prediction.
        if node_id == leaf_id:
            continue

        feature = tree.feature[node_id]
        # scikit-learn marks non-split nodes with a negative feature index
        # (TREE_UNDEFINED); checking the sign avoids importing the private
        # sklearn.tree._tree module just for that constant.
        if feature < 0:
            continue

        threshold = tree.threshold[node_id]
        fname = feature_names[feature]
        val = X_trans[0, feature]
        if val <= threshold:
            print(f"- {fname} = {val:.3f} <= {threshold:.3f}  → go LEFT")
        else:
            print(f"- {fname} = {val:.3f} >  {threshold:.3f}  → go RIGHT")

    proba = pipeline.predict_proba(single_row)[0]
    pred = pipeline.predict(single_row)[0]
    print("\nPrediction:", "Churn" if pred==1 else "No Churn")
    print("Probabilities [NoChurn, Churn]:", proba)

# Draw a single held-out customer (fixed seed) and walk through its decision path.
sample = X_test.sample(n=1, random_state=1)
explain_tree_decision(pipeline=tree_model, single_row=sample)


Customer input:
  MonthlyUsageHours  MonthlyBillPKR  Complaints  TenureMonths ContractType
              50.7          2326.0           2            16      Monthly 

Decision path (rules):
- num__MonthlyBillPKR = 2326.000 <= 2560.000  → go LEFT
- num__MonthlyBillPKR = 2326.000 >  2114.000  → go RIGHT
- num__MonthlyBillPKR = 2326.000 >  2158.500  → go RIGHT
- num__MonthlyBillPKR = 2326.000 >  2316.000  → go RIGHT

Prediction: No Churn
Probabilities [NoChurn, Churn]: [0.61290323 0.38709677]


In [6]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# RBF-kernel SVM on the same preprocessed features as the decision tree.
svm_model = Pipeline(steps=[
    ("prep", preprocess),
    ("scale", StandardScaler(with_mean=False)),  # sparse-safe
    # probability=True fits an internally cross-validated calibration whose
    # data shuffling is random. Pin random_state (as the tree and the
    # train/test split already do) so predict_proba does not depend on the
    # state of NumPy's global generator when this cell happens to run.
    ("clf", SVC(kernel="rbf", C=5, gamma="scale", probability=True, random_state=42))
])

svm_model.fit(X_train, y_train)
pred_svm = svm_model.predict(X_test)

# Report held-out performance: accuracy, confusion matrix, per-class metrics.
print("SVM (RBF) Accuracy:", accuracy_score(y_test, pred_svm))
print(confusion_matrix(y_test, pred_svm))
print(classification_report(y_test, pred_svm))


SVM (RBF) Accuracy: 0.8266666666666667
[[32  7]
 [ 6 30]]
              precision    recall  f1-score   support

           0       0.84      0.82      0.83        39
           1       0.81      0.83      0.82        36

    accuracy                           0.83        75
   macro avg       0.83      0.83      0.83        75
weighted avg       0.83      0.83      0.83        75



In [7]:
# Score one held-out customer (fixed seed) with the SVM pipeline.
sample2 = X_test.sample(n=1, random_state=7)
pred2 = svm_model.predict(sample2)[0]
proba2 = svm_model.predict_proba(sample2)[0]

print(sample2)
label2 = "Churn" if pred2 == 1 else "No Churn"
print("\nPrediction:", label2)
print("Probabilities [NoChurn, Churn]:", proba2)



     MonthlyUsageHours  MonthlyBillPKR  Complaints  TenureMonths ContractType
254               29.6          2770.0           2            39      Monthly

Prediction: Churn
Probabilities [NoChurn, Churn]: [0.2586314 0.7413686]
