In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

Sheet_3_Q1_Assig: Complete Pipeline with Preprocessing

In [None]:
# Load the breast-cancer dataset as a feature matrix X and a label vector y.
data = load_breast_cancer()
X, y = data.data, data.target

# Hold out 20% of the rows for testing; the fixed seed makes the split
# repeatable. This split is NOT stratified, unlike the one used in the
# model-comparison section further down.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scaling is a pipeline step, so the scaler is fitted on the training rows
# only (via pipe.fit) and then applied unchanged when predicting.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(max_iter=500)),
])
pipe.fit(X_train, y_train)

# Score the fitted pipeline on the held-out rows.
pred = pipe.predict(X_test)
acc = accuracy_score(y_test, pred)
cm = confusion_matrix(y_test, pred)

print("Accuracy:", acc)
print("Confusion Matrix:", cm, sep="\n")


Accuracy: 0.9736842105263158
Confusion Matrix:
[[41  2]
 [ 1 70]]


Sheet_3_Assig_Q2: Model Interpretation - Feature Importance (Logistic Regression Coefficients)

In [None]:
# Retrieve the fitted steps from the pipeline by the names they were given.
scaler = pipe.named_steps['scaler']
model = pipe.named_steps['model']

feature_names = data.feature_names
# First row of the coefficient matrix. Because the model sits after the
# scaler in the pipeline, these weights apply to the standardized features.
coefs = model.coef_[0]

# Feature indices ordered from largest to smallest absolute coefficient.
sorted_idx = np.argsort(np.abs(coefs))[::-1]

print("Top Important Features:")
for feat_idx in sorted_idx[:10]:
    # Print the signed coefficient; only the ranking uses the magnitude.
    print(feature_names[feat_idx], ":", coefs[feat_idx])

Top Important Features:
worst texture : -1.3506055922623306
radius error : -1.2681781455073828
worst symmetry : -1.2082003069317806
mean concave points : -1.1198040756502856
worst concavity : -0.9430531344506452
area error : -0.9071857032866862
worst radius : -0.879840235250774
worst area : -0.8418459407075642
mean concavity : -0.8014580988560307
worst concave points : -0.7782172634838923


*Sheet_4_Q4*: Model Interpretation - Feature Importance (Random Forest vs. Gradient Boosting)

In [None]:
def show_top_importances(label, model, feature_names, k=5):
    """Print the ``k`` largest feature importances of a fitted model.

    Parameters
    ----------
    label : str
        Short model tag used in the header line, e.g. ``"RF"``.
    model : fitted estimator
        Must expose a ``feature_importances_`` array.
    feature_names : sequence
        Names indexed the same way as ``model.feature_importances_``.
    k : int, default 5
        Number of top-ranked features to print.

    Returns
    -------
    tuple
        ``(importances, top_idx)``: the full importance array and the indices
        of the ``k`` largest entries, ordered from most to least important.
    """
    importances = model.feature_importances_
    # argsort is ascending, so reverse it before taking the first k indices.
    top_idx = np.argsort(importances)[::-1][:k]
    print(f"{label} Top {k}:")
    for i in top_idx:
        print(feature_names[i], importances[i])
    return importances, top_idx


# Both models are trained on whatever X_train / y_train currently hold; when
# the notebook is run top to bottom that is the split from the pipeline cell.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf_imp, rf_idx = show_top_importances("RF", rf, data.feature_names)

gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)

print()  # blank line between the two rankings
gb_imp, gb_idx = show_top_importances("GB", gb, data.feature_names)

RF Top 5:
worst area 0.15389236463205394
worst concave points 0.14466326620735528
mean concave points 0.10620998844591638
worst radius 0.07798687515738047
mean concavity 0.06800084191430111

GB Top 5:
mean concave points 0.4505275876506107
worst concave points 0.24010308555229953
worst radius 0.075588875185215
worst perimeter 0.05140821113749899
worst texture 0.03988648003422685


Assignment – Model Comparison (Decision Tree / Random Forest / Gradient Boosting)

In [None]:
def fit_and_report(title, model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and print train/test accuracy.

    Parameters
    ----------
    title : str
        Header line printed above the two accuracy lines.
    model : estimator
        Classifier exposing ``fit`` and ``predict``; it is fitted in place.
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels.

    Returns
    -------
    tuple
        ``(train_accuracy, test_accuracy)`` as returned by ``accuracy_score``.
    """
    model.fit(X_train, y_train)
    train_acc = accuracy_score(y_train, model.predict(X_train))
    test_acc = accuracy_score(y_test, model.predict(X_test))
    print(title)
    print("Train Accuracy:", train_acc)
    print("Test Accuracy:", test_acc)
    return train_acc, test_acc


data = load_breast_cancer()
X, y = data.data, data.target

# NOTE(review): this re-split is stratified and rebinds X_train / X_test /
# y_train / y_test from the earlier pipeline cell. Re-running earlier cells
# after this one will therefore use a different split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Unconstrained tree: no depth limit is set.
dt_full = DecisionTreeClassifier(random_state=42)
full_train_acc, full_test_acc = fit_and_report(
    "Full Decision Tree:", dt_full, X_train, y_train, X_test, y_test
)

# Same tree capped at depth 3, for comparison against the unconstrained one.
dt_pruned = DecisionTreeClassifier(max_depth=3, random_state=42)
pruned_train_acc, pruned_test_acc = fit_and_report(
    "\nPruned Decision Tree (max_depth=3):",
    dt_pruned, X_train, y_train, X_test, y_test,
)

# NOTE(review): `rf` is rebound here, replacing the forest fitted in the
# feature-importance cell. The section title also mentions Gradient
# Boosting, but no gradient-boosting model is evaluated in this cell.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_train_acc, rf_test_acc = fit_and_report(
    "\nRandom Forest (100 trees):", rf, X_train, y_train, X_test, y_test
)


Full Decision Tree:
Train Accuracy: 1.0
Test Accuracy: 0.9122807017543859

Pruned Decision Tree (max_depth=3):
Train Accuracy: 0.9758241758241758
Test Accuracy: 0.9385964912280702

Random Forest (100 trees):
Train Accuracy: 1.0
Test Accuracy: 0.956140350877193
