Q1

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt

# Load the breast-cancer classification dataset bundled with scikit-learn.
data = load_breast_cancer()
X = data.data
y = data.target

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Unrestricted tree (no max_depth), so it is free to fit the training data
# very closely. Seeded because scikit-learn permutes features at every split
# and breaks ties between equally good splits randomly; without a seed the
# reported accuracies can change from run to run.
full_tree = DecisionTreeClassifier(random_state=42)
full_tree.fit(X_train, y_train)

# Compare accuracy on the training and test sets to expose overfitting.
train_acc_full = full_tree.score(X_train, y_train)
test_acc_full = full_tree.score(X_test, y_test)

print("Full Decision Tree:")
print("Train Accuracy:", train_acc_full)
print("Test Accuracy :", test_acc_full)

# Draw the fitted tree; a small font is needed because the full tree is large.
plt.figure(figsize=(10, 8))
plot_tree(full_tree, filled=True, fontsize=6)
plt.show()
# Capping the depth at 3 pre-prunes the tree, which limits overfitting.
# Seeded so that ties between equally good splits are broken the same way on
# every run and the reported accuracies are reproducible.
pruned_tree = DecisionTreeClassifier(max_depth=3, random_state=42)
pruned_tree.fit(X_train, y_train)

train_acc_pruned = pruned_tree.score(X_train, y_train)
test_acc_pruned = pruned_tree.score(X_test, y_test)

# The leading "\n" puts a blank line between this report and earlier output.
print("\nPruned Decision Tree:")
print("Train Accuracy:", train_acc_pruned)
print("Test Accuracy :", test_acc_pruned)


The full tree fits the training data too closely and overfits, so its training accuracy is very high while its test accuracy is lower.
The pruned tree is simpler and more balanced, so it generalizes better to the test data.

Q2

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
# Train an ensemble of 100 decision trees and combine their votes.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Same evaluation as for the single trees, for a direct comparison.
rf_train_acc = rf.score(X_train, y_train)
rf_test_acc = rf.score(X_test, y_test)

print("Random Forest:")
print("Train Accuracy:", rf_train_acc)
print("Test Accuracy :", rf_test_acc)

# Predict once and reuse the result for both reports below.
rf_test_pred = rf.predict(X_test)

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, rf_test_pred))

print("\nClassification Report:")
print(classification_report(y_test, rf_test_pred))

# Side-by-side test accuracy of the three models trained so far.
print("\nComparison:")
print("Full Tree Test Acc   :", test_acc_full)
print("Pruned Tree Test Acc :", test_acc_pruned)
print("Random Forest Test Acc:", rf_test_acc)


The Random Forest is more stable and accurate because it averages the predictions of many trees.
This reduces overfitting, so it usually performs better than a single decision tree.

Q3

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting builds trees sequentially; each new tree is fitted to
# correct the errors of the ensemble built so far.
# Seeded (like the split and the Random Forest) so results are reproducible.
gb_default = GradientBoostingClassifier(random_state=42)
gb_default.fit(X_train, y_train)

print("Gradient Boosting:")
print("Train Accuracy:", gb_default.score(X_train, y_train))
print("Test Accuracy :", gb_default.score(X_test, y_test))
# Try every combination of learning rate and number of trees and report the
# train/test accuracy of each, to see which setting generalizes best.
learning_rates = [0.01, 0.1]
estimators = [50, 100, 200]

print("\nTesting Learning Rates and Estimators:")
for lr in learning_rates:
    for n in estimators:
        # Fixed seed so differences between runs come only from the
        # hyper-parameters, not from random tie-breaking inside the trees.
        gb = GradientBoostingClassifier(
            learning_rate=lr, n_estimators=n, random_state=42
        )
        gb.fit(X_train, y_train)
        print("\nlearning_rate =", lr, " n_estimators =", n)
        print("Train Accuracy:", gb.score(X_train, y_train))
        print("Test Accuracy :", gb.score(X_test, y_test))


Increasing the number of estimators makes the model stronger, but too many can lead to overfitting.
A lower learning rate makes training slower and needs more estimators to reach the same fit, but it usually improves generalization.

Q4

In [None]:
import numpy as np

print("\nTop 5 Feature Importances (Random Forest):")
# One importance score per input feature, as reported by the fitted forest.
rf_importances = rf.feature_importances_
# argsort sorts ascending, so the last five indices are the five largest
# scores; reverse them so the most important feature is printed first.
top5_rf = np.argsort(rf_importances)[-5:][::-1]
for i in top5_rf:
    print(data.feature_names[i], ":", rf_importances[i])
# One importance score per input feature, as reported by the default
# gradient-boosting model.
gb_importances = gb_default.feature_importances_
# argsort sorts ascending, so the last five indices are the five largest
# scores; reverse them so the most important feature is printed first.
top5_gb = np.argsort(gb_importances)[-5:][::-1]

print("\nTop 5 Feature Importances (Gradient Boosting):")
for i in top5_gb:
    print(data.feature_names[i], ":", gb_importances[i])


Random Forest and Gradient Boosting both highlight a similar set of important features.

Gradient Boosting often gives a slightly clearer importance ranking, because it tends to concentrate importance on fewer features.