# HW8 on Data Science course of Sharif University of Technology
## Created by: Mohammad Mahdi Hossein Beiky     SI: 400100995
## GitHub URL: https://github.com/Mmhb1382/Data_Science_HW8.git
---

In [5]:
# hw8_task1_svm.py

"""
Task 1: Multiclass SVM on the Ames Housing Dataset

Steps:
 1. Load the raw CSV into a pandas DataFrame.
 2. Create a 4‐class target ('PriceClass') by quartiling SalePrice.
 3. Build preprocessing pipelines to median‐impute & scale numerics, and to fill/one‐hot encode categoricals.
 4. Apply the pipelines and assemble `final_df` (features + 'PriceClass').
 5. Split `final_df` into train/test sets (80/20), stratified on 'PriceClass'.
 6. Fit a LinearSVC on the preprocessed features.
 7. Evaluate with macro‐F1 and output a full classification report.
"""

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, classification_report

# ──────────────────────────────────────────────────────────────────────────────
# 1) Load the dataset
df = pd.read_csv('AmesHousing.csv')  # adjust path if needed

# ──────────────────────────────────────────────────────────────────────────────
# 2) Create the multiclass target
#    Split SalePrice into 4 equal‐size bins labeled 0,1,2,3
df['PriceClass'] = pd.qcut(df['SalePrice'], q=4, labels=False)

# ──────────────────────────────────────────────────────────────────────────────
# 3) Prepare data for preprocessing
X_raw = df.drop(['SalePrice', 'PriceClass'], axis=1)
y     = df['PriceClass']

# Identify numeric vs. categorical columns
numeric_features     = X_raw.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X_raw.select_dtypes(include=['object']).columns.tolist()

# ──────────────────────────────────────────────────────────────────────────────
# 4) Build preprocessing pipelines
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),  # fill gaps with median
    ('scaler',   StandardScaler()),                  # standardize features
])
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot',  OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
    ('num', numeric_pipeline,     numeric_features),
    ('cat', categorical_pipeline, categorical_features),
])

# Apply preprocessing to raw features
X_processed = preprocessor.fit_transform(X_raw)

# Retrieve feature names for the one‐hot columns
ohe       = preprocessor.named_transformers_['cat'].named_steps['onehot']
cat_names = ohe.get_feature_names_out(categorical_features)
feature_names = numeric_features + list(cat_names)

# Build final_df containing all processed features
final_df = pd.DataFrame(X_processed, columns=feature_names, index=X_raw.index)

# Attach the multiclass target to final_df
final_df['PriceClass'] = y

# ──────────────────────────────────────────────────────────────────────────────
# 5) Train/test split (stratified by PriceClass)
X_train, X_test, y_train, y_test = train_test_split(
    final_df.drop('PriceClass', axis=1),
    final_df['PriceClass'],
    test_size=0.20,
    random_state=42,
    stratify=final_df['PriceClass']
)

# ──────────────────────────────────────────────────────────────────────────────
# 6) Train a Linear SVM
svm = LinearSVC(max_iter=10000, random_state=42)
svm.fit(X_train, y_train)

# ──────────────────────────────────────────────────────────────────────────────
# 7) Evaluate performance
y_pred   = svm.predict(X_test)
macro_f1 = f1_score(y_test, y_pred, average='macro')

print(f"✅ Macro F1‐score: {macro_f1:.3f}  (goal ≥ 0.625)\n")
print("Full classification report:\n")
print(classification_report(y_test, y_pred))


✅ Macro F1‐score: 0.753  (goal ≥ 0.625)

Full classification report:

              precision    recall  f1-score   support

           0       0.80      0.85      0.82       148
           1       0.67      0.62      0.65       146
           2       0.70      0.66      0.68       146
           3       0.83      0.88      0.86       146

    accuracy                           0.76       586
   macro avg       0.75      0.76      0.75       586
weighted avg       0.75      0.76      0.75       586



In [8]:
# ──────────────────────────────────────────────────────────────────────────────
# Cell: Task 2.1 — Logistic Regression (OVR)
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report

# 1) Build a base LogisticRegression (no multi_class argument)
base_lr = LogisticRegression(
    solver='lbfgs',
    max_iter=10000,
    random_state=42
)

# 2) Wrap it in OneVsRestClassifier to get OVR behavior
logreg_ovr = OneVsRestClassifier(base_lr)

# 3) Fit on the preprocessed training data
logreg_ovr.fit(X_train, y_train)

# 4) Predict & evaluate
y_pred_ovr = logreg_ovr.predict(X_test)
f1_ovr     = f1_score(y_test, y_pred_ovr, average='macro')

print(f"✅ OVR Logistic Regression macro-F1: {f1_ovr:.3f}  (goal ≥ 0.625)\n")
print("Classification Report (OVR):\n", classification_report(y_test, y_pred_ovr))


✅ OVR Logistic Regression macro-F1: 0.764  (goal ≥ 0.625)

Classification Report (OVR):
               precision    recall  f1-score   support

           0       0.78      0.85      0.82       148
           1       0.67      0.61      0.64       146
           2       0.74      0.68      0.71       146
           3       0.86      0.92      0.89       146

    accuracy                           0.77       586
   macro avg       0.76      0.77      0.76       586
weighted avg       0.76      0.77      0.76       586



In [14]:
# ──────────────────────────────────────────────────────────────────────────────
# Cell: Task 2.2 — Tuned Multinomial Logistic Regression (no warning, higher F1)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, classification_report

# Base LR (solver='saga' supports both l1 & l2 penalties)
base_lr = LogisticRegression(
    solver='saga',
    max_iter=20000,
    random_state=42
)

# Grid over penalty type, regularization C, and class weights
param_grid = {
    'penalty':      ['l2'],               # drop 'l1' for now
    'C':            [0.1, 1, 10],         # fewer values
    'class_weight': ['balanced', None]
}
grid = GridSearchCV(
    base_lr,
    param_grid,
    cv=3,                 # 3-fold instead of 5-fold
    scoring='f1_macro',
    n_jobs=-1
)

grid.fit(X_train, y_train)

# Pull out the best model
best_lr  = grid.best_estimator_
best_params = grid.best_params_

# Evaluate on test set
y_pred   = best_lr.predict(X_test)
f1_best  = f1_score(y_test, y_pred, average='macro')

print(f"🏆 Best parameters: {best_params}")
print(f"✅ Tuned Logistic Regression macro-F1: {f1_best:.3f}  (goal ≥ 0.625)\n")
print("Classification Report (tuned):\n", classification_report(y_test, y_pred))


🏆 Best parameters: {'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l2'}
✅ Tuned Logistic Regression macro-F1: 0.800  (goal ≥ 0.625)

Classification Report (tuned):
               precision    recall  f1-score   support

           0       0.81      0.88      0.84       148
           1       0.72      0.66      0.69       146
           2       0.76      0.75      0.76       146
           3       0.90      0.91      0.90       146

    accuracy                           0.80       586
   macro avg       0.80      0.80      0.80       586
weighted avg       0.80      0.80      0.80       586



In [16]:
# ──────────────────────────────────────────────────────────────────────────────
# Cell: Task 2.3 — Evaluate Tuned Logistic Regression (F1 + Log-Loss)

from sklearn.metrics import f1_score, log_loss, classification_report

# 1) Predict classes and probabilities with the tuned model
y_pred  = best_lr.predict(X_test)
y_proba = best_lr.predict_proba(X_test)

# 2) Compute macro-F1 (should match your ~0.80 from Task 2.2)
f1    = f1_score(y_test, y_pred, average='macro')
# 3) Compute multiclass log‐loss
ll    = log_loss(y_test, y_proba)

# 4) Report both
print(f"✅ Tuned Logistic Regression macro-F1: {f1:.3f}")
print(f"📉 Tuned Logistic Regression log-loss: {ll:.3f}\n")
print("Classification Report (Tuned):\n", classification_report(y_test, y_pred))


✅ Tuned Logistic Regression macro-F1: 0.800
📉 Tuned Logistic Regression log-loss: 0.540

Classification Report (Tuned):
               precision    recall  f1-score   support

           0       0.81      0.88      0.84       148
           1       0.72      0.66      0.69       146
           2       0.76      0.75      0.76       146
           3       0.90      0.91      0.90       146

    accuracy                           0.80       586
   macro avg       0.80      0.80      0.80       586
weighted avg       0.80      0.80      0.80       586



In [18]:
# ──────────────────────────────────────────────────────────────────────────────
# Cell: Task 3 — Supress warnings & hyper-tune KNN for max macro-F1
import warnings
warnings.filterwarnings('ignore')  # silence Loky/user warnings

from sklearn.neighbors      import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics        import f1_score, classification_report

# Expanded grid: try different k’s, both uniform/distance, Minkowski p=1 or 2
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
    'weights':     ['uniform', 'distance'],
    'p':           [1, 2]
}

grid_knn = GridSearchCV(
    KNeighborsClassifier(),
    param_grid,
    cv=3,              # 3-fold CV to speed up
    scoring='f1_macro',
    n_jobs=1,          # avoid Loky warnings by not parallelizing
    verbose=0
)
grid_knn.fit(X_train, y_train)

# Grab the best model and evaluate on the test set
best_knn  = grid_knn.best_estimator_
best_params = grid_knn.best_params_
y_pred     = best_knn.predict(X_test)
f1_best    = f1_score(y_test, y_pred, average='macro')

print(f"🏆 Best KNN params: {best_params}")
print(f"✅ Tuned KNN macro-F1: {f1_best:.3f}  (goal ≥ 0.625)\n")
print("Classification Report (Tuned KNN):\n", classification_report(y_test, y_pred))


🏆 Best KNN params: {'n_neighbors': 9, 'p': 2, 'weights': 'distance'}
✅ Tuned KNN macro-F1: 0.738  (goal ≥ 0.625)

Classification Report (Tuned KNN):
               precision    recall  f1-score   support

           0       0.76      0.83      0.79       148
           1       0.62      0.62      0.62       146
           2       0.68      0.71      0.69       146
           3       0.91      0.79      0.85       146

    accuracy                           0.74       586
   macro avg       0.74      0.74      0.74       586
weighted avg       0.74      0.74      0.74       586



In [19]:
# ──────────────────────────────────────────────────────────────────────────────
# Cell: Task 4.1 — Multiclass Decision Tree (baseline)
"""
Train a basic Decision Tree on your fully-preprocessed features,
then evaluate its macro-F1 on the held-out test set.
"""

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, classification_report

# 1) Instantiate & train the tree
dt_baseline = DecisionTreeClassifier(random_state=42)
dt_baseline.fit(X_train, y_train)

# 2) Predict & compute macro-F1
y_pred_dt   = dt_baseline.predict(X_test)
f1_dt       = f1_score(y_test, y_pred_dt, average='macro')

print(f"✅ Decision Tree (baseline) macro-F1: {f1_dt:.3f}  (goal ≥ 0.625)\n")
print("Classification Report (baseline DT):\n", classification_report(y_test, y_pred_dt))


✅ Decision Tree (baseline) macro-F1: 0.697  (goal ≥ 0.625)

Classification Report (baseline DT):
               precision    recall  f1-score   support

           0       0.76      0.72      0.74       148
           1       0.58      0.62      0.60       146
           2       0.63      0.62      0.63       146
           3       0.83      0.82      0.83       146

    accuracy                           0.70       586
   macro avg       0.70      0.70      0.70       586
weighted avg       0.70      0.70      0.70       586



In [21]:
# ──────────────────────────────────────────────────────────────────────────────
# Cell: Task 4.2 — Randomized tuning for Decision Tree to maximize macro-F1

import warnings
warnings.filterwarnings('ignore')   # quiet any loky / deprecation noise

from sklearn.tree             import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics         import f1_score, classification_report
from scipy.stats             import randint

# Base classifier
dt = DecisionTreeClassifier(random_state=42)

# Parameter distribution
param_dist = {
    'criterion':          ['gini', 'entropy', 'log_loss'],
    'max_depth':          [None, 5, 10, 15, 20, 30],
    'min_samples_split':  randint(2, 20),
    'min_samples_leaf':   randint(1, 10),
    'class_weight':       [None, 'balanced']
}

# Randomized search (20 candidates, 5-fold CV)
rand_dt = RandomizedSearchCV(
    dt,
    param_dist,
    n_iter=20,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1,
    random_state=42,
    verbose=0
)
rand_dt.fit(X_train, y_train)

# Evaluate best model
best_dt     = rand_dt.best_estimator_
best_params = rand_dt.best_params_
y_pred      = best_dt.predict(X_test)
f1_best     = f1_score(y_test, y_pred, average='macro')

print(f"🏆 Best DT params: {best_params}")
print(f"✅ Tuned Decision Tree macro-F1: {f1_best:.3f}  (goal ≥ 0.625)\n")
print("Classification Report (tuned DT):\n", classification_report(y_test, y_pred))


🏆 Best DT params: {'class_weight': None, 'criterion': 'log_loss', 'max_depth': 30, 'min_samples_leaf': 7, 'min_samples_split': 9}
✅ Tuned Decision Tree macro-F1: 0.718  (goal ≥ 0.625)

Classification Report (tuned DT):
               precision    recall  f1-score   support

           0       0.76      0.76      0.76       148
           1       0.61      0.59      0.60       146
           2       0.67      0.70      0.68       146
           3       0.84      0.82      0.83       146

    accuracy                           0.72       586
   macro avg       0.72      0.72      0.72       586
weighted avg       0.72      0.72      0.72       586



In [22]:
# ──────────────────────────────────────────────────────────────────────────────
# Cell: Task 5.1 — XGBoost (baseline)
"""
Train a basic XGBClassifier on the preprocessed features,
then evaluate its macro-F1 on the test set.
"""
import warnings
warnings.filterwarnings('ignore')

from xgboost import XGBClassifier
from sklearn.metrics import f1_score, classification_report

# 1) Instantiate & train
xgb = XGBClassifier(
    use_label_encoder=False,    # suppress deprecation warning
    eval_metric='mlogloss',     # multiclass logloss
    random_state=42
)
xgb.fit(X_train, y_train)

# 2) Predict & evaluate
y_pred_xgb = xgb.predict(X_test)
f1_xgb     = f1_score(y_test, y_pred_xgb, average='macro')
print(f"✅ XGBoost macro-F1: {f1_xgb:.3f}  (goal ≥ 0.625)\n")
print("Classification Report (XGBoost):\n", classification_report(y_test, y_pred_xgb))

✅ XGBoost macro-F1: 0.792  (goal ≥ 0.625)

Classification Report (XGBoost):
               precision    recall  f1-score   support

           0       0.84      0.82      0.83       148
           1       0.68      0.67      0.67       146
           2       0.76      0.77      0.77       146
           3       0.89      0.90      0.89       146

    accuracy                           0.79       586
   macro avg       0.79      0.79      0.79       586
weighted avg       0.79      0.79      0.79       586



In [24]:
# ──────────────────────────────────────────────────────────────────────────────
# Cell: Task 5.2 — LightGBM (baseline)
"""
Train a basic LGBMClassifier on the preprocessed features,
then evaluate its macro-F1 on the test set.
"""
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score, classification_report

# 1) Instantiate & train
lgb = LGBMClassifier(random_state=42)
lgb.fit(X_train, y_train)

# 2) Predict & evaluate
y_pred_lgb = lgb.predict(X_test)
f1_lgb     = f1_score(y_test, y_pred_lgb, average='macro')
print(f"✅ LightGBM macro-F1: {f1_lgb:.3f}  (goal ≥ 0.625)\n")
print("Classification Report (LightGBM):\n", classification_report(y_test, y_pred_lgb))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002135 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4320
[LightGBM] [Info] Number of data points in the train set: 2344, number of used features: 222
[LightGBM] [Info] Start training from score -1.377798
[LightGBM] [Info] Start training from score -1.393144
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.388002
✅ LightGBM macro-F1: 0.794  (goal ≥ 0.625)

Classification Report (LightGBM):
               precision    recall  f1-score   support

           0       0.84      0.82      0.83       148
           1       0.68      0.71      0.69       146
           2       0.77      0.76      0.76       146
           3       0.89      0.88      0.89       146

    accuracy                           0.79       586
   macro avg       0.79   

In [28]:
# ──────────────────────────────────────────────────────────────────────────────
# Cell: Task 5.3 — AdaBoost (baseline)
"""
Train a basic AdaBoostClassifier on the preprocessed features,
then evaluate its macro-F1 on the test set.
"""
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics    import f1_score, classification_report

# 1) Instantiate & train
ada = AdaBoostClassifier(random_state=42)
ada.fit(X_train, y_train)

# 2) Predict & evaluate
y_pred_ada = ada.predict(X_test)
f1_ada     = f1_score(y_test, y_pred_ada, average='macro')
print(f"✅ AdaBoost macro-F1: {f1_ada:.3f}  (goal ≥ 0.625)\n")
print("Classification Report (AdaBoost):\n", classification_report(y_test, y_pred_ada))


✅ AdaBoost macro-F1: 0.665  (goal ≥ 0.625)

Classification Report (AdaBoost):
               precision    recall  f1-score   support

           0       0.78      0.64      0.70       148
           1       0.54      0.65      0.59       146
           2       0.57      0.74      0.64       146
           3       0.90      0.61      0.73       146

    accuracy                           0.66       586
   macro avg       0.70      0.66      0.66       586
weighted avg       0.70      0.66      0.66       586



In [27]:
# ──────────────────────────────────────────────────────────────────────────────
# Cell: Task 5.3b — Tune AdaBoost for macro-F1 (fixed estimator argument)

import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble        import AdaBoostClassifier
from sklearn.tree            import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics         import f1_score, classification_report

# 1) Tunable base estimator
base_tree = DecisionTreeClassifier(random_state=42)

# Note: modern sklearn uses `estimator=` instead of `base_estimator=`
ada = AdaBoostClassifier(
    estimator=base_tree,
    random_state=42
)

# 2) Hyperparameter grid
param_grid = {
    'n_estimators':            [50, 100, 200, 500],
    'learning_rate':           [0.01, 0.1, 0.5, 1.0],
    'estimator__max_depth':    [1, 2, 3]
}

# 3) GridSearchCV (3-fold CV, macro-F1)
grid_ada = GridSearchCV(
    ada,
    param_grid,
    cv=3,
    scoring='f1_macro',
    n_jobs=-1
)
grid_ada.fit(X_train, y_train)

# 4) Evaluate best model
best_ada    = grid_ada.best_estimator_
best_params = grid_ada.best_params_
y_pred_ada  = best_ada.predict(X_test)
f1_ada      = f1_score(y_test, y_pred_ada, average='macro')

print(f"🏆 Best AdaBoost params: {best_params}")
print(f"✅ Tuned AdaBoost macro-F1: {f1_ada:.3f}  (goal ≥ 0.625)\n")
print("Classification Report (Tuned AdaBoost):\n", classification_report(y_test, y_pred_ada))


🏆 Best AdaBoost params: {'estimator__max_depth': 3, 'learning_rate': 0.1, 'n_estimators': 500}
✅ Tuned AdaBoost macro-F1: 0.791  (goal ≥ 0.625)

Classification Report (Tuned AdaBoost):
               precision    recall  f1-score   support

           0       0.85      0.76      0.80       148
           1       0.64      0.73      0.69       146
           2       0.76      0.79      0.77       146
           3       0.93      0.88      0.90       146

    accuracy                           0.79       586
   macro avg       0.80      0.79      0.79       586
weighted avg       0.80      0.79      0.79       586



In [26]:
# ──────────────────────────────────────────────────────────────────────────────
# Cell: Task 5.4 — Grid Search to Tune XGBoost
"""
Grid-search key XGBoost hyperparameters (n_estimators, max_depth, learning_rate)
to maximize macro-F1, then evaluate the best model on the test set.
"""
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics        import f1_score, classification_report

# 1) Define grid
param_grid = {
    'n_estimators':   [100, 200],
    'max_depth':      [3, 5, 7],
    'learning_rate':  [0.01, 0.1, 0.2]
}

# 2) Grid search (3-fold CV, optimizing macro-F1)
grid_xgb = GridSearchCV(
    XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
    param_grid,
    cv=3,
    scoring='f1_macro',
    n_jobs=-1
)
grid_xgb.fit(X_train, y_train)

# 3) Extract best model
best_xgb     = grid_xgb.best_estimator_
best_params  = grid_xgb.best_params_

# 4) Predict & evaluate
y_pred_best  = best_xgb.predict(X_test)
f1_best      = f1_score(y_test, y_pred_best, average='macro')

print(f"🏆 Best XGBoost params: {best_params}")
print(f"✅ Tuned XGBoost macro-F1: {f1_best:.3f}  (goal ≥ 0.625)\n")
print("Classification Report (Tuned XGBoost):\n", classification_report(y_test, y_pred_best))


🏆 Best XGBoost params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
✅ Tuned XGBoost macro-F1: 0.800  (goal ≥ 0.625)

Classification Report (Tuned XGBoost):
               precision    recall  f1-score   support

           0       0.86      0.84      0.85       148
           1       0.69      0.71      0.70       146
           2       0.76      0.75      0.76       146
           3       0.89      0.90      0.89       146

    accuracy                           0.80       586
   macro avg       0.80      0.80      0.80       586
weighted avg       0.80      0.80      0.80       586



### I applied GridSearch on AdaBoost and after one minute the resultant F1-score has been risen up from 0.665 to 0.791

## Task 6: Extending KNN and Decision Trees to Multi-Label Classification

In a **multi-label** setting, each instance can belong to *zero, one, or multiple* classes simultaneously (e.g. a house might be categorized as both “luxury” *and* “vintage”). To adapt our KNN and Decision-Tree pipelines:

---

### 1. KNN → ML-KNN (Multi-Label KNN)

- **Idea:** Instead of a single `y` per sample, we have a boolean vector `y ∈ {0,1}^L` for L labels.
- **Algorithm (ML-KNN):**
  1. For each test point, find its *k* nearest neighbors in feature space.
  2. Count how many of those neighbors have each label *j* (call that count \( c_j \)).
  3. Use a Bayesian update (with smoothing) to estimate
     \[
       P(\text{label}_j = 1 \mid c_j)
     \]
     and threshold these probabilities to decide which labels to assign.
- **Implementation in scikit-learn:**
  ```python
  from skmultilearn.adapt import MLkNN
  knn_ml = MLkNN(k=5)
  knn_ml.fit(X_train, Y_train_multi)      # Y_train_multi: shape (n_samples, n_labels)
  Y_pred       = knn_ml.predict(X_test)
