### AdaBoosting

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import SpectralClustering
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from scipy.stats import randint, uniform

In [2]:
df = pd.read_csv("student_sleep_patterns.csv")

# Stratified train-test split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['Sleep_Quality'])


In [3]:
# Bin Sleep_Quality into 3 categories
bins = [df['Sleep_Quality'].min() - 1, 4, 7, df['Sleep_Quality'].max() + 1]
labels = [0, 1, 2]  # 0 = Low, 1 = Medium, 2 = High
train_df['Sleep_Quality_Category'] = pd.cut(train_df['Sleep_Quality'], bins=bins, labels=labels).astype(int)
test_df['Sleep_Quality_Category'] = pd.cut(test_df['Sleep_Quality'], bins=bins, labels=labels).astype(int)

# Create binary label: 1 if High sleep quality, 0 otherwise
train_df['Sleep_Quality_Binary'] = train_df['Sleep_Quality_Category'].apply(lambda x: 1 if x == 2 else 0)
test_df['Sleep_Quality_Binary'] = test_df['Sleep_Quality_Category'].apply(lambda x: 1 if x == 2 else 0)


In [4]:
# Drop Student_ID if exists
for df_ in [train_df, test_df]:
    df_.drop(columns=[col for col in ['Student_ID'] if col in df_.columns], inplace=True)


In [5]:
# Encode categorical columns
cat_cols = train_df.select_dtypes(include='object').columns
le_dict = {}
for col in cat_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = test_df[col].map(lambda s: le.transform([s])[0] if s in le.classes_ else -1)
    le_dict[col] = le

In [6]:
# Separate features and target
X_train = train_df.drop(columns=['Sleep_Quality', 'Sleep_Quality_Category', 'Sleep_Quality_Binary'])
y_train = train_df['Sleep_Quality_Binary']
X_test = test_df.drop(columns=['Sleep_Quality', 'Sleep_Quality_Category', 'Sleep_Quality_Binary'])
y_test = test_df['Sleep_Quality_Binary']


In [7]:
# Standardize for clustering
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Spectral Clustering
n_clusters = 3
spectral = SpectralClustering(n_clusters=n_clusters, affinity='nearest_neighbors', random_state=42)
train_clusters = spectral.fit_predict(X_train_scaled)

# Use RandomForestClassifier to mimic clustering for test set
from sklearn.ensemble import RandomForestClassifier
rf_cluster = RandomForestClassifier(random_state=42, n_estimators=100)
rf_cluster.fit(X_train, train_clusters)
test_clusters = rf_cluster.predict(X_test)

# Add cluster as a new feature
X_train['Spectral_Cluster'] = train_clusters
X_test['Spectral_Cluster'] = test_clusters




In [8]:
# Apply SMOTE to balance binary classes
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


#### Before Smoting

In [10]:


# AdaBoost setup with DecisionTree base
base_tree = DecisionTreeClassifier(max_depth=1)
adaboost = AdaBoostClassifier(estimator=base_tree, algorithm='SAMME', random_state=42)

param_dist = {
    'n_estimators': randint(50, 300),
    'learning_rate': uniform(0.01, 1.0)
}

# RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=adaboost,
    param_distributions=param_dist,
    n_iter=30,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring='accuracy'
)
random_search.fit(X_train, y_train)

# Best model
best_ada = random_search.best_estimator_
print(f"Best Hyperparameters: {random_search.best_params_}")

# Predictions on full feature set
y_train_pred = best_ada.predict(X_train)
y_test_pred = best_ada.predict(X_test)
print(f"\nTrain Accuracy (all features): {accuracy_score(y_train, y_train_pred):.4f}")
print(f"Test Accuracy (all features): {accuracy_score(y_test, y_test_pred):.4f}")

# Feature importance
importances = best_ada.feature_importances_
features = X_train.columns
feat_imp_df = pd.DataFrame({'Feature': features, 'Importance': importances}).sort_values(by='Importance', ascending=False)

# Test accuracy for different k
test_accuracies = []
for k in range(1, len(features) + 1):
    top_features = feat_imp_df['Feature'].iloc[:k].tolist()
    ada_top = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1),
                                 algorithm='SAMME',
                                 random_state=42, **random_search.best_params_)
    ada_top.fit(X_train[top_features], y_train)
    y_test_pred_k = ada_top.predict(X_test[top_features])
    acc = accuracy_score(y_test, y_test_pred_k)
    test_accuracies.append(acc)

best_k = np.argmax(test_accuracies) + 1
print(f"\nBest number of features: {best_k} with Test Accuracy: {test_accuracies[best_k - 1]:.4f}")

# Final AdaBoost with best_k features
best_features = feat_imp_df['Feature'].iloc[:best_k].tolist()
final_ada = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1),
                               algorithm='SAMME',
                               random_state=42, **random_search.best_params_)
final_ada.fit(X_train[best_features], y_train)

y_train_pred_best = final_ada.predict(X_train[best_features])
y_test_pred_best = final_ada.predict(X_test[best_features])

# Evaluation
print(f"\nTrain Accuracy (best features): {accuracy_score(y_train, y_train_pred_best):.4f}")
print(f"Test Accuracy (best features): {accuracy_score(y_test, y_test_pred_best):.4f}")
print("\nClassification Report (Test):")
print(classification_report(y_test, y_test_pred_best))
print("Confusion Matrix (Test):")
print(confusion_matrix(y_test, y_test_pred_best))

# RMSE
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred_best))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred_best))
print(f"Train RMSE: {train_rmse:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")


Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best Hyperparameters: {'learning_rate': 0.06808361216819946, 'n_estimators': 137}

Train Accuracy (all features): 0.7100
Test Accuracy (all features): 0.7100

Best number of features: 1 with Test Accuracy: 0.7100

Train Accuracy (best features): 0.7100
Test Accuracy (best features): 0.7100

Classification Report (Test):
              precision    recall  f1-score   support

           0       0.71      1.00      0.83        71
           1       0.00      0.00      0.00        29

    accuracy                           0.71       100
   macro avg       0.35      0.50      0.42       100
weighted avg       0.50      0.71      0.59       100

Confusion Matrix (Test):
[[71  0]
 [29  0]]
Train RMSE: 0.5385
Test RMSE: 0.5385


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### after Smoting 

In [12]:


# AdaBoost setup with DecisionTree base
base_tree = DecisionTreeClassifier(max_depth=1)
adaboost = AdaBoostClassifier(estimator=base_tree, algorithm='SAMME', random_state=42)

param_dist = {
    'n_estimators': randint(50, 300),
    'learning_rate': uniform(0.01, 1.0)
}

# RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=adaboost,
    param_distributions=param_dist,
    n_iter=30,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring='accuracy'
)
random_search.fit(X_train_resampled, y_train_resampled)

# Best model
best_ada = random_search.best_estimator_
print(f"Best Hyperparameters: {random_search.best_params_}")

# Predictions on full feature set
y_train_pred = best_ada.predict(X_train)
y_test_pred = best_ada.predict(X_test)
print(f"\nTrain Accuracy (all features): {accuracy_score(y_train, y_train_pred):.4f}")
print(f"Test Accuracy (all features): {accuracy_score(y_test, y_test_pred):.4f}")

# Feature importance
importances = best_ada.feature_importances_
features = X_train.columns
feat_imp_df = pd.DataFrame({'Feature': features, 'Importance': importances}).sort_values(by='Importance', ascending=False)

# Test accuracy for different k
test_accuracies = []
for k in range(1, len(features) + 1):
    top_features = feat_imp_df['Feature'].iloc[:k].tolist()
    ada_top = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1),
                                 algorithm='SAMME',
                                 random_state=42, **random_search.best_params_)
    ada_top.fit(X_train[top_features], y_train)
    y_test_pred_k = ada_top.predict(X_test[top_features])
    acc = accuracy_score(y_test, y_test_pred_k)
    test_accuracies.append(acc)

best_k = np.argmax(test_accuracies) + 1
print(f"\nBest number of features: {best_k} with Test Accuracy: {test_accuracies[best_k - 1]:.4f}")

# Final AdaBoost with best_k features
best_features = feat_imp_df['Feature'].iloc[:best_k].tolist()
final_ada = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1),
                               algorithm='SAMME',
                               random_state=42, **random_search.best_params_)
final_ada.fit(X_train[best_features], y_train)

y_train_pred_best = final_ada.predict(X_train[best_features])
y_test_pred_best = final_ada.predict(X_test[best_features])

# Evaluation
print(f"\nTrain Accuracy (best features): {accuracy_score(y_train, y_train_pred_best):.4f}")
print(f"Test Accuracy (best features): {accuracy_score(y_test, y_test_pred_best):.4f}")
print("\nClassification Report (Test):")
print(classification_report(y_test, y_test_pred_best))
print("Confusion Matrix (Test):")
print(confusion_matrix(y_test, y_test_pred_best))

# RMSE
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred_best))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred_best))
print(f"Train RMSE: {train_rmse:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")


Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best Hyperparameters: {'learning_rate': 0.7180725777960455, 'n_estimators': 199}

Train Accuracy (all features): 0.6550
Test Accuracy (all features): 0.6000

Best number of features: 1 with Test Accuracy: 0.7100

Train Accuracy (best features): 0.7300
Test Accuracy (best features): 0.7100

Classification Report (Test):
              precision    recall  f1-score   support

           0       0.71      1.00      0.83        71
           1       0.00      0.00      0.00        29

    accuracy                           0.71       100
   macro avg       0.35      0.50      0.42       100
weighted avg       0.50      0.71      0.59       100

Confusion Matrix (Test):
[[71  0]
 [29  0]]
Train RMSE: 0.5196
Test RMSE: 0.5385


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
