# IT24100967_Logistic_regression

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    RocCurveDisplay,
    roc_auc_score
)

# 1. Load Data
# The preprocessed CSV is read from the current working directory.
file_path = "preprocessed_stress_level_dataset.csv"
df = pd.read_csv(file_path)

# 2. Define Features (X) and Target (y)
target_column = 'stress_level'
y = df[target_column]
X = df.drop(columns=[target_column])

# One display label per distinct target value, in ascending order
class_names = [f'Class {c}' for c in sorted(y.unique())]

print(f"Dataset shape: {df.shape}")
print(f"Features: {len(X.columns)}, Target: {target_column}")
print(f"Target classes: {sorted(y.unique())}\n")

# 3. Split Data
# Hold out 20% for testing; stratify so both parts keep the class proportions of y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Training set: {X_train.shape}, Test set: {X_test.shape}\n")

# 4. Helper function to evaluate models
# This avoids repeating the same print/plot code for every model
def evaluate_model(model, X_test, y_test, model_name):
    """Print test-set metrics for a fitted classifier and plot its confusion matrix.

    Reads the module-level ``class_names`` list for the report and plot labels.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Held-out feature matrix.
        y_test: True labels for ``X_test``.
        model_name: Display name used in the printed header, the plot title and
            the file name of the saved plot.

    Returns:
        Tuple ``(accuracy, y_prob)``: the test accuracy and the class
        probabilities returned by ``model.predict_proba(X_test)``.
    """
    import re  # function-scope import keeps this cell self-contained

    print(f"--- Evaluation for: {model_name} ---")

    # Get predictions
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)

    # 1. Accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"1. Accuracy: {accuracy:.4f}")

    # 2. Classification Report (Precision, Recall, F1-Score)
    print("\n2. Classification Report:")
    print(classification_report(y_test, y_pred, target_names=class_names))

    # 3. AUC Score
    auc = roc_auc_score(y_test, y_prob, multi_class='ovr')
    print(f"3. AUC Score (One-vs-Rest): {auc:.4f}")

    # 4. Confusion Matrix
    print("\n4. Confusion Matrix:")
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    disp.plot(cmap=plt.cm.Blues)
    plt.title(f"Confusion Matrix - {model_name}")

    # Model names contain ':', '(', ',' and '=', which are not valid in file names
    # on every platform; collapse every run of such characters into one underscore.
    slug = re.sub(r'[^a-z0-9]+', '_', model_name.lower()).strip('_')
    plt.savefig(f"cm_{slug}.png")  # Save before show(), which may clear the figure
    plt.show()  # Display the plot

    print("--------------------------------------------------\n")
    return accuracy, y_prob


# To store results for final comparison (label -> test accuracy)
model_accuracies = {}

In [None]:
# --- Variation 1: Baseline Model (Default Parameters) ---
# Relies on scikit-learn's defaults: 'l2' penalty, C=1.0 and the 'lbfgs' solver.
# `multi_class` is intentionally not passed: 'auto' was already the default and the
# parameter is deprecated since scikit-learn 1.5 (passing it emits a FutureWarning).
model_1 = LogisticRegression(random_state=42, max_iter=2000)
model_1.fit(X_train, y_train)
acc1, prob1 = evaluate_model(model_1, X_test, y_test, "Variation 1: Baseline (Defaults)")
model_accuracies["Baseline"] = acc1

In [None]:
# --- Variation 2: L1 (Lasso) Regularization ---
# L1 can be used for feature selection. Must use a compatible solver like 'saga'.
# We'll use C=1.0 (standard regularization strength).
# `multi_class` is intentionally not passed: 'auto' was already the default and the
# parameter is deprecated since scikit-learn 1.5 (passing it emits a FutureWarning).
model_2 = LogisticRegression(
    random_state=42,
    max_iter=2000,
    penalty='l1',
    C=1.0,
    solver='saga'
)
model_2.fit(X_train, y_train)
acc2, prob2 = evaluate_model(model_2, X_test, y_test, "Variation 2: L1 (Lasso), C=1.0")
model_accuracies["L1 (Lasso), C=1.0"] = acc2

In [None]:
# --- Variation 3: L2 (Ridge) Regularization (Stronger) ---
# L2 is the default penalty, but we'll use a smaller C (C=0.1).
# A smaller C value means *stronger* regularization.
# `multi_class` is intentionally not passed: 'auto' was already the default and the
# parameter is deprecated since scikit-learn 1.5 (passing it emits a FutureWarning).
model_3 = LogisticRegression(
    random_state=42,
    max_iter=2000,
    penalty='l2',
    C=0.1,
    solver='saga'  # Same solver as Variation 2, so only the penalty and C differ
)
model_3.fit(X_train, y_train)
acc3, prob3 = evaluate_model(model_3, X_test, y_test, "Variation 3: L2 (Ridge), C=0.1")
model_accuracies["L2 (Ridge), C=0.1"] = acc3

In [None]:
# --- Variation 4: GridSearchCV Tuned Model ---
print("\n--- Starting Variation 4: GridSearchCV Tuning ---")
# Define the parameter grid to search: 4 values of C x 2 penalties = 8 candidates
param_grid = {
    'C': [0.1, 1, 10, 50],
    'penalty': ['l1', 'l2'],
    'solver': ['saga']  # 'saga' handles both l1 and l2
}
# Base model for grid search.
# `multi_class` is intentionally not passed: 'auto' was already the default and the
# parameter is deprecated since scikit-learn 1.5 (passing it emits a FutureWarning).
lr_base = LogisticRegression(random_state=42, max_iter=2000)

# Set up GridSearch with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=lr_base,
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='accuracy',
    n_jobs=-1,  # Use all available cores
    verbose=1
)

# Train the grid search (only the training split is used for model selection)
grid_search.fit(X_train, y_train)

# Get the best model (refit on the whole training split by GridSearchCV)
best_model = grid_search.best_estimator_

print(f"\nGridSearchCV Best Parameters: {grid_search.best_params_}")
print(f"GridSearchCV Best CV Accuracy: {grid_search.best_score_:.4f}\n")

# Evaluate the best model found by GridSearchCV
acc4, prob4 = evaluate_model(best_model, X_test, y_test, "Variation 4: GridSearchCV-Tuned")
model_accuracies["GridSearchCV-Tuned"] = acc4

In [None]:
# --- 6. Final Comparison ---

# Print the test accuracy of every variation stored in model_accuracies, best first
print("\n--- Final Model Accuracy Comparison ---")
print(pd.Series(model_accuracies).sort_values(ascending=False))



# IT24104387_KNN

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay
)

# 1. Load Data
# The preprocessed CSV is read from the current working directory.
file_name = 'preprocessed_stress_level_dataset.csv'
df = pd.read_csv(file_name)

# 2. Define Features (X) and Target (y)
y = df['stress_level']
X = df.drop(columns=['stress_level'])

# One display label per distinct target value, in ascending order
class_names = [f'Class {c}' for c in sorted(y.unique())]

# 3. Split Data
# Hold out 20% for testing; stratify so both parts keep the class proportions of y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Training set: {X_train.shape}, Test set: {X_test.shape}\n")

In [None]:
# Helper function to evaluate models
def evaluate_model(model, X_test, y_test, model_name):
    """Print accuracy and a classification report, then show the confusion matrix.

    Reads the module-level ``class_names`` list for the report and plot labels.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Held-out feature matrix.
        y_test: True labels for ``X_test``.
        model_name: Display name used in the printed header and the plot title.

    Returns:
        The accuracy of ``model`` on the test data.
    """
    print(f"--- Evaluation for: {model_name} ---")

    predictions = model.predict(X_test)

    # 1. Accuracy
    accuracy = accuracy_score(y_test, predictions)
    print(f"1. Accuracy: {accuracy:.4f}")

    # 2. Classification Report
    print("\n2. Classification Report:")
    report = classification_report(y_test, predictions, target_names=class_names)
    print(report)

    # 3. Confusion Matrix
    print("\n3. Confusion Matrix:")
    matrix_plot = ConfusionMatrixDisplay(
        confusion_matrix=confusion_matrix(y_test, predictions),
        display_labels=class_names,
    )
    matrix_plot.plot(cmap=plt.cm.Blues)
    plt.title(f"Confusion Matrix - {model_name}")
    plt.show()

    print("--------------------------------------------------\n")
    return accuracy

In [None]:
# Test accuracy of each variation, keyed by a short label, for the final comparison
model_accuracies = {}

# --- Variation 1: KNN (k=3, uniform weights) ---
# fit() returns the estimator itself, so construction and training can be chained.
model_1 = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
acc1 = evaluate_model(model_1, X_test, y_test, "Variation 1: KNN (k=3)")
model_accuracies["KNN (k=3)"] = acc1

# --- Variation 2: KNN (k=5, uniform weights) ---
# k=5 is the scikit-learn default neighbourhood size.
model_2 = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
acc2 = evaluate_model(model_2, X_test, y_test, "Variation 2: KNN (k=5)")
model_accuracies["KNN (k=5)"] = acc2

# --- Variation 3: KNN (k=3, distance weights) ---
# Same k as Variation 1, but closer neighbours get a larger vote.
model_3 = KNeighborsClassifier(weights='distance', n_neighbors=3).fit(X_train, y_train)
acc3 = evaluate_model(model_3, X_test, y_test, "Variation 3: KNN (k=3, weights='distance')")
model_accuracies["KNN (k=3, distance)"] = acc3

In [None]:
# --- Final Comparison ---

print("\n--- Final Model Accuracy Comparison ---")
# Convert the label -> accuracy dictionary to a pandas Series and sort it so the
# most accurate variation is printed first
accuracy_summary = pd.Series(model_accuracies).sort_values(ascending=False)
print(accuracy_summary)

# IT24100479_DecisionTree

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.decomposition import PCA
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
)
from sklearn.datasets import make_classification  # Not used, but for potential extensions
import seaborn as sns

In [None]:
# Load the preprocessed dataset from the current working directory
df = pd.read_csv("preprocessed_stress_level_dataset.csv")
print(f"Dataset shape: {df.shape}")
# Number of rows per stress level, to check class balance before splitting
print(f"Class distribution:\n{df['stress_level'].value_counts()}")

In [None]:
# Size of testing and training
def prepare_data(df):
    """Split the dataset into a feature matrix and a target vector.

    Features are selected by name (every column except ``stress_level``) rather
    than by position, so the target can never end up among the features when it
    is not the last column of ``df``.

    Args:
        df: DataFrame containing a ``stress_level`` column.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: the feature values in column order and
        the ``stress_level`` values.
    """
    target = 'stress_level'
    X = df.drop(columns=target).values  # Features: all non-target columns
    y = df[target].values  # Target
    return X, y

def split_data(X, y, test_size=0.2, random_state=42):
    """Make a stratified train/test split and print the size of each part.

    Args:
        X: Feature matrix.
        y: Target vector; also used for stratification.
        test_size: Share of the samples held out for testing.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(
        X,
        y,
        stratify=y,
        test_size=test_size,
        random_state=random_state,
    )
    X_train, X_test, y_train, y_test = parts
    print(f"Training set size: {X_train.shape[0]}")
    print(f"Test set size: {X_test.shape[0]}")
    return X_train, X_test, y_train, y_test

X, y = prepare_data(df)
X_train, X_test, y_train, y_test = split_data(X, y)

In [None]:
# Real column names for the importance table and the tree plot (every column except
# the target), instead of anonymous 'feature_i' placeholders.
# NOTE(review): assumes X holds exactly the non-target columns in file order — true
# when 'stress_level' is the last column of the CSV; confirm against the dataset.
feature_names = [str(col) for col in df.drop(columns='stress_level').columns]
class_names = ['0', '1', '2']  # Stress levels
target_names = ['Low Stress', 'Medium Stress', 'High Stress']

In [None]:
# Train Decision Tree Model
from sklearn.tree import DecisionTreeClassifier

# Unconstrained tree (no depth limit); the fixed seed makes the fit reproducible.
# fit() returns the estimator itself, so construction and training are chained.
model_dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Depth reached by the fully grown tree
print(f"Max Depth: {model_dt.get_depth()}")

# Importance of each feature, labelled with feature_names
importances = pd.Series(
    data=model_dt.feature_importances_,
    index=feature_names,
)
print("Feature Importance:")
print(importances)

In [None]:
# Evaluate Model Performance
# Class predictions for the held-out test set
y_pred_dt = model_dt.predict(X_test)

# Macro averaging gives every class the same weight regardless of its size
accuracy = accuracy_score(y_test, y_pred_dt)
precision = precision_score(y_test, y_pred_dt, average='macro')
recall = recall_score(y_test, y_pred_dt, average='macro')
f1 = f1_score(y_test, y_pred_dt, average='macro')
# ROC AUC needs class probabilities rather than hard predictions
roc_auc = roc_auc_score(
    y_test,
    model_dt.predict_proba(X_test),
    average='macro',
    multi_class='ovr',
)

print("Decision Tree Metrics:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print(f"ROC AUC: {roc_auc}")

In [None]:
# Draw the fitted tree: one box per node, coloured by its majority class
plt.figure(figsize=(12, 8))
plot_tree(model_dt, filled=True, class_names=class_names, feature_names=feature_names)
plt.title("Decision Tree Structure")
plt.show()

In [None]:
# Project the features onto their first two principal components so the decision
# boundary can be drawn in a plane. PCA is fitted on the training data only.
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# A separate tree trained on the 2D projection; it only serves this visualisation
model_dt_pca = DecisionTreeClassifier(random_state=42)
model_dt_pca.fit(X_train_pca, y_train)

# Grid (step 0.1) covering the projected test points with a margin of 1 on each side
x_min = X_test_pca[:, 0].min() - 1
x_max = X_test_pca[:, 0].max() + 1
y_min = X_test_pca[:, 1].min() - 1
y_max = X_test_pca[:, 1].max() + 1
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, 0.1),
    np.arange(y_min, y_max, 0.1),
)

# Predicted class for every grid point, reshaped back to the grid layout
Z = model_dt_pca.predict(np.column_stack([xx.ravel(), yy.ravel()])).reshape(xx.shape)

In [None]:
# Shade the predicted regions and overlay the test points coloured by true class
plt.figure(figsize=(10, 8))
plt.contourf(xx, yy, Z, cmap='coolwarm', alpha=0.3)
scatter = plt.scatter(
    X_test_pca[:, 0],
    X_test_pca[:, 1],
    c=y_test,
    edgecolor='k',
    cmap='coolwarm',
)
plt.title("Decision Tree Decision Boundary (PCA)")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.colorbar(scatter)
plt.show()

In [None]:
# Confusion matrix of the full-feature tree on the test set, with readable labels
cm = confusion_matrix(y_test, y_pred_dt)
disp = ConfusionMatrixDisplay(cm, display_labels=target_names)
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Experiment with Parameters and Analyze Behavior
# Train one tree per max_depth value and record its test-set metrics, to see how
# limiting the depth changes performance.
# Note: precision, recall, f1 and roc_auc are rebound on every iteration, so after
# this cell they hold the values of the last depth.

depths = [1, 3, 10]
results = []

for depth in depths:
    # Depth-limited tree with the same seed as the unconstrained model
    model_dt_exp = DecisionTreeClassifier(random_state=42, max_depth=depth)
    model_dt_exp.fit(X_train, y_train)
    y_pred_exp = model_dt_exp.predict(X_test)

    # Macro-averaged metrics on the test set
    acc = accuracy_score(y_test, y_pred_exp)
    precision = precision_score(y_test, y_pred_exp, average='macro')
    recall = recall_score(y_test, y_pred_exp, average='macro')
    f1 = f1_score(y_test, y_pred_exp, average='macro')
    roc_auc = roc_auc_score(
        y_test,
        model_dt_exp.predict_proba(X_test),
        average='macro',
        multi_class='ovr',
    )

    print(f"Decision Tree (max_depth={depth}) Metrics:")
    print(f"Accuracy: {acc}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-Score: {f1}")
    print(f"ROC AUC: {roc_auc}")

    # One row per depth for the summary table
    results.append({
        'Max Depth': depth,
        'Accuracy': acc,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'ROC AUC': roc_auc,
    })

In [None]:
# Print a summary table of results: one row per max_depth value, one column per metric
results_df = pd.DataFrame(results)
print("\nSummary Table:")
print(results_df)

# IT24100821_SVM

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    confusion_matrix, roc_auc_score
)
from sklearn.preprocessing import LabelBinarizer
import matplotlib.pyplot as plt
import seaborn as sns
import uuid
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [None]:
data = pd.read_csv('preprocessed_stress_level_dataset.csv')

In [None]:
# Select the features by name instead of by position, so the target can never end up
# among the features if 'stress_level' is not the last column of the CSV.
X = data.drop(columns='stress_level')
y = data['stress_level']

# Split the dataset: 80% train / 20% test. stratify=y keeps the class proportions
# identical in both parts, matching the split used by the other model notebooks.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Hyperparameter grids, one per SVM variation. C is searched for all three;
# gamma is searched only for the third variation.
param_grid_linear = {'C': [0.1, 1, 10]}
param_grid_rbf_scale = {'C': [0.1, 1, 10]}
param_grid_rbf_gamma = {'C': [0.1, 1, 10], 'gamma': [0.01, 0.1, 1]}

In [None]:
# Shared 5-fold stratified splitter, so every variation is tuned on the same folds
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# probability=True is required for predict_proba, which the AUC computation uses.
# GridSearchCV.fit() returns the search object, so setup and fitting are chained.

# Linear SVM
model_linear = SVC(kernel='linear', probability=True, random_state=42)
grid_search_linear = GridSearchCV(
    estimator=model_linear,
    param_grid=param_grid_linear,
    scoring='accuracy',
    cv=cv,
).fit(X_train, y_train)

# RBF SVM with gamma='scale'
model_rbf_scale = SVC(kernel='rbf', gamma='scale', probability=True, random_state=42)
grid_search_rbf_scale = GridSearchCV(
    estimator=model_rbf_scale,
    param_grid=param_grid_rbf_scale,
    scoring='accuracy',
    cv=cv,
).fit(X_train, y_train)

# RBF SVM with specified gamma (the value comes from the grid)
model_rbf_gamma = SVC(kernel='rbf', probability=True, random_state=42)
grid_search_rbf_gamma = GridSearchCV(
    estimator=model_rbf_gamma,
    param_grid=param_grid_rbf_gamma,
    scoring='accuracy',
    cv=cv,
).fit(X_train, y_train)

In [None]:
# Get the best models and their parameters
best_linear_model = grid_search_linear.best_estimator_
best_linear_params = grid_search_linear.best_params_
print(f"Best parameters for Linear SVM: {best_linear_params}")

best_rbf_scale_model = grid_search_rbf_scale.best_estimator_
best_rbf_scale_params = grid_search_rbf_scale.best_params_
print(f"Best parameters for RBF SVM (gamma='scale'): {best_rbf_scale_params}")

best_rbf_gamma_model = grid_search_rbf_gamma.best_estimator_
best_rbf_gamma_params = grid_search_rbf_gamma.best_params_
print(f"Best parameters for RBF SVM (specified gamma): {best_rbf_gamma_params}")

# Binarise the test labels once; every model's AUC is computed against the same
# indicator matrix.
lb = LabelBinarizer()
y_test_bin = lb.fit_transform(y_test)


def _evaluate_on_test(model, label):
    """Score one fitted SVM on the held-out test set.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        label: Display name stored under the ``'model'`` key.

    Returns:
        Dict with keys ``model``, ``accuracy``, ``precision``, ``recall``, ``f1``
        (the last three weighted by class support), ``auc`` and
        ``confusion_matrix``.
    """
    y_pred = model.predict(X_test)
    return {
        'model': label,
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average='weighted'),
        'recall': recall_score(y_test, y_pred, average='weighted'),
        'f1': f1_score(y_test, y_pred, average='weighted'),
        'auc': roc_auc_score(y_test_bin, model.predict_proba(X_test), multi_class='ovr'),
        'confusion_matrix': confusion_matrix(y_test, y_pred),
    }


# Evaluate the best models on the test set, in the order they were tuned
best_models_results = [
    _evaluate_on_test(
        best_linear_model,
        f"Linear SVM (tuned: {best_linear_params})",
    ),
    _evaluate_on_test(
        best_rbf_scale_model,
        f"RBF SVM (gamma='scale', tuned: {best_rbf_scale_params})",
    ),
    _evaluate_on_test(
        best_rbf_gamma_model,
        f"RBF SVM (specified gamma, tuned: {best_rbf_gamma_params})",
    ),
]

# Print results
print("\nEvaluation of Best Tuned Models on Test Set:")
for result in best_models_results:
    print(f"\nModel: {result['model']}")
    print(f"Accuracy: {result['accuracy']:.4f}")
    print(f"Precision: {result['precision']:.4f}")
    print(f"Recall: {result['recall']:.4f}")
    print(f"F1 Score: {result['f1']:.4f}")
    print(f"AUC: {result['auc']:.4f}")
    print("Confusion Matrix:")
    print(result['confusion_matrix'])

In [None]:
# Report every tuned model again and save a confusion-matrix heatmap for each one.
# best_models_results was filled in by the previous cell.

print("\nComparison of Best Tuned Model Variations:")
for result in best_models_results:
    print(f"\nModel: {result['model']}")
    print(f"Accuracy: {result['accuracy']:.4f}")
    print(f"Precision: {result['precision']:.4f}")
    print(f"Recall: {result['recall']:.4f}")
    print(f"F1 Score: {result['f1']:.4f}")
    print(f"AUC: {result['auc']:.4f}")
    print("Confusion Matrix:")
    print(result['confusion_matrix'])

    # Heatmap of the confusion matrix; a fresh UUID in the file name keeps the saved
    # images from overwriting one another
    artifact_id = str(uuid.uuid4())
    plt.figure(figsize=(8, 6))
    sns.heatmap(result['confusion_matrix'], cmap='Blues', annot=True, fmt='d')
    plt.title(f'Confusion Matrix: {result["model"]}')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.savefig(f'confusion_matrix_tuned_{artifact_id}.png')
    plt.show()


# Pick the entry with the highest test accuracy (the first one wins a tie)
best_overall_model = max(best_models_results, key=lambda entry: entry['accuracy'])

print(f"\nOverall Best Model Based on Accuracy: {best_overall_model['model']}")
print(f"Accuracy: {best_overall_model['accuracy']:.4f}")

# IT24100890_Random_forest

In [None]:
# --- 1. Import Required Libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
import time

# --- 2. Load and Split the Dataset ---

# Read the preprocessed data from the current working directory
df = pd.read_csv('preprocessed_stress_level_dataset.csv')

# Target (y) is the 'stress_level' column; features (X) are all the others
y = df['stress_level']
X = df.drop(columns=['stress_level'])

# Column names of X, kept for the feature-importance table
feature_names = X.columns.tolist()

# 80% training / 20% testing; stratify=y keeps the class distribution the same
# in the train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Original Dataset Shape: {df.shape}")
print(f"Features: {X.shape[1]}, Target: {y.name}")
print(f"Target classes: {np.unique(y)}")
print(f"Training Set Shape: {X_train.shape}, Test Set Shape: {X_test.shape}")

# Test accuracy of each variation, keyed by label, for the final comparison
model_accuracies = {}

In [None]:
# --- 3. Model Variation 1: Baseline Random Forest (Default Parameters) ---
print("--- Training Variation 1: Baseline Random Forest ---")
start_time = time.time()

# Initialize the model with random_state for reproducibility
rf_base = RandomForestClassifier(random_state=42)

# Train the model
rf_base.fit(X_train, y_train)

# Stop the clock right after fitting, so the reported training time does not
# include prediction and metric computation.
train_time_base = time.time() - start_time

# Make predictions (hard labels for accuracy, probabilities for AUC)
y_pred_base = rf_base.predict(X_test)
y_prob_base = rf_base.predict_proba(X_test)

# Evaluate the model
acc_base = accuracy_score(y_test, y_pred_base)
model_accuracies['Baseline (Defaults)'] = acc_base
auc_base = roc_auc_score(y_test, y_prob_base, multi_class='ovr')

print(f"Baseline Model Training Time: {train_time_base:.2f} seconds")
print(f"Baseline Model Test Accuracy: {acc_base:.6f}")
print(f"Baseline Model Test AUC Score (OvR): {auc_base:.6f}")
print("\nClassification Report (Baseline):")
print(classification_report(y_test, y_pred_base))

# Plot Confusion Matrix, with the class values as tick labels
cm_base = confusion_matrix(y_test, y_pred_base)
plt.figure(figsize=(6, 4))
sns.heatmap(cm_base, annot=True, fmt='d', cmap='Blues',
            xticklabels=np.unique(y), yticklabels=np.unique(y))
plt.title('Confusion Matrix - Baseline Random Forest')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

print("--------------------------------------------------\n")

In [None]:
# --- 4. Model Variation 2: Manually Tuned (Deeper Trees) ---
print("--- Training Variation 2: Manually Tuned (n_estimators=200, max_depth=20) ---")
start_time = time.time()

# Initialize the model with more trees and a defined max depth
rf_manual = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42, n_jobs=-1)

# Train the model
rf_manual.fit(X_train, y_train)

# Stop the clock right after fitting, so the reported training time does not
# include prediction and metric computation.
train_time_manual = time.time() - start_time

# Make predictions (hard labels for accuracy, probabilities for AUC)
y_pred_manual = rf_manual.predict(X_test)
y_prob_manual = rf_manual.predict_proba(X_test)

# Evaluate the model
acc_manual = accuracy_score(y_test, y_pred_manual)
model_accuracies['Manual (n=200, depth=20)'] = acc_manual
auc_manual = roc_auc_score(y_test, y_prob_manual, multi_class='ovr')

print(f"Manual Tune Model Training Time: {train_time_manual:.2f} seconds")
print(f"Manual Tune Model Test Accuracy: {acc_manual:.6f}")
print(f"Manual Tune Model Test AUC Score (OvR): {auc_manual:.6f}")
print("\nClassification Report (Manual Tune):")
print(classification_report(y_test, y_pred_manual))

# Plot Confusion Matrix, with the class values as tick labels
cm_manual = confusion_matrix(y_test, y_pred_manual)
plt.figure(figsize=(6, 4))
sns.heatmap(cm_manual, annot=True, fmt='d', cmap='Greens',
            xticklabels=np.unique(y), yticklabels=np.unique(y))
plt.title('Confusion Matrix - Manual Tune Random Forest')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

print("--------------------------------------------------\n")

In [None]:
# --- 5. Model Variation 3: GridSearchCV Tuned Model ---
print("--- Training Variation 3: GridSearchCV Tuned ---")
start_time = time.time()

# Search space: 2 x 3 x 2 x 2 = 24 candidates, kept small so the search stays quick
param_grid = {
    'n_estimators': [100, 150],      # Number of trees
    'max_depth': [10, 20, None],     # Max depth of trees (None = unlimited)
    'min_samples_split': [2, 5],     # Min samples to split a node
    'min_samples_leaf': [1, 2]       # Min samples at a leaf node
}

# 5-fold cross-validated search on accuracy, using all available CPU cores
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Only the training split takes part in model selection
grid_search.fit(X_train, y_train)

print(f"\nGridSearchCV Training Time: {time.time() - start_time:.2f} seconds")

# Best candidate, refit by GridSearchCV on the whole training split
best_rf = grid_search.best_estimator_
print(f"Best Parameters Found by GridSearchCV:\n{grid_search.best_params_}")

# Hard labels for accuracy, probabilities for AUC
y_pred_grid = best_rf.predict(X_test)
y_prob_grid = best_rf.predict_proba(X_test)

# Test-set metrics of the tuned model
acc_grid = accuracy_score(y_test, y_pred_grid)
model_accuracies['GridSearchCV Tuned'] = acc_grid
auc_grid = roc_auc_score(y_test, y_prob_grid, multi_class='ovr')

print(f"\nGridSearchCV Tuned Model Test Accuracy: {acc_grid:.6f}")
print(f"GridSearchCV Tuned Model Test AUC Score (OvR): {auc_grid:.6f}")
print("\nClassification Report (GridSearchCV Tuned):")
print(classification_report(y_test, y_pred_grid))

# Confusion-matrix heatmap, with the class values as tick labels
cm_grid = confusion_matrix(y_test, y_pred_grid)
plt.figure(figsize=(6, 4))
sns.heatmap(
    cm_grid,
    cmap='Oranges',
    annot=True,
    fmt='d',
    xticklabels=np.unique(y),
    yticklabels=np.unique(y),
)
plt.title('Confusion Matrix - GridSearchCV Random Forest')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

print("--------------------------------------------------\n")

In [None]:
# --- 6. Final Comparison ---

# Print a summary of accuracies, best first
print("\n--- Final Model Accuracy Comparison ---")
accuracy_series = pd.Series(model_accuracies).sort_values(ascending=False)
print(accuracy_series)

# Feature importances of the model that actually scored best. The keys below are
# the labels used in model_accuracies, so the heading and the plotted importances
# always refer to the same model.
trained_models = {
    'Baseline (Defaults)': rf_base,
    'Manual (n=200, depth=20)': rf_manual,
    'GridSearchCV Tuned': best_rf,
}
best_model_name = accuracy_series.index[0]
print(f"\n--- Feature Importances (from {best_model_name} model) ---")
best_model_for_features = trained_models[best_model_name]

importances = best_model_for_features.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print(feature_importance_df.head(10))

# Horizontal bar chart of the ten most important features
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(10))
plt.title('Top 10 Feature Importances')
plt.show()

# IT24100307_MLP

In [None]:
!pip install -q tensorflow scikit-learn pandas matplotlib seaborn
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks
print("TensorFlow version:", tf.__version__)

df = pd.read_csv('preprocessed_stress_level_dataset.csv')
df.head()

In [None]:
# Target is the 'stress_level' column; every other column is a feature
y = df['stress_level']
X = df.drop(columns=['stress_level'])

# Stratified 80/20 train / test split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Feature Scaling: the scaler is fitted on the training data only and then applied
# unchanged to the test data, so no test statistics leak into training
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("X_train shape:", X_train_scaled.shape, "X_test shape:", X_test_scaled.shape)

In [None]:
# Train a basic MLP model with one hidden layer, reduced neurons, and less epochs

# build_mlp is called by this and later cells but is not defined in any cell of this
# notebook, so a fresh "Run All" stops here with a NameError. Define a fallback only
# when no definition is in scope.
# NOTE(review): reconstructed from how the function is called and how its output is
# consumed in this notebook (integer labels, 'accuracy' history keys, a single
# sigmoid unit when n_classes == 2) — replace with the original definition if it
# is available.
if 'build_mlp' not in globals():
    def build_mlp(input_dim, n_classes, hidden_layers, dropout_rate=0.0, lr=1e-3):
        """Build and compile a fully connected classifier.

        Args:
            input_dim: Number of input features.
            n_classes: Number of target classes.
            hidden_layers: Sizes of the hidden Dense layers, in order.
            dropout_rate: Dropout applied after each hidden layer (0 disables it).
            lr: Learning rate of the Adam optimizer.

        Returns:
            A compiled ``keras.Sequential`` model that tracks accuracy.
        """
        model = keras.Sequential()
        model.add(keras.Input(shape=(input_dim,)))
        for units in hidden_layers:
            model.add(layers.Dense(units, activation='relu'))
            if dropout_rate > 0:
                model.add(layers.Dropout(dropout_rate))
        if n_classes == 2:
            # One sigmoid unit: the evaluation cells threshold its output at 0.5
            model.add(layers.Dense(1, activation='sigmoid'))
            loss = 'binary_crossentropy'
        else:
            # Softmax over classes; the sparse loss accepts integer labels directly
            model.add(layers.Dense(n_classes, activation='softmax'))
            loss = 'sparse_categorical_crossentropy'
        model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=lr),
            loss=loss,
            metrics=['accuracy'],
        )
        return model

# Network dimensions derived from the prepared data
input_dim = X_train_scaled.shape[1]
n_classes = len(np.unique(y_train))

model_reduced = build_mlp(input_dim=input_dim,
                          n_classes=n_classes,
                          hidden_layers=[16],
                          dropout_rate=0.2,
                          lr=1e-3)

# Train the model with fewer epochs; 15% of the training data is held out by Keras
# for validation
history_reduced = model_reduced.fit(
    X_train_scaled, y_train,
    validation_split=0.15,
    epochs=20,
    batch_size=32,
    verbose=2
)

model_reduced.summary()

In [None]:
# Learning curves of the reduced model: training vs. validation, one figure each
history_reduced_df = pd.DataFrame(history_reduced.history)

history_reduced_df.plot(y=['loss', 'val_loss'], title='Loss (Reduced Model)', figsize=(8, 4))
plt.xlabel('Epoch')
plt.grid(True)
plt.show()

history_reduced_df.plot(y=['accuracy', 'val_accuracy'], title='Accuracy (Reduced Model)', figsize=(8, 4))
plt.xlabel('Epoch')
plt.grid(True)
plt.show()

In [None]:
# Evaluate the reduced model on the test set
# Turn the network output into class labels
if n_classes != 2:
    # Multi-class: one probability per class, pick the most likely one
    y_proba = model_reduced.predict(X_test_scaled)
    y_pred = np.argmax(y_proba, axis=1)
else:
    # Binary: a single probability per sample, thresholded at 0.5
    y_proba = model_reduced.predict(X_test_scaled).ravel()
    y_pred = (y_proba >= 0.5).astype(int)

print("Test accuracy (Reduced Model):", accuracy_score(y_test, y_pred))
print("\nClassification Report (Reduced Model):\n", classification_report(y_test, y_pred))

# Confusion-matrix heatmap (rows: true class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, cmap='Blues', annot=True, fmt='d')
plt.title('Confusion Matrix (Reduced Model)')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

In [None]:
# Cell 6: Training
# Larger network: two hidden layers (128 and 64 units) with 25% dropout
input_dim = X_train_scaled.shape[1]
n_classes = len(np.unique(y_train))

model = build_mlp(input_dim=input_dim,
                  n_classes=n_classes,
                  hidden_layers=[128,64],
                  dropout_rate=0.25, lr=1e-3)

# Callbacks: stop once validation loss has not improved for 10 epochs (restoring the
# best weights), and halve the learning rate after 5 epochs without improvement
early_stop = callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)

history = model.fit(
    X_train_scaled, y_train,
    validation_split=0.15,
    epochs=100,
    batch_size=32,
    callbacks=[early_stop, reduce_lr],
    verbose=2
)

# Summarise the network trained in this cell (not the earlier reduced model)
model.summary()

In [None]:
# Cell 7: Plot training history
# Learning curves of the larger model: training vs. validation, one figure each
history_df = pd.DataFrame(history.history)

history_df.plot(y=['loss', 'val_loss'], title='Loss', figsize=(8, 4))
plt.xlabel('Epoch')
plt.grid(True)
plt.show()

history_df.plot(y=['accuracy', 'val_accuracy'], title='Accuracy', figsize=(8, 4))
plt.xlabel('Epoch')
plt.grid(True)
plt.show()


In [None]:
# Cell 8: Evaluation
# Turn the network output into class labels
if n_classes != 2:
    # Multi-class: one probability per class, pick the most likely one
    y_proba = model.predict(X_test_scaled)
    y_pred = np.argmax(y_proba, axis=1)
else:
    # Binary: a single probability per sample, thresholded at 0.5
    y_proba = model.predict(X_test_scaled).ravel()
    y_pred = (y_proba >= 0.5).astype(int)

print("Test accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion-matrix heatmap (rows: true class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, cmap='Blues', annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()


In [None]:
# Manual grid over a few hyperparameters (fast)
#
# Trains one MLP per (hidden layout, dropout) combination with early stopping,
# records its training history and test accuracy, and keeps the most accurate
# model in `best_model`.
#
# NOTE(review): the winner is picked by *test* accuracy, so any test score
# later reported for `best_model` is optimistically biased. Consider selecting
# on validation accuracy instead and keeping the test set for the final report.
results = []
histories = {}     # training history dict per configuration, keyed by its label
best_acc = 0       # best test accuracy seen so far
best_model = None  # model that achieved best_acc

hidden_options = [[64, 32], [128, 64], [256, 128]]
dropouts = [0.0, 0.2]
for hidden in hidden_options:
    for dp in dropouts:
        key = f"Hidden: {hidden}, Dropout: {dp}"
        print(f"Training: {key}")
        m = build_mlp(input_dim=input_dim, n_classes=n_classes, hidden_layers=hidden, dropout_rate=dp, lr=1e-3)
        h = m.fit(X_train_scaled, y_train, validation_split=0.12, epochs=40, batch_size=32, verbose=0,
                  callbacks=[callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)])
        histories[key] = h.history  # keep the plain dict so it can be plotted later

        # Quick evaluation on the test set.
        if n_classes == 2:
            yp = (m.predict(X_test_scaled).ravel() >= 0.5).astype(int)
        else:
            yp = np.argmax(m.predict(X_test_scaled), axis=1)
        acc = accuracy_score(y_test, yp)
        print("Acc:", acc)
        results.append({"hidden": hidden, "dropout": dp, "acc": acc})

        # Track the best model so far. The `is None` test guarantees that a
        # model is always selected, even if no accuracy exceeds the initial 0.
        if best_model is None or acc > best_acc:
            best_acc = acc
            best_model = m

display(pd.DataFrame(results).sort_values('acc', ascending=False))

# Summarise the configuration that won the grid search.
if best_model is not None:
    best_model.summary()

In [None]:
# Plot loss and accuracy curves side by side for every hyperparameter
# combination whose training history was stored by the tuning cell.
if 'histories' in locals() and histories:
    for config_label, config_history in histories.items():
        history_df = pd.DataFrame(config_history)

        print(f"Plotting history for: {config_label}")

        # One row, two panels: loss on the left, accuracy on the right.
        fig, axes = plt.subplots(1, 2, figsize=(10, 3))
        panel_specs = (
            ("Loss", ['loss', 'val_loss']),
            ("Accuracy", ['accuracy', 'val_accuracy']),
        )
        for panel_ax, (panel_name, panel_columns) in zip(axes, panel_specs):
            history_df[panel_columns].plot(ax=panel_ax)
            panel_ax.set_title(f"{panel_name} - {config_label}")
            panel_ax.set_xlabel('Epoch')
            panel_ax.grid(True)

        plt.tight_layout()  # keep titles and axis labels from overlapping
        plt.show()
else:
    print("Error: Training histories not found. Please run the hyperparameter tuning cell first.")

In [None]:
# Evaluate the best model on the test set and show confusion matrix / classification report

# `best_model` comes from the hyperparameter tuning cell. That cell initialises
# it to None, so an existence check alone is not enough: if tuning was
# interrupted before a model was selected, the name exists but holds None.
if 'best_model' not in locals() or best_model is None:
    print("Error: 'best_model' not found. Please run the hyperparameter tuning cell first.")
else:
    print("Evaluating the best hyperparameter tuned model on the test set...")

    # Binary problems threshold the flattened output at 0.5; otherwise the
    # predicted label is the argmax over the output columns.
    y_proba_best_eval = best_model.predict(X_test_scaled)
    if n_classes == 2:
        y_proba_best_eval = y_proba_best_eval.ravel()
        y_pred_best_eval = (y_proba_best_eval >= 0.5).astype(int)
    else:
        y_pred_best_eval = np.argmax(y_proba_best_eval, axis=1)

    print("Test accuracy (Best Tuned Model):", accuracy_score(y_test, y_pred_best_eval))
    print("\nClassification Report (Best Tuned Model):\n", classification_report(y_test, y_pred_best_eval))

    # Confusion matrix for the best model
    cm_best_eval = confusion_matrix(y_test, y_pred_best_eval)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm_best_eval, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix (Best Tuned Model)')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

In [None]:
# Compare the base model, reduced model, and the best hyperparameter tuned model.
# All three are scored on the same test split (X_test_scaled / y_test):
#   variety 1 -> model_reduced, variety 2 -> model, variety 3 -> best_model


def _predict_for_comparison(net):
    """Return ``(probabilities, predicted_labels)`` for *net* on ``X_test_scaled``.

    When ``n_classes == 2`` the model output is flattened and thresholded at
    0.5; otherwise the label is the argmax over the output columns.
    """
    proba = net.predict(X_test_scaled)
    if n_classes == 2:
        proba = proba.ravel()
        return proba, (proba >= 0.5).astype(int)
    return proba, np.argmax(proba, axis=1)


print("\n--- variety 1 Performance ---")
# Evaluate the reduced model
if 'model_reduced' not in locals():
    print("Error: 'model_reduced' not found. Please run the reduced model training cell first.")
else:
    y_proba_reduced_compare_zh, y_pred_reduced_compare_zh = _predict_for_comparison(model_reduced)

    acc_reduced_zh = accuracy_score(y_test, y_pred_reduced_compare_zh)
    report_reduced_zh = classification_report(y_test, y_pred_reduced_compare_zh)

    print(f"Test Accuracy (variety 1): {acc_reduced_zh:.4f}")
    print("\nClassification Report (variety 1):\n", report_reduced_zh)


print("--- variety 2 ---")
# Re-evaluate the base model (trained with callbacks in Cell 6) so the
# comparison is based on the same test set split.
if 'model' not in locals():
    print("Error: 'model' not found. Please run the base model training cell (Cell 6) first.")
else:
    y_proba_base_compare_zh, y_pred_base_compare_zh = _predict_for_comparison(model)

    acc_base_zh = accuracy_score(y_test, y_pred_base_compare_zh)
    report_base_zh = classification_report(y_test, y_pred_base_compare_zh)

    print(f"Test Accuracy (variety 2 - with Callbacks): {acc_base_zh:.4f}")
    print("\nClassification Report (variety 2 - with Callbacks):\n", report_base_zh)


print("\n--- variety 3 ---")
# Re-evaluate the best tuned model so the comparison is based on the same
# test set split.
if 'best_model' not in locals():
    print("Error: 'best_model' not found. Please run the hyperparameter tuning cell first.")
else:
    y_proba_best_compare_zh, y_pred_best_compare_zh = _predict_for_comparison(best_model)

    acc_best_zh = accuracy_score(y_test, y_pred_best_compare_zh)
    report_best_zh = classification_report(y_test, y_pred_best_compare_zh)

    print(f"Test Accuracy (variety 3): {acc_best_zh:.4f}")
    print("\nClassification Report (variety 3):\n", report_best_zh)

# Quick comparison summary
print("\n--- Summary Comparison ---")
# Only report the models that were actually evaluated.
if 'acc_reduced_zh' in locals():
    print(f"variety 1: {acc_reduced_zh:.4f}")
if 'acc_base_zh' in locals():
    print(f"variety 2: {acc_base_zh:.4f}")
if 'acc_best_zh' in locals():
    print(f"variety 3: {acc_best_zh:.4f}")

# Collect the available accuracies. Insertion order (base, reduced, best)
# decides ties, because max() returns the first maximal key it encounters.
accuracies = {}
if 'acc_base_zh' in locals():
    accuracies['Base Model'] = acc_base_zh
if 'acc_reduced_zh' in locals():
    accuracies['Reduced Model'] = acc_reduced_zh
if 'acc_best_zh' in locals():
    accuracies['Best Tuned Model'] = acc_best_zh

if accuracies:
    best_model_name = max(accuracies, key=accuracies.get)
    print(f"\nBased on test accuracy, the '{best_model_name}' performed best.")
else:
    print("\nNo models were evaluated for comparison.")