# SMOTE plus Random Forest on Perceived Stress

### 1. Loading the Data

In [2]:
import pandas as pd

# Load the processed survey responses; the path is relative to the
# notebook's working directory.
stress_df = pd.read_csv(
    'TanRL, Suba, Mendoza, Valencia - Processed_Responses.csv'
)

### 2. Feature Selection

In [3]:
from sklearn.feature_selection import RFE, SelectKBest, chi2
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTENC, SMOTE
from collections import Counter

# Initial prep: separate the target column from the predictor columns.
y = stress_df['Stress_Label']
X = stress_df.drop(columns=['Stress_Label'])

# Replace missing values with the mean of their column.
# NOTE(review): the means are taken over every row, including rows that are
# later held out for testing -- confirm this is acceptable.
X = X.fillna(value=X.mean())

# 1: Label encoding -- turn the raw labels into integer class ids.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# 2: Train/Test Split (25% of the rows held out).
# NOTE(review): this split uses random_state=25 and its four outputs appear
# unused by the cells shown in this notebook; the evaluation cells re-split
# with random_state=42 -- confirm whether this split is still needed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    random_state=25,
    test_size=0.25,
)

In [4]:
# 3: Selection
# Every data-driven selector below is fitted on TRAINING rows only. Fitting
# RFE / chi-square on the full dataset would let the held-out rows influence
# which features are kept, leaking test information into the evaluation.
# The split parameters must stay identical to the ones used when the feature
# sets are split for evaluation (test_size=0.25, random_state=42) so that the
# same rows are treated as training rows in both places.
selection_split = train_test_split(
    X, y_encoded, test_size=0.25, random_state=42
)
X_select = selection_split[0]  # training rows of the feature table
y_select = selection_split[2]  # matching training labels

log_reg = LogisticRegression(solver='lbfgs', max_iter=500)

# 3.1 RFE (Logistic Regression): keep the 10 features that survive recursive
# elimination, ranked on the training rows.
rfe_logreg = RFE(estimator=log_reg, n_features_to_select=10)
rfe_logreg.fit(X_select, y_select)
rfe_features = X.columns[rfe_logreg.support_]
X_rfe_logreg = X[rfe_features]

# 3.2 RFE (Random Forest): same procedure with a Random Forest as the ranking
# estimator.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rfe_rf = RFE(estimator=rf_model, n_features_to_select=10)
rfe_rf.fit(X_select, y_select)
rf_features = X.columns[rfe_rf.support_]
X_rfe_rf = X[rf_features]

# 3.3 Big Five: a fixed, hand-picked list of columns (no fitting involved).
big_five = ['Commute time (one way): ___ minutes',
            'Been upset because of something that happened unexpectedly?',
            'Felt that you were unable to control the important things in your life?',
            'Felt nervous and "stressed"?',
            'About how often did you feel tired out for no good reason?']
X_big5 = X[big_five]

# 3.4 Chi-Square: keep the 10 highest-scoring features, scored on the
# training rows.
chi2_selector = SelectKBest(chi2, k=10)
chi2_selector.fit(X_select, y_select)
chi2_features = X.columns[chi2_selector.get_support()]
X_chi2 = X[chi2_features]

In [5]:
# 4. Split Feature Sets
def split_features(df):
    """Split one feature table into train and test parts.

    The labels come from the module-level ``y_encoded``. The test share is
    fixed at 25% and the shuffle seed at ``random_state=42``.

    Parameters
    ----------
    df : feature table to split.

    Returns
    -------
    The result of ``train_test_split``, in the order train features,
    test features, train labels, test labels.
    """
    parts = train_test_split(
        df,
        y_encoded,
        random_state=42,
        test_size=0.25,
    )
    return parts

# One train/test partition per candidate feature set.
(X_train_all, X_test_all,
 y_train_all, y_test_all) = split_features(X)

(X_train_rfe, X_test_rfe,
 y_train_rfe, y_test_rfe) = split_features(X_rfe_logreg)

(X_train_rf, X_test_rf,
 y_train_rf, y_test_rf) = split_features(X_rfe_rf)

(X_train_big5, X_test_big5,
 y_train_big5, y_test_big5) = split_features(X_big5)

(X_train_chi2, X_test_chi2,
 y_train_chi2, y_test_chi2) = split_features(X_chi2)

In [6]:
# Columns that SMOTE-NC treats as categorical whenever they are present in a
# training set.
categorical_features = [
    'Working Student:',
    'Studying Home',
    'Studying Dorm',
    'Studying Cafe',
    'Studying Other',
    'Do you consider yourself more of an introvert or an extrovert?',
    'How do you usually cope with stress?',
]
# Positions of those columns within the full feature table X.
cat_idx = list(map(X.columns.get_loc, categorical_features))

def apply_smote(X_train, y_train):
    """Oversample one training set with SMOTE-NC or plain SMOTE.

    SMOTENC is used when ``X_train`` contains at least one column listed in
    the module-level ``categorical_features``; otherwise SMOTE is used.
    ``k_neighbors`` is capped at (smallest class size - 1), with an upper
    limit of 5, so the neighbour search never asks for more neighbours than
    the smallest class can supply.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; column names are matched against
        ``categorical_features``.
    y_train : array-like
        Training labels, one per row of ``X_train``.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` as returned by ``fit_resample``.

    Raises
    ------
    ValueError
        If ``y_train`` is empty or any class has fewer than 2 samples. SMOTE
        interpolates between a sample and a neighbour of the same class, so a
        class with a single sample cannot be oversampled; failing here gives
        a clear message instead of an error from inside the neighbour search.
    """
    # Validate class sizes first so a bad input fails fast and clearly.
    class_counts = Counter(y_train)
    if not class_counts:
        raise ValueError(
            "apply_smote needs at least 2 samples per class, but y_train is empty."
        )
    rare_label, min_class = min(class_counts.items(), key=lambda item: item[1])
    if min_class < 2:
        raise ValueError(
            "apply_smote needs at least 2 samples per class, but class "
            f"{rare_label!r} has only {min_class}."
        )
    k_neighbors = min(5, min_class - 1)

    # Positions of the categorical columns that exist in this feature subset.
    subset_cat_idx = [
        i for i, col in enumerate(X_train.columns) if col in categorical_features
    ]

    # SMOTENC needs the categorical positions; plain SMOTE is used when the
    # subset has no categorical columns.
    if subset_cat_idx:
        smote = SMOTENC(
            categorical_features=subset_cat_idx,
            random_state=42,
            k_neighbors=k_neighbors,
        )
    else:
        smote = SMOTE(random_state=42, k_neighbors=k_neighbors)

    return smote.fit_resample(X_train, y_train)


### 3. Training and Evaluation using Random Forest

In [7]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Shared Random Forest estimator; train_evaluate_rf() refits it on each call.
rf_clf = RandomForestClassifier(random_state=42, n_estimators=100)


def train_evaluate_rf(X_train, X_test, y_train, y_test, label):
    """Fit the shared ``rf_clf`` and print its metrics on the test data.

    Parameters
    ----------
    X_train, y_train : training features and labels used to fit ``rf_clf``.
    X_test, y_test : features and labels the fitted model is scored on.
    label : text shown in the printed section header.

    Prints the accuracy, the confusion matrix, and the classification
    report. Returns nothing.
    """
    rf_clf.fit(X_train, y_train)
    predictions = rf_clf.predict(X_test)

    print(f"\n--- {label} ---")

    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy:", accuracy)

    matrix = confusion_matrix(y_test, predictions)
    print("Confusion Matrix:\n", matrix)

    report = classification_report(y_test, predictions)
    print("Classification Report:\n", report)

# Apply SMOTE and Evaluate
# Each entry: (display label, X train, X test, y train, y test).
datasets = [
    ("All Features", X_train_all, X_test_all, y_train_all, y_test_all),
    ("RFE (LogReg)", X_train_rfe, X_test_rfe, y_train_rfe, y_test_rfe),
    ("RFE (Random Forest)", X_train_rf, X_test_rf, y_train_rf, y_test_rf),
    ("Big Five", X_train_big5, X_test_big5, y_train_big5, y_test_big5),
    ("Chi-Square", X_train_chi2, X_test_chi2, y_train_chi2, y_test_chi2),
]

# Only the training part is oversampled; the test part is scored as-is.
for label, X_tr, X_te, y_tr, y_te in datasets:
    X_res, y_res = apply_smote(X_tr, y_tr)
    train_evaluate_rf(
        X_train=X_res,
        X_test=X_te,
        y_train=y_res,
        y_test=y_te,
        label=label,
    )



--- All Features ---
Accuracy: 0.9411764705882353
Confusion Matrix:
 [[15  0]
 [ 1  1]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97        15
           1       1.00      0.50      0.67         2

    accuracy                           0.94        17
   macro avg       0.97      0.75      0.82        17
weighted avg       0.94      0.94      0.93        17


--- RFE (LogReg) ---
Accuracy: 0.9411764705882353
Confusion Matrix:
 [[15  0]
 [ 1  1]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97        15
           1       1.00      0.50      0.67         2

    accuracy                           0.94        17
   macro avg       0.97      0.75      0.82        17
weighted avg       0.94      0.94      0.93        17


--- RFE (Random Forest) ---
Accuracy: 0.9411764705882353
Confusion Matrix:
 [[15  0]
 [ 1  1]]
Classification Report:
 

## Final Model Training & Export Code

In [9]:
import joblib

#### Define Final Features

In [10]:
# The RFE (Random Forest) selection serves as the final feature set.
final_features = X_rfe_rf.columns
X_final = X.loc[:, final_features]

# Split the data with the shared helper, which applies the same
# test_size=0.25 / random_state=42 split against y_encoded.
X_train_final, X_test_final, y_train_final, y_test_final = split_features(X_final)

#### Apply SMOTE or SMOTENC

In [12]:
# Columns that SMOTE-NC must treat as categorical when they are part of the
# final feature set.
categorical_features = [
    'Working Student:',
    'Studying Home', 'Studying Dorm', 'Studying Cafe', 'Studying Other',
    'Do you consider yourself more of an introvert or an extrovert?',
    'How do you usually cope with stress?'
]

# Positions, within X_final, of the categorical columns that were selected.
cat_idx_final = [X_final.columns.get_loc(col) for col in categorical_features if col in final_features]

# k_neighbors must be smaller than the smallest class. SMOTE interpolates
# between a sample and a same-class neighbour, so a class with a single
# sample cannot be oversampled at all; fail here with a clear message rather
# than inside the neighbour search.
class_counts = Counter(y_train_final)
min_class = min(class_counts.values())
if min_class < 2:
    raise ValueError(
        "SMOTE needs at least 2 samples per class in the training split, "
        f"but the smallest class has only {min_class}."
    )
k_neighbors = min(5, min_class - 1)

# SMOTENC when categorical columns are present, plain SMOTE otherwise.
if len(cat_idx_final) > 0:
    smote_final = SMOTENC(categorical_features=cat_idx_final, random_state=42, k_neighbors=k_neighbors)
else:
    smote_final = SMOTE(random_state=42, k_neighbors=k_neighbors)

# Only the training split is resampled; the test split is left untouched.
X_train_resampled, y_train_resampled = smote_final.fit_resample(X_train_final, y_train_final)

#### Train Final Random Forest

In [13]:
# Fit the final forest on the oversampled training data.
rf_final = RandomForestClassifier(random_state=42, n_estimators=100)
rf_final.fit(X=X_train_resampled, y=y_train_resampled)

#### Evaluate Final Model

In [14]:
# Score the final model on the test split, which was not resampled.
y_pred_final = rf_final.predict(X_test_final)

print(
    "Final Model Accuracy:",
    accuracy_score(y_true=y_test_final, y_pred=y_pred_final),
)
print(
    "Confusion Matrix:\n",
    confusion_matrix(y_true=y_test_final, y_pred=y_pred_final),
)
print(
    "Classification Report:\n",
    classification_report(y_true=y_test_final, y_pred=y_pred_final),
)

Final Model Accuracy: 0.9411764705882353
Confusion Matrix:
 [[15  0]
 [ 1  1]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97        15
           1       1.00      0.50      0.67         2

    accuracy                           0.94        17
   macro avg       0.97      0.75      0.82        17
weighted avg       0.94      0.94      0.93        17



#### Save Model & Encoder

In [15]:
# Persist the fitted model, the label encoder, and the selected feature
# columns to the current working directory.
joblib.dump(value=rf_final, filename="rf_model.pkl")
joblib.dump(value=le, filename="label_encoder.pkl")
joblib.dump(value=final_features, filename="final_features.pkl")

print("Model, encoder, and features saved as rf_model.pkl, label_encoder.pkl, and final_features.pkl")

Model, encoder, and features saved as rf_model.pkl, label_encoder.pkl, and final_features.pkl
