In [12]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Load train and test data
data_train = pd.read_csv("../data/perturbed_data.csv")
data_test = pd.read_csv("../data/universal_test_data.csv")

# Separate features and target variable
X_train = data_train.drop(columns=["Physical Activity Status"])  # Drop target column
y_train = data_train["Physical Activity Status"]                 # Target column

X_test = data_test.drop(columns=["Physical Activity Status"])
y_test = data_test["Physical Activity Status"]

In [14]:
# Encode categorical variables (e.g., Gender, Smoking Status)
X_train = pd.get_dummies(X_train, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)

# Ensure train and test datasets have the same columns after encoding
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [15]:
# Initialize the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_model.predict(X_test)

In [16]:
# Calculate and print accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy:.4f}")

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Display confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Random Forest Accuracy: 0.8562
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.96      0.91      3977
           1       0.75      0.47      0.58      1051

    accuracy                           0.86      5028
   macro avg       0.81      0.71      0.75      5028
weighted avg       0.85      0.86      0.84      5028

Confusion Matrix:
[[3812  165]
 [ 558  493]]


In [17]:
# Get feature importance
feature_importances = rf_model.feature_importances_
feature_importances_df = pd.DataFrame({
    "Feature": X_train.columns,
    "Importance": feature_importances
}).sort_values(by="Importance", ascending=False)

print(feature_importances_df)

                     Feature  Importance
2                        Age    0.166493
5                        BMI    0.158656
0                 Unnamed: 0    0.086935
4       Income Poverty Ratio    0.080739
13     Unhealthy Food Intake    0.056429
3             Household Size    0.054688
12       Healthy Food Intake    0.053781
11            Protein Intake    0.053696
10         Restaurant Visits    0.052300
15          Milk Consumption    0.050133
1                     Gender    0.050131
14      Beverage Consumption    0.046300
9      General Health Status    0.039214
8             Smoking Status    0.019661
6              Diet Question    0.019127
7   Diet Question Annotation    0.011717
