In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [5]:
# Label column shared by both CSV files; everything else is treated as a feature.
target_column = "Physical Activity Status"

# Read the pre-split train and test tables from disk.
data_train = pd.read_csv("../data/universal_train_data.csv")
data_test = pd.read_csv("../data/universal_test_data.csv")

# Pull the label out of each table ...
y_train, y_test = data_train[target_column], data_test[target_column]

# ... and keep the remaining columns as the feature matrices. drop() returns a
# new frame, so data_train / data_test still contain the label afterwards.
X_train = data_train.drop(columns=[target_column])
X_test = data_test.drop(columns=[target_column])

In [6]:
# One-hot encode categorical feature columns (e.g., Gender, Smoking Status).
# The training split alone defines the encoded schema: which dummy columns
# exist and which level of each categorical is dropped as the baseline.
X_train = pd.get_dummies(X_train, drop_first=True)

# Encode the test split WITHOUT drop_first, then project it onto the training
# columns. Using drop_first=True here would pick the baseline from the levels
# present in the test split only; if that differs from the training baseline,
# the wrong dummy column is removed and the reindex below would zero-fill it,
# silently relabelling those rows as the training baseline category.
# The reindex:
#   - discards the training baseline dummy and any level unseen in training,
#   - adds (as 0) any training dummy whose level is absent from the test split,
#   - puts the columns in the exact order the model is fitted on.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

In [7]:
# Build a 100-tree random forest with a fixed random_state and fit it on the
# training split; fit() returns the estimator, so both steps chain together.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predicted class label for every row of the test feature matrix.
y_pred = rf_model.predict(X_test)

In [8]:
# Overall fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy:.4f}")

# Per-class precision/recall/F1 first, then the raw confusion counts; each
# section is printed as a heading line followed by the metric's text output.
for heading, metric in (
    ("Classification Report:", classification_report),
    ("Confusion Matrix:", confusion_matrix),
):
    print(heading)
    print(metric(y_test, y_pred))

Random Forest Accuracy: 0.8294
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.95      0.90      3977
           1       0.65      0.39      0.49      1051

    accuracy                           0.83      5028
   macro avg       0.75      0.67      0.69      5028
weighted avg       0.81      0.83      0.81      5028

Confusion Matrix:
[[3761  216]
 [ 642  409]]


In [9]:
# Importance scores exposed by the fitted forest, one per training column.
feature_importances = rf_model.feature_importances_

# Pair each score with its column name and rank from most to least important.
# The original positional index is kept, so it shows each feature's column position.
feature_importances_df = (
    pd.DataFrame({"Feature": X_train.columns})
    .assign(Importance=feature_importances)
    .sort_values(by="Importance", ascending=False)
)

print(feature_importances_df)

                     Feature  Importance
1                        Age    0.197160
4                        BMI    0.175963
3       Income Poverty Ratio    0.095053
12     Unhealthy Food Intake    0.064573
11       Healthy Food Intake    0.063801
10            Protein Intake    0.059561
2             Household Size    0.057499
9          Restaurant Visits    0.057493
14          Milk Consumption    0.054057
13      Beverage Consumption    0.047649
0                     Gender    0.046848
8      General Health Status    0.039172
7             Smoking Status    0.018542
5              Diet Question    0.014520
6   Diet Question Annotation    0.008109
