In [3]:
!pip install xgboost


Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.0-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.8/150.0 MB 1.5 MB/s eta 0:01:38
   ---------------------------------------- 1.0/150.0 MB 1.5 MB/s eta 0:01:38
   ---------------------------------------- 1.0/150.0 MB 1.5 MB/s eta 0:01:38
   ---------------------------------------- 1.3/150.0 MB 1.2 MB/s eta 0:02:05
   ---------------------------------------- 1.6/150.0 MB 1.2 MB/s eta 0:02:06
   ---------------------------------------- 1.6/150.0 MB 1.2 MB/s eta 0:02:06
   ---------------------------------------- 1.6/150.0 MB 1.2 MB/s eta 0:02:06
   -------------------

In [9]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import os

# Load the dataset
df = pd.read_csv("combined_pose_data.csv")

# Prepare features and labels
X = df.drop(columns=['Label'])
y = df['Label']

# Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)
print("Label mapping:", dict(zip(le.classes_, le.transform(le.classes_))))

# Check class distribution before split
print("Class distribution in full dataset:", pd.Series(y_encoded).value_counts())

# Split the data with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

# Check class distribution after split
print("Class distribution in training set:", pd.Series(y_train).value_counts())
print("Class distribution in test set:", pd.Series(y_test).value_counts())

# Train XGBoostClassifier with optimized parameters and early stopping
xgb_model = XGBClassifier(
    learning_rate=0.1,
    max_depth=6,
    n_estimators=200,
    random_state=42,
    early_stopping_rounds=10
)
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=False
)
xgb_pred = xgb_model.predict(X_test)
xgb_accuracy = accuracy_score(y_test, xgb_pred)
xgb_report = classification_report(y_test, xgb_pred, target_names=le.classes_, zero_division=0, output_dict=False)
print("XGBoost Accuracy:", xgb_accuracy)
print("XGBoost Report:\n", xgb_report)

# Train RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)
rf_report = classification_report(y_test, rf_pred, target_names=le.classes_, zero_division=0, output_dict=False)
print("RandomForest Accuracy:", rf_accuracy)
print("RandomForest Report:\n", rf_report)

# Save the better model and label encoder
best_model = xgb_model if xgb_accuracy > rf_accuracy else rf_model
joblib.dump(best_model, "exercise_classifier_best.pkl")
joblib.dump(le, "label_encoder.pkl")
print("Model saved as exercise_classifier_best.pkl and label encoder as label_encoder.pkl")

# Create result folder if it doesn't exist
result_dir = "result"
if not os.path.exists(result_dir):
    os.makedirs(result_dir)

# Save accuracy scores to a file
with open(os.path.join(result_dir, "accuracy_scores.txt"), "w") as f:
    f.write(f"XGBoost Accuracy: {xgb_accuracy:.4f}\n")
    f.write(f"RandomForest Accuracy: {rf_accuracy:.4f}\n")

# Save classification reports to files
with open(os.path.join(result_dir, "xgboost_report.txt"), "w") as f:
    f.write("XGBoost Classification Report:\n")
    f.write(xgb_report)
with open(os.path.join(result_dir, "randomforest_report.txt"), "w") as f:
    f.write("RandomForest Classification Report:\n")
    f.write(rf_report)

# Plot accuracy metrics
plt.figure(figsize=(8, 6))
models = ['XGBoost', 'RandomForest']
accuracies = [xgb_accuracy, rf_accuracy]
plt.bar(models, accuracies, color=['blue', 'green'])
plt.title('Model Accuracy Comparison')
plt.ylabel('Accuracy')
plt.ylim(0, 1)  # Set y-axis limit from 0 to 1
for i, v in enumerate(accuracies):
    plt.text(i, v + 0.02, f'{v:.4f}', ha='center')
plt.savefig(os.path.join(result_dir, "accuracy_comparison.png"))
plt.close()
print(f"Graph saved as {os.path.join(result_dir, 'accuracy_comparison.png')}")

Label mapping: {'pull Up': 0, 'push-up': 1}
Class distribution in full dataset: 1    2826
0    1300
Name: count, dtype: int64
Class distribution in training set: 1    2260
0    1040
Name: count, dtype: int64
Class distribution in test set: 1    566
0    260
Name: count, dtype: int64
XGBoost Accuracy: 0.9757869249394673
XGBoost Report:
               precision    recall  f1-score   support

     pull Up       0.98      0.95      0.96       260
     push-up       0.98      0.99      0.98       566

    accuracy                           0.98       826
   macro avg       0.98      0.97      0.97       826
weighted avg       0.98      0.98      0.98       826

RandomForest Accuracy: 0.9757869249394673
RandomForest Report:
               precision    recall  f1-score   support

     pull Up       0.98      0.95      0.96       260
     push-up       0.98      0.99      0.98       566

    accuracy                           0.98       826
   macro avg       0.98      0.97      0.97       826