In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix
import seaborn as sns


In [None]:
# Load the gym-member exercise log, then preview the table and its column index.
df = pd.read_csv('gym_members_exercise_tracking.csv')
display(df, df.columns)

In [None]:
# Histogram of the raw Calories_Burned values
calories_burned = df['Calories_Burned']
sns.histplot(calories_burned)



In [None]:
# Encode Gender as an integer flag (Male -> 1, Female -> 0); any other value
# becomes NaN because it is not a key of the mapping.
# The dtype guard keeps this cell idempotent: mapping the already-encoded
# column a second time would turn every 0/1 into NaN.
if not pd.api.types.is_numeric_dtype(df["Gender"]):
    df["Gender"] = df["Gender"].map({"Male": 1, "Female": 0})

In [None]:
# One-hot encode Workout_Type. get_dummies() consumes the original column, so
# the guard lets this cell be re-run without raising KeyError.
if 'Workout_Type' in df.columns:
    display(df['Workout_Type'].unique())

    df = pd.get_dummies(df, columns=['Workout_Type'])

In [None]:
def labelCalories(amount, small_max=500, medium_max=1200):
    """Bucket a calories-burned amount into "Small", "Medium" or "Large".

    Parameters
    ----------
    amount : number
        Calories burned; must be a non-negative, non-NaN number.
    small_max : number, default 500
        Exclusive upper bound of the "Small" bucket.
    medium_max : number, default 1200
        Exclusive upper bound of the "Medium" bucket.

    Returns
    -------
    str
        "Small" for 0 <= amount < small_max, "Medium" for
        small_max <= amount < medium_max, "Large" otherwise.

    Raises
    ------
    ValueError
        If ``amount`` is negative or NaN. Previously such values fell through
        to the final branch and were silently labelled "Large".
    """
    # `not (amount >= 0)` is true for negatives and for NaN, since every
    # comparison with NaN is False.
    if not (amount >= 0):
        raise ValueError(f"amount must be a non-negative number, got {amount!r}")

    if amount < small_max:
        return "Small"
    if amount < medium_max:
        return "Medium"
    return "Large"

In [None]:
# Derive the categorical target: bucket every Calories_Burned value with labelCalories.
df['Calories'] = df['Calories_Burned'].map(labelCalories)

# standardize the data (currently disabled: the scaling code below is commented out)
# scaler = StandardScaler()
# scaler.fit(df.drop('Calories', axis=1))

# scaled_features = scaler.transform(df.drop('Calories', axis=1))
# scaled_features_df = pd.DataFrame(scaled_features, columns=df.columns[:-1])

# scaled_features_df.drop(['Calories_Burned'], axis=1, inplace=True)

# display(scaled_features_df)

In [None]:
# Candidate predictors: every column except the categorical 'Calories' label.
cols = list(df.columns)
cols.remove('Calories')

# Pairwise correlation matrix of those columns, drawn as a heatmap.
corr_matrix = df[cols].corr(min_periods=15)
sns.heatmap(corr_matrix)

# Rank the columns by absolute correlation with Calories_Burned, strongest
# first, and leave out Calories_Burned's own entry.
corr = (
    corr_matrix['Calories_Burned']
    .abs()
    .sort_values(ascending=False)
    .drop('Calories_Burned')
)

display(corr)

In [None]:
# Split the data into the feature matrix X and the target vector y.
# Calories_Burned is excluded from X because the 'Calories' label is derived
# from it. It is dropped from a copy rather than in place, so df keeps the
# column and this cell (and the earlier ones that read it) can be re-run.
X = df.drop(columns=['Calories', 'Calories_Burned'])
y = df['Calories']


In [None]:
# train a decision tree and visualize it

from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import matplotlib.pyplot as plt
import numpy as np

def test_dTree(depth, features_num, tests, print_report=False, random_state=None):
    """Evaluate a depth-limited decision tree over repeated random 80/20 splits.

    Reads the notebook-level ``X`` (features), ``y`` (labels) and ``corr``
    (features ranked by absolute correlation). Only the first ``features_num``
    entries of ``corr`` are used as predictors. Test-set predictions from all
    repetitions are pooled and scored with a single classification report.

    Parameters
    ----------
    depth : int
        ``max_depth`` passed to ``DecisionTreeClassifier``.
    features_num : int
        Number of top-ranked features (leading entries of ``corr``) to use.
    tests : int
        Number of independent train/test repetitions.
    print_report : bool, default False
        If True, also print the text form of the pooled report.
    random_state : int or None, default None
        Base seed. When given, repetition ``k`` uses ``random_state + k`` for
        both the split and the tree, making the result reproducible. With
        None the splits are unseeded, as before.

    Returns
    -------
    dict
        ``classification_report(..., output_dict=True)`` over the pooled
        predictions.
    """
    # The column subset does not depend on the split, so select it once.
    selected_columns = list(corr[:features_num].index)
    X_selected = X[selected_columns]

    total_predictions = []
    total_y_test = []

    for run in range(tests):
        seed = None if random_state is None else random_state + run

        dtree = DecisionTreeClassifier(max_depth=depth, random_state=seed)

        X_train, X_test, y_train, y_test = train_test_split(
            X_selected, y, test_size=0.2, random_state=seed
        )

        dtree.fit(X_train, y_train)

        total_predictions.extend(dtree.predict(X_test))
        total_y_test.extend(y_test)

    report = classification_report(total_y_test, total_predictions, output_dict=True, zero_division=0)

    if print_report:
        # Same zero_division setting as the returned report, so the printed
        # and returned numbers agree and no undefined-metric warning is raised.
        print(classification_report(total_y_test, total_predictions, zero_division=0))

    return report


def plot_tree(dtree, features):
    """Draw a fitted decision tree on a large (25x20 inch) figure.

    Parameters
    ----------
    dtree : fitted DecisionTreeClassifier
        The tree to draw; its ``classes_`` attribute supplies the class labels.
    features : sequence of str
        Names of the columns the tree was fitted on, in fit order.
    """
    plt.figure(figsize=(25, 20))

    tree.plot_tree(
        dtree,
        # Pass a plain list so a pandas Index is accepted as well.
        feature_names=list(features),
        # sklearn matches class_names to classes in ascending order, i.e. the
        # order of dtree.classes_. A hard-coded Small/Medium/Large list would
        # mislabel the alphabetically sorted string classes.
        class_names=[str(label) for label in dtree.classes_],
        filled=True,
    )

    plt.show()



In [None]:
tests = 1  # repetitions per grid cell; lowered from 100 so the search runs faster

max_depth = 17
max_features = len(X.columns)

# f1_scores[i, j] holds the weighted-average F1 for depth i + 1 with the top j + 1 features.
f1_scores = np.zeros((max_depth, max_features))

for i, depth in enumerate(range(1, max_depth + 1)):
    for j, feature_num in enumerate(range(1, max_features + 1)):
        report = test_dTree(depth, feature_num, tests)
        weighted_f1 = report['weighted avg']['f1-score']

        print(f"Depth: {depth}, Features: {feature_num}: {weighted_f1}")

        f1_scores[i, j] = weighted_f1


# Nothing is drawn in this cell; the call is kept as-is.
plt.show()

In [None]:


# Heatmap of the grid-search scores. Rows and columns of f1_scores are 0-based
# while depth and feature count start at 1, so the ticks are labelled
# explicitly to match the axis titles.
# (`fmt` only formats annotations and annot is not enabled, so it is omitted.)
sns.heatmap(
    f1_scores,
    xticklabels=list(range(1, f1_scores.shape[1] + 1)),
    yticklabels=list(range(1, f1_scores.shape[0] + 1)),
)

best_score = np.max(f1_scores)
# Tuple of (row indices, column indices) of every cell equal to the maximum,
# in row-major order; element [..][0] is the first such cell.
best_score_index = np.where(f1_scores == best_score)

# Outline the first best cell; heatmap cells are unit squares with x = column, y = row.
plt.gca().add_patch(plt.Rectangle((best_score_index[1][0], best_score_index[0][0]), 1, 1, fill=False, edgecolor='red', lw=3))


plt.xlabel('Top N Features by correlation')
plt.ylabel('Max Depth')
plt.title('F1 Score tests for Decision Tree on test data')

In [None]:
# Bias the F1 scores against model complexity, then plot again.
# Each cell is reduced by 0.1 * (feature_num / max_features) * (depth / max_depth),
# so the deepest tree with all features loses 0.1 and simpler models lose less.
depth_fraction = np.arange(1, max_depth + 1) / max_depth
feature_fraction = np.arange(1, max_features + 1) / max_features

# np.outer gives the (max_depth, max_features) grid of per-cell penalty products.
f1_scores_bias = f1_scores - .1 * np.outer(depth_fraction, feature_fraction)

# Ticks are labelled 1-based so they read as actual depth / feature count.
sns.heatmap(
    f1_scores_bias,
    xticklabels=list(range(1, max_features + 1)),
    yticklabels=list(range(1, max_depth + 1)),
)

best_score_b = np.max(f1_scores_bias)
# Tuple of (row indices, column indices) of the maximum, in row-major order.
best_score_index_b = np.where(f1_scores_bias == best_score_b)

# Outline the first best cell; heatmap cells are unit squares with x = column, y = row.
plt.gca().add_patch(plt.Rectangle((best_score_index_b[1][0], best_score_index_b[0][0]), 1, 1, fill=False, edgecolor='red', lw=3))


plt.xlabel('Top N Features sorted by correlation')
plt.ylabel('Max Depth')
plt.title('F1 Score tests for Decision Tree on test data')

In [None]:
# Grid positions are 0-based; adding 1 turns them back into depth / feature count.
best_row, best_col = best_score_index[0][0], best_score_index[1][0]
biased_row, biased_col = best_score_index_b[0][0], best_score_index_b[1][0]

print(f"Best F1 Score: {best_score}, Depth: {best_row + 1}, Features: {best_col + 1}")
print(f"Best F1 Score with bias: {best_score_b}, Depth: {biased_row + 1}, Features: {biased_col + 1}")

# Raw (unpenalised) F1 at the position chosen by the biased search.
print(f"unbiased f1 score: {f1_scores[biased_row][biased_col]}")

In [None]:
# Create the final tree with the depth and feature count picked by the biased search.
best_depth = best_score_index_b[0][0] + 1
best_feature_count = best_score_index_b[1][0] + 1

# Report pooled test-set performance for that configuration over 100 random splits.
test_dTree(best_depth, best_feature_count, 100, True)

# Fit on the same top-N correlated columns that were evaluated. Fitting on all
# of X would build (and plot) a different model from the one selected.
X_best = X[corr[:best_feature_count].index]

dtree = DecisionTreeClassifier(max_depth=best_depth)

dtree.fit(X_best, y)

plot_tree(dtree, list(X_best.columns))

