## Modelling

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import phik
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder, Normalizer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [None]:
# Baseline comparison: cross-validate each candidate regressor, with default
# hyperparameters, behind the shared preprocessing step.
# This cell relies on `preprocessor`, `X_train`, `y_train` and `state`, which
# are defined in the "Pipelines" section cells; those cells must be run first.
regressors = [
    KNeighborsRegressor(),  # takes no random_state
    # Seed the stochastic estimators so the comparison (and the later refit of
    # `pipeline`) gives the same numbers on every run.
    DecisionTreeRegressor(random_state=state),
    RandomForestRegressor(random_state=state),
    GradientBoostingRegressor(random_state=state)
]

models = []
scores = []

for regressor in regressors:
    steps = [
        ('preprocess', preprocessor),
        ('reg', regressor)
    ]
    # `pipeline` stays bound to the last candidate after the loop; the
    # hyperparameter-search cells pass it as their estimator.
    pipeline = Pipeline(steps)
    scorer = cross_val_score(
        pipeline,
        X_train,
        y_train,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1
    )
    # Use the bare class name as the bar label; the full repr would also
    # include the constructor arguments.
    models.append(type(regressor).__name__)
    scores.append(scorer.mean())

plt.figure(figsize=(10,5))
plt.barh(models, scores)
plt.xlabel('Mean 5-fold CV score (negative MSE, closer to 0 is better)')
plt.title('Baseline regressor comparison')
plt.show();

In [None]:
# Search space for the `reg` step of `pipeline`.
loss = ['quantile', 'squared_error', 'absolute_error', 'huber']
max_features = ['sqrt', 'log2', None]
# Each numeric range is 15 evenly spaced points truncated to integers.
n_estimators = np.linspace(50, 300, 15).astype(int).tolist()
max_depth = np.linspace(1, 15, 15).astype(int).tolist()
min_samples_split = np.linspace(2, 50, 15).astype(int).tolist()
min_samples_leaf = np.linspace(2, 50, 15).astype(int).tolist()

# The `reg__` prefix routes each parameter to the pipeline step named 'reg'.
hyperparameter_grid = dict(
    reg__loss=loss,
    reg__max_features=max_features,
    reg__n_estimators=n_estimators,
    reg__max_depth=max_depth,
    reg__min_samples_split=min_samples_split,
    reg__min_samples_leaf=min_samples_leaf,
)

# Sample 200 parameter combinations, scoring each with 3-fold CV.
random_cv = RandomizedSearchCV(
    pipeline,
    hyperparameter_grid,
    n_iter=200,
    scoring='neg_mean_squared_error',
    cv=3,
    random_state=state,
    return_train_score=True,
    verbose=1,
    n_jobs=-1,
)
random_cv.fit(X_train, y_train)

print('', 'Best params:', random_cv.best_params_, '', sep='\n')
print('Best score:', random_cv.best_score_)

In [None]:
# Tabulate the randomized-search trials, best-ranked first.
rs_df = (
    pd.DataFrame(random_cv.cv_results_)
    .sort_values('rank_test_score')
    .reset_index(drop=True)
)
# Give rows with a missing max_features value (presumably the `None` option)
# an explicit text label so they can be plotted as a category.
max_features_missing = rs_df['param_reg__max_features'].isna()
rs_df.loc[max_features_missing, 'param_reg__max_features'] = 'None'

cols = [
    'param_reg__loss', 
    'param_reg__max_features',
    'param_reg__n_estimators',
    'param_reg__max_depth',
    'param_reg__min_samples_split',
    'param_reg__min_samples_leaf'
]
pref = 'param_reg__'

# One panel per searched hyperparameter, laid out on a 3x2 grid.
fig, axs = plt.subplots(nrows=3, ncols=2)
fig.set_size_inches(30, 25)
sns.set(font_scale=2)
color = 'lightblue'

# Walk the grid row by row, pairing each axis with one hyperparameter column.
for ax, col in zip(axs.flat, cols):
    sns.barplot(data=rs_df, x=col, y='mean_test_score', color=color, ax=ax)
    ax.set_title(label=col.replace(pref, ''), size=30, weight='bold')
    ax.set_xlabel('')

In [None]:
# Narrowed search space for an exhaustive grid search over the `reg` step.
loss = ['absolute_error']
max_features = ['log2', None]
n_estimators = range(45, 55)
max_depth = range(1, 6)
min_samples_split = range(2, 9, 2)
min_samples_leaf = range(2, 9, 2)

# The `reg__` prefix routes each parameter to the pipeline step named 'reg'.
hyperparameter_grid = dict(
    reg__loss=loss,
    reg__max_features=max_features,
    reg__n_estimators=n_estimators,
    reg__max_depth=max_depth,
    reg__min_samples_split=min_samples_split,
    reg__min_samples_leaf=min_samples_leaf,
)

# Evaluate every combination with 3-fold CV on negative MSE.
grid_cv = GridSearchCV(
    pipeline,
    hyperparameter_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_cv.fit(X_train, y_train)
best_params = grid_cv.best_params_

print('', 'Best params:', best_params, '', sep='\n')
print('Best score:', grid_cv.best_score_)

In [None]:
import pandas as pd

# Place df1 and all columns of df2 side by side (column-wise concatenation).
df3 = pd.concat((df1, df2), axis='columns')

# Optional NaN fill, currently disabled:
#df1.fillna({'Weather Conditions': 'Unknown', 'Calories Group': 'Unknown', 'Age group': 'Unknown', 'Weight Category': 'Unknown'}, inplace=True)
df3

In [None]:
# Preview df3 without these four columns. The result is only displayed, not
# assigned, so df3 itself still contains them.
# NOTE(review): confirm whether the drop was meant to be kept.
df3.drop(columns=['Age', 'Calories Burn', 'BMI', 'Duration'])


In [None]:
# Show what `check_missing_values` (a helper defined elsewhere) reports for df3.
df3_missing = check_missing_values(df3)
display(df3_missing)

In [None]:
# One-hot encode the categorical columns, dropping the first level of each.
# `columns` must be a list rather than a set: pandas 2.x rejects set indexers,
# and a set's iteration order is not stable between runs, which would make the
# order of the generated dummy columns vary.
df4 = pd.get_dummies(
    df3,
    columns=[
        'Gender',
        'Weather Conditions',
        'Exercise',
        'Weight Category',
        'minute duration',
        'Age group',
        'Calories group',
    ],
    drop_first=True
)

In [None]:
# Show what `check_missing_values` (a helper defined elsewhere) reports for the encoded frame.
df4_missing = check_missing_values(df4)
display(df4_missing)

In [None]:
# Print the column names, dtypes and non-null counts of the encoded frame.
df4.info()

## Train and Test

In [None]:
#Importing necessary libraries
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
#import xgboost as xgb
from sklearn.metrics import accuracy_score 
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import auc
from sklearn.metrics import  roc_curve
from sklearn.metrics import  roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectFromModel 
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Separate the target column from the remaining feature columns.
y = df4['Exercise Intensity']
x = df4.drop(columns=['Exercise Intensity'])

In [None]:
# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=11,
    test_size=0.2,
)

In [None]:
# Sanity-check the split: shapes are printed in the order
# X_train, X_test, y_train, y_test.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Wrap the scaled training array in a DataFrame with the original feature
# names, then preview its first rows.
scaled_data_train = pd.DataFrame(data=X_train_scaled, columns=X_train.columns)
scaled_data_train.head()

## Modeling
1. Linear Regression
2. Random Forest
3. XGBoost
4. 

## Random Forest

In [None]:
# Fit a baseline random forest on the scaled training data.
# random_state is fixed (same seed as the train/test split above) so that the
# feature importances used for feature selection below are reproducible.
rfc = RandomForestClassifier(n_estimators=100, random_state=11)
rfc.fit(X_train_scaled, y_train)

In [None]:
# Feature names, in training-column order, for labelling the importance plot.
labels = X_train.columns.tolist()

In [None]:
# Horizontal bar chart of the importance the fitted forest assigns to every feature.
n_features = X_train_scaled.shape[1]
plt.figure(figsize=(20,50))
plt.barh(range(n_features), rfc.feature_importances_, align='center') 
plt.yticks(np.arange(n_features),labels=labels)
# Title typo fixed ('Imporance' -> 'Importance').
plt.title('Feature Importance', fontsize=30, pad=5)
plt.xlabel('Feature importance', fontsize=20, labelpad=5)
plt.ylabel('Features', fontsize=20)
plt.tight_layout()

In [None]:
# Keep only the features whose importance is above the mean importance.
importances = rfc.feature_importances_
above_mean = importances > importances.mean()
selected_features = X_train.columns[above_mean]
print(selected_features)

In [None]:
# Importance scores of the above-mean features, plus their names as a plain list.
scores = rfc.feature_importances_
selected_features_scores = scores[scores > scores.mean()]
labels_selected = selected_features.tolist()

In [None]:
# Bar chart of the selected features only, ordered by ascending importance.
n_features = len(selected_features)

# argsort gives the ordering that sorts the scores ascending; apply the same
# ordering to the names so bars and labels stay paired.
sorted_indices = np.argsort(selected_features_scores)
sorted_features = np.array(labels_selected)[sorted_indices]
sorted_scores = selected_features_scores[sorted_indices]

fig, ax = plt.subplots(figsize=(20, 50))
bar_positions = np.arange(n_features)
ax.barh(bar_positions, sorted_scores, align='center')
ax.set_yticks(bar_positions)
ax.set_yticklabels(sorted_features)
ax.set_title('Feature Importances', fontsize=30, pad=15)
ax.set_xlabel('Feature Importance', fontsize=20, labelpad=5)
ax.set_ylabel('Features', fontsize=20)
fig.tight_layout()
plt.show()

In [None]:
def _plot_roc_panel(model, features, y_true, ax, split_name):
    """Draw the ROC curve of `model` for one data split on `ax`.

    Column 1 of `predict_proba` is used as the positive-class score, which is
    only meaningful when the model has exactly two classes (`roc_curve` itself
    is restricted to binary targets). For any other number of classes the
    panel shows a short note instead of raising.
    """
    title = f'Receiver Operating Characteristic {split_name}'
    # Score the split once and reuse it for both the curve and the AUC.
    proba = np.asarray(model.predict_proba(features))

    if proba.ndim != 2 or proba.shape[1] != 2:
        ax.text(0.5, 0.5, 'ROC curve is only drawn\nfor binary targets',
                ha='center', va='center', transform=ax.transAxes)
        ax.set(title=title, xticks=[], yticks=[])
        return

    positive_scores = proba[:, 1]
    fpr, tpr, _ = roc_curve(y_true, positive_scores)
    roc_auc = roc_auc_score(y_true, positive_scores)
    ax.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
    ax.plot([0, 1], [0, 1], ls=':')  # chance diagonal
    ax.set(xlabel='False Positive Rate', ylabel='True Positive Rate', title=title)
    ax.legend(loc='lower right')


def evaluate_classification(model, X_train_transformed, X_test_transformed, y_train, y_test, classes=None, normalize='true', cmap='Blues_r', label='', save_dir='plots'):
    """Report and plot how a fitted classifier performs on train and test data.

    Prints a classification report for each split, then draws one figure with
    four panels (train confusion matrix, train ROC, test confusion matrix,
    test ROC), saves it and shows it.

    Parameters
    ----------
    model : fitted estimator exposing `predict` and `predict_proba`.
    X_train_transformed, X_test_transformed : feature matrices passed to the model.
    y_train, y_test : true labels for the two splits.
    classes : optional display names forwarded to `classification_report`
        as `target_names`.
    normalize : 'true' row-normalises the confusion matrices; any other value
        plots raw counts.
    cmap : colormap name for the confusion-matrix heatmaps.
    label : text prepended to the printed report headers.
    save_dir : directory (created if missing) that receives
        'combined_plots.png'; the file name is fixed, so each call overwrites
        the previous figure in the same directory.

    Returns
    -------
    None
    """
    # Imported locally so the function works regardless of which cells ran before it.
    import os

    os.makedirs(save_dir, exist_ok=True)

    splits = [
        ('Training', X_train_transformed, y_train),
        ('Testing', X_test_transformed, y_test),
    ]
    # Let scikit-learn do the row normalisation: it maps rows with no true
    # samples to 0 instead of producing NaN from a division by zero.
    cm_normalize = 'true' if normalize == 'true' else None
    dashes = "---" * 20

    # Text reports first (training, then testing), collecting the matrices for plotting.
    matrices = []
    for split_name, features, y_true in splits:
        y_pred = model.predict(features)
        header = f"{label} CLASSIFICATION REPORT {split_name.upper()} "
        print(dashes, header, dashes, sep='\n')
        print(classification_report(y_true, y_pred, target_names=classes))
        matrices.append(confusion_matrix(y_true, y_pred, normalize=cm_normalize))

    # One row of four panels: [train CM, train ROC, test CM, test ROC].
    fig, axes = plt.subplots(figsize=(12, 4), ncols=4)
    for idx, ((split_name, features, y_true), cm) in enumerate(zip(splits, matrices)):
        cm_ax, roc_ax = axes[2 * idx], axes[2 * idx + 1]
        sns.heatmap(cm, annot=True, fmt='.2f', cmap=cmap, ax=cm_ax)
        cm_ax.set(title=f'Confusion Matrix {split_name}', xlabel='Predicted Labels', ylabel='True Labels')
        _plot_roc_panel(model, features, y_true, roc_ax, split_name)

    # Adjust spacing between subplots
    plt.tight_layout(pad=2.0)

    # Save before show(), while the figure is still the current one.
    plt.savefig(os.path.join(save_dir, 'combined_plots.png'))

    plt.show()

In [None]:
# Fit a depth-limited random forest on the scaled training data.
# random_state is fixed (same seed as the train/test split above) so the
# evaluation that follows is reproducible across runs.
rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=11)
rf.fit(X_train_scaled, y_train)

In [None]:
import os
# Evaluate the depth-limited random forest on both splits with evaluate_classification.
evaluate_classification(
    model=rf,
    X_train_transformed=X_train_scaled,
    X_test_transformed=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
    label='Random Forest',
)

## Pipelines

In [None]:
# Print the column names, dtypes and non-null counts of df1 before building the pipeline.
df1.info()

In [None]:
# Seed shared by this split and the randomized search.
state = 42

# Columns of df1 treated as categorical inputs.
cat_features = ['Exercise', 'Gender', 'Weather Conditions']

# Columns of df1 treated as numeric inputs.
num_features = ['Dream Weight', 'Actual Weight', 'Age', 'Duration', 'Heart Rate', 'BMI']

target = 'Exercise Intensity'

# Hold out 33% of df1 for testing. This rebinds X_train / X_test / y_train /
# y_test, which the "Train and Test" section also assigns.
feature_columns = cat_features + num_features
X_train, X_test, y_train, y_test = train_test_split(
    df1[feature_columns],
    df1[target],
    random_state=state,
    test_size=0.33,
)

In [None]:
# Preprocessing shared by every model pipeline.
# The imputers are a safeguard in case future versions of the data contain NaNs.

# Numeric columns: mean-impute, then apply Normalizer.
# NOTE(review): Normalizer rescales each row (sample) to unit norm; it does not
# scale each feature column. Confirm a per-feature scaler was not intended.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('normalizer', Normalizer()),
])

# Categorical columns: fill gaps with a constant placeholder, then map each
# category to an integer code.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant')),
    ('encoder', OrdinalEncoder()),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('numeric', numeric_transformer, num_features),
    ('categorical', categorical_transformer, cat_features),
])

In [None]:
# Exhaustive grid search over the `reg` step of `pipeline`.
# This cell repeats the grid-search cell from the "Modelling" section.
loss = ['absolute_error']
max_features = ['log2', None]
n_estimators = range(45, 55)
max_depth = range(1, 6)
min_samples_split = range(2, 9, 2)
min_samples_leaf = range(2, 9, 2)

# The `reg__` prefix routes each parameter to the pipeline step named 'reg'.
hyperparameter_grid = dict(
    reg__loss=loss,
    reg__max_features=max_features,
    reg__n_estimators=n_estimators,
    reg__max_depth=max_depth,
    reg__min_samples_split=min_samples_split,
    reg__min_samples_leaf=min_samples_leaf,
)

# Evaluate every combination with 3-fold CV on negative MSE.
grid_cv = GridSearchCV(
    pipeline,
    hyperparameter_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_cv.fit(X_train, y_train)
best_params = grid_cv.best_params_

print('', 'Best params:', best_params, '', sep='\n')
print('Best score:', grid_cv.best_score_)

In [None]:
# Apply the grid-search winner to the pipeline and refit on the training split
# (set_params returns the pipeline itself, so the calls can be chained).
pipeline.set_params(**best_params).fit(X_train, y_train)

# Round the predictions to whole numbers before scoring on the held-out test split.
y_pred = np.round(pipeline.predict(X_test))
print(f'MSE for test subset: {mean_squared_error(y_test, y_pred)}')