In [None]:
from azure.data.tables import TableServiceClient
import os 
import pandas as pd
import numpy as np
import ast

In [173]:
# Set up the connection to Azure Table Storage from its connection string.
# The string is read from the environment so no credential is hardcoded here.
connection_string = os.environ.get("AZURE_TABLE_STORAGE_CONNECTION_STRING")
if not connection_string:
    # Fail fast with an actionable message instead of handing None to the SDK.
    raise RuntimeError(
        "Environment variable AZURE_TABLE_STORAGE_CONNECTION_STRING is not set; "
        "cannot connect to Azure Table Storage."
    )
table_service_client = TableServiceClient.from_connection_string(conn_str=connection_string)

# Client for the table that stores the FYP player data
table_client = table_service_client.get_table_client(table_name="fyptablestorage")

In [174]:
# Query every entity in the 'FYP' partition of the Azure table and load the
# result into a pandas DataFrame.
partition_filter = "PartitionKey eq 'FYP'"
entities = table_client.query_entities(partition_filter)
df = pd.DataFrame(entities)

In [175]:
# Landmark columns to keep: indices 12-29, leaving out 17-22
excluded_indices = {17, 18, 19, 20, 21, 22}
landmark_list = [f"Landmark{i}" for i in range(12, 30) if i not in excluded_indices]

# Narrow the DataFrame to the age, the score and the selected landmark columns
selected_columns = ["Age", "Score"] + landmark_list
df = df[selected_columns]

In [176]:
def str_to_list(landmark_str):
    """Convert the string representation of a list into an actual list.

    Parameters
    ----------
    landmark_str : str or list
        Text holding a Python literal (e.g. "[0.1, 0.2]"). A value that is
        already a list is returned unchanged, so the notebook cell that applies
        this function can be re-run without failing on parsed data.

    Returns
    -------
    The object produced by ``ast.literal_eval`` (or the input list itself).

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when the input is not a valid
        literal.
    """
    if isinstance(landmark_str, list):
        # Already parsed (e.g. the conversion cell was executed twice).
        return landmark_str
    return ast.literal_eval(landmark_str)


def calculate_motion_entropy(motion_amplitudes):
    """Return the motion entropy (G(Y)) of one landmark, using log base 2.

    Each strictly positive amplitude D contributes ``(D / Dz) * log2(D / Dz)``
    where Dz is the sum of all amplitudes; the result is the negated sum of
    those contributions. When the amplitudes sum to zero the entropy is
    undefined and ``np.nan`` is returned.
    """
    total = np.sum(motion_amplitudes)
    if total == 0:
        return np.nan

    terms = []
    for amplitude in motion_amplitudes:
        # Non-positive amplitudes are skipped so log2 is never given them.
        if amplitude > 0:
            share = amplitude / total
            terms.append(share * np.log2(share))
    return -np.sum(terms)


def calculate_overall_entropy_features(row, landmark_columns):
    """Compute the entropy features of one DataFrame row.

    Parameters
    ----------
    row : pandas.Series (or mapping)
        ``row[col]`` must yield an iterable of motion amplitudes for every
        ``col`` in ``landmark_columns``.
    landmark_columns : list of str
        Names of the landmark columns to process.

    Returns
    -------
    pandas.Series
        Indexed by ``"mean_entropy"``, one ``"entropy_<col>"`` per landmark
        column, and ``"aggregated_entropy"``. ``mean_entropy`` averages only
        the landmarks whose entropy is defined; a landmark whose entropy is
        undefined keeps ``NaN`` in its own ``entropy_<col>`` slot. If no
        landmark has a defined entropy, every value is ``NaN``.
    """
    feature_index = (
        ["mean_entropy"]
        + [f"entropy_{col}" for col in landmark_columns]
        + ["aggregated_entropy"]
    )

    # Keep one value per landmark, NaN included, so the values always line up
    # with their "entropy_<col>" labels. Dropping the NaN entries would leave
    # fewer values than labels and make the Series construction fail.
    per_landmark_entropies = [calculate_motion_entropy(row[col]) for col in landmark_columns]
    defined_entropies = [value for value in per_landmark_entropies if not np.isnan(value)]

    if len(defined_entropies) == 0:
        return pd.Series([np.nan] * len(feature_index), index=feature_index)

    mean_entropy = np.mean(defined_entropies)

    # Pool the motion amplitudes of all landmarks and compute the entropy of
    # the pooled data with the same formula used for a single landmark.
    aggregated_motion_amplitudes = []
    for col in landmark_columns:
        aggregated_motion_amplitudes.extend(row[col])
    aggregated_entropy = calculate_motion_entropy(aggregated_motion_amplitudes)

    return pd.Series(
        [mean_entropy] + per_landmark_entropies + [aggregated_entropy],
        index=feature_index,
    )

In [177]:
# Parse every landmark column from its string form into a Python list
for landmark in landmark_list:
    parsed_column = df[landmark].apply(str_to_list)
    df[landmark] = parsed_column

# Compute the entropy features row by row
entropy_features_df = df.apply(
    lambda row: calculate_overall_entropy_features(row, landmark_columns=landmark_list),
    axis=1,
)

In [178]:
# Keep only the target column as a one-column DataFrame
test_df = df[["Score"]]
# Place the score and the entropy features side by side, aligned on the index
final_df = pd.concat(objs=[test_df, entropy_features_df], axis="columns")

In [179]:
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern, RationalQuadratic, ExpSineSquared, DotProduct, ConstantKernel as C
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pickle

In [180]:
# Feature matrix: every column of final_df except the target
X = final_df.drop("Score", axis="columns")
# Target vector: the score column of the source DataFrame
y = df["Score"]

In [181]:
# Base estimator; its kernel and its alpha are selected by the grid search
model = GaussianProcessRegressor()

# Initial length scales tried for every kernel family that has one
initial_length_scales = [1, 10, 100]
# Bounds given to the constant factor and to the RBF / Matern length scales
shared_bounds = (1e-2, 1e2)

rbf_kernels = [
    C(1.0, shared_bounds) * RBF(length_scale, shared_bounds)
    for length_scale in initial_length_scales
]
matern_kernels = [
    C(1.0, shared_bounds) * Matern(length_scale, shared_bounds, nu)
    for length_scale in initial_length_scales
    for nu in [0.5, 1.5, 2.5]
]
# RationalQuadratic and ExpSineSquared are given no explicit bounds here
rational_quadratic_kernels = [
    C(1.0, shared_bounds) * RationalQuadratic(length_scale, alpha)
    for length_scale in initial_length_scales
    for alpha in [0.1, 1.0, 10.0]
]
periodic_kernels = [
    C(1.0, shared_bounds) * ExpSineSquared(length_scale, periodicity)
    for length_scale in initial_length_scales
    for periodicity in [1.0, 3.0]
]
dot_product_kernels = [
    C(1.0, shared_bounds) * DotProduct(sigma_0=sigma)
    for sigma in [0.1, 1.0, 10.0]
]

# Parameter grid: every candidate kernel combined with every alpha value
param_grid = {
    "kernel": (
        rbf_kernels
        + matern_kernels
        + rational_quadratic_kernels
        + periodic_kernels
        + dot_product_kernels
    ),
    "alpha": [1e-5, 1e-3, 1e-1, 1.0, 10.0],
}

In [None]:
# Leave-one-out splitter used as the cross-validation strategy
loo = LeaveOneOut()

# Exhaustive search over param_grid, scored by negated mean squared error
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=loo,
)

# Run the search on the full feature matrix and target
grid_search.fit(X, y)

# Keep the winning estimator and the parameter combination that produced it
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Report the selected parameters
print(f"Best parameters: {best_params}")

In [None]:
# Evaluate the best model using LOOCV.
# Every metric collected in the loop needs its own list here; averaging a
# metric whose key is missing would raise a KeyError.
results = {"mse": [], "mae": [], "rmse": []}

for train_index, test_index in loo.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Refit on the training fold and predict the held-out part
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    # NOTE: with a single held-out sample per fold, the per-fold RMSE equals
    # the absolute error, so its mean matches the mean absolute error.
    rmse = np.sqrt(mse)

    results["mse"].append(mse)
    results["mae"].append(mae)
    results["rmse"].append(rmse)

# Calculate the average performance metrics
avg_mse = np.mean(results["mse"])
avg_mae = np.mean(results["mae"])
avg_rmse = np.mean(results["rmse"])

print(f"GaussianProcessRegressor: Mean Squared Error = {avg_mse:.4f}, Mean Absolute Error = {avg_mae:.4f}")

# Train the best model on the entire dataset
best_model.fit(X, y)

# Save the best model with pickle. The path is assembled with os.path.join so
# the separators are correct on every platform. The file name (including its
# .txt extension) is kept as it was -- presumably a consumer loads the model
# under this exact name; TODO confirm.
model_path = os.path.join(
    os.path.dirname(os.path.abspath('')),
    "ContainerizedModelFlaskAPI",
    "depresson_gpr_model.txt",
)
with open(model_path, 'wb') as file:
    pickle.dump(best_model, file)

# Report the path that was actually written
print(f"Best model exported to {model_path}")