<a href="https://colab.research.google.com/github/KathanrDave/ML-2024-Project-6-Decision_Makers/blob/master/Codes/ML_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title Imputation Code

import pandas as pd
from sklearn import linear_model
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

from google.colab import drive
# Attach Google Drive so the raw dataset (and the output folder) are reachable.
drive.mount('/content/drive')

# Read the raw CSV straight from Drive.
RSI_verticalJumpData_Season3 = '/content/drive/My Drive/ML/DataSet/dataset.csv'
df = pd.read_csv(RSI_verticalJumpData_Season3)

# Split the frame: numeric columns go through the imputer, the remaining
# columns are carried over untouched.
numeric_df = df.select_dtypes(include=['number'])
non_numeric_df = df.drop(columns=numeric_df.columns)

# Iterative imputer driven by a small random forest (10 trees), capped at
# 10 iterations; both random states are fixed for reproducibility.
forest = RandomForestRegressor(n_estimators=10, random_state=0)
imp = IterativeImputer(estimator=forest, max_iter=10, random_state=0)
imp.fit(numeric_df)

# transform() returns a bare array, so the column labels are re-attached here.
imputed_numeric_df = pd.DataFrame(imp.transform(numeric_df), columns=numeric_df.columns)

# Stitch both halves back together and restore the original column order.
original_columns = df.columns
imputed_df = pd.concat([imputed_numeric_df, non_numeric_df], axis=1)[original_columns]

# Persist the imputed table for the scaling step.
imputed_df.to_csv('/content/drive/My Drive/ML/ImputedDataset/ImputedDataset.csv', index=False)


In [None]:
# @title Feature Scaling Code
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from google.colab import drive

# Attach Google Drive (input and output both live there).
drive.mount('/content/drive')

# Where the imputed table is read from and where the scaled table is written.
input_csv_path = '/content/drive/My Drive/ML/ImputedDataset/ImputedDataset.csv'
output_csv_path = '/content/drive/My Drive/ML/NormalizedDataset/NormalizedData.csv'

data = pd.read_csv(input_csv_path)

# The first column is kept aside unscaled (assumed to hold the athlete
# names -- TODO confirm); every remaining column is treated as a feature.
athlete_names = data.iloc[:, 0]
X = data.iloc[:, 1:]

# Min-max scale every feature into the [0, 1] interval.
scaler = MinMaxScaler(feature_range=(0,1))
scaler.fit(X)
X_normalized = scaler.transform(X)

# The scaler hands back a plain array; wrap it with the original feature labels.
X_normalized_df = pd.DataFrame(X_normalized, columns=X.columns)

# Put the identifier column back in front and write the result to Drive.
normalized_data = pd.concat([athlete_names, X_normalized_df], axis=1)
normalized_data.to_csv(output_csv_path, index=False)


In [None]:
# Attach Google Drive so the normalized dataset can be read.
from google.colab import drive
drive.mount('/content/drive')

# Read the normalized table and discard the identifier column so that only
# feature columns remain.
import pandas as pd
data = pd.read_csv('/content/drive/My Drive/ML/NormalizedDataset/NormalizedData.csv')
data = data.drop('Athlete', axis=1)

# Placeholder target: each row simply gets its own position (0 .. n-1).
# NOTE(review): the importances below therefore describe how well a feature
# predicts row order, not a real outcome -- confirm this is intended.
data['target'] = range(len(data))

# 80/20 train/test split with a fixed seed.
from sklearn.model_selection import train_test_split
y = data['target']
X = data.drop(columns='target')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a 100-tree gradient-boosted regressor on the training part.
from xgboost import XGBRegressor
model = XGBRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Importance per feature, plus the ordering that sorts them ascending
# (so the most important feature ends up at the top of the bar chart).
importance = model.feature_importances_
sorted_idx = importance.argsort()

# Horizontal bar chart of the sorted importances.
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
plt.barh(X.columns[sorted_idx], importance[sorted_idx])
plt.title("Feature Importance")
plt.xlabel("XGBoost Feature Importance")
plt.show()

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

# Function to get highly correlated features
def get_feature_correlation(df, top_n=None, corr_method='spearman',
                            remove_duplicates=True, remove_self_correlations=True):
    """Rank feature pairs of ``df`` by absolute correlation, strongest first.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are correlated with each other via ``df.corr``.
    top_n : int, optional
        When truthy, only the first ``top_n`` rows of the ranking are returned.
    corr_method : str
        Passed through to ``DataFrame.corr(method=...)``.
    remove_duplicates : bool
        Keep each unordered pair once instead of both (a, b) and (b, a).
        The retained row has the earlier column as 'Feature 1'.
    remove_self_correlations : bool
        Drop the diagonal entries (a feature paired with itself).

    Returns
    -------
    pandas.DataFrame
        Columns 'Feature 1', 'Feature 2', 'Correlation (abs)', sorted by
        descending absolute correlation, with a fresh 0..k-1 index.
    """
    corr_abs = df.corr(method=corr_method).abs()
    labels = corr_abs.columns.to_numpy()
    n_features = len(labels)

    # Enumerate every cell of the matrix by position (column-major, which is
    # the order unstack() would produce). Working with positions instead of
    # relying on sort order makes the filters below exact.
    col_pos, row_pos = np.divmod(np.arange(n_features * n_features), n_features)

    keep = np.ones(n_features * n_features, dtype=bool)
    if remove_self_correlations:
        keep &= col_pos != row_pos
    if remove_duplicates:
        # One triangle of the symmetric matrix holds each unordered pair
        # exactly once. The previous "every second row" slice assumed
        # mirrored pairs were adjacent after an unstable sort and always
        # discarded the last pair.
        keep &= col_pos <= row_pos

    col_kept = col_pos[keep]
    row_kept = row_pos[keep]
    sorted_correlated_features = pd.DataFrame({
        'Feature 1': labels[col_kept],
        'Feature 2': labels[row_kept],
        'Correlation (abs)': corr_abs.to_numpy()[row_kept, col_kept],
    })

    # Stable sort so ties keep the column order of ``df``.
    sorted_correlated_features = sorted_correlated_features \
        .sort_values('Correlation (abs)', ascending=False, kind='mergesort') \
        .reset_index(drop=True)

    if top_n:
        return sorted_correlated_features.iloc[:top_n]

    return sorted_correlated_features

# 'data' is the DataFrame prepared earlier in this cell. It also carries the
# synthetic 'target' column (the row position), which is not a feature: leaving
# it in would add a meaningless row/column to the heatmap and could push
# spurious "feature vs. target" pairs into the ranking below, so it is dropped.
feature_frame = data.drop(columns='target', errors='ignore')

# Set up the matplotlib figure with a larger size
fig, ax = plt.subplots(figsize=(22, 22))

# Compute the correlation matrix (DataFrame.corr default method).
# NOTE(review): the pair ranking further down uses get_feature_correlation's
# default method ('spearman'), so the two views are not computed with the same
# method -- confirm whether that is intended.
corr = feature_frame.corr()

# Generate a mask for the upper triangle (the matrix is symmetric, so only
# the lower triangle is drawn)
mask = np.triu(np.ones_like(corr, dtype=bool))

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap="coolwarm", annot=True, fmt=".2f", ax=ax, cbar_kws={"shrink": .5})

# Set the labels and rotate them for better readability
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right',
)
ax.set_yticklabels(
    ax.get_yticklabels(),
    rotation=45,
    horizontalalignment='right',
)

# Show the plot
plt.show()

# Display the list of highly correlated features
top_n = 20 # Adjust this to display more or fewer top correlated pairs
highly_correlated_features = get_feature_correlation(feature_frame, top_n=top_n)
print("Top", top_n, "highly correlated feature pairs:")
print(highly_correlated_features)


In [None]:
import pandas as pd

# Attach Google Drive (source and destination files both live there).
from google.colab import drive
drive.mount('/content/drive')

# Read the normalized dataset.
file_path = '/content/drive/My Drive/ML/NormalizedDataset/NormalizedData.csv'
data = pd.read_csv(file_path)

# Identifier column plus the hand-picked features that survive the reduction.
columns_to_keep = ['Athlete', 'RSI.Mean', 'Respiratory.Rate', 'HRV', 'Sleep.Efficiency....', 'Sleep.Consistency', 'Sleep.Disturbances', 'Recovery']
selected_data = data.loc[:, columns_to_keep]

# Write the reduced table next to the other datasets on Drive.
output_file_path = '/content/drive/My Drive/ML/ReducedDataset/SelectedData.csv'
selected_data.to_csv(output_file_path, index=False)

print(f"Data saved to {output_file_path}")

# Report the retained feature names; 'Athlete' is an identifier, not a feature.
features_selected = [column for column in selected_data.columns if column != 'Athlete']
print("Features selected:", features_selected)


In [None]:
!pip install pandas
!pip install lime




In [None]:
import pandas as pd
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import numpy as np

# Read the reduced feature table.
# NOTE(review): the selection cell writes SelectedData.csv to Drive
# (ML/ReducedDataset/), whereas this reads from /content/ -- confirm the file
# is copied or uploaded there before running.
data = pd.read_csv('/content/SelectedData.csv')

# Columns used for clustering (assumed to be already scaled to [0, 1]).
features = ['RSI.Mean', 'Respiratory.Rate', 'HRV', 'Sleep.Efficiency....', 'Sleep.Consistency', 'Sleep.Disturbances', 'Recovery']

# Candidate numbers of mixture components to evaluate.
candidate_counts = range(2, 8)

# One silhouette score per candidate, in the same order as candidate_counts.
silhouette_scores = []
for n_clusters in candidate_counts:
    # Fit a Gaussian mixture on the raw feature space and label every row.
    gmm = GaussianMixture(n_components=n_clusters, random_state=42)
    gmm.fit(data[features])
    data['cluster'] = gmm.predict(data[features])

    # Score how well separated the resulting labels are.
    silhouette_avg = silhouette_score(data[features], data['cluster'])
    silhouette_scores.append(silhouette_avg)
    print(f"Silhouette Score for {n_clusters} clusters: {silhouette_avg}")

# Silhouette score as a function of the number of clusters.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(candidate_counts, silhouette_scores, marker='o')
ax.set_title('Silhouette Scores for Different Numbers of Clusters')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Silhouette Score')
ax.grid(True)
plt.show()

# Refit with the best-scoring component count (first one wins on ties) so that
# data['cluster'] holds the final assignment.
best_n_clusters = candidate_counts[silhouette_scores.index(max(silhouette_scores))]
gmm = GaussianMixture(n_components=best_n_clusters, random_state=42)
gmm.fit(data[features])
data['cluster'] = gmm.predict(data[features])

# No 2-D plot here: the clustering lives in the full feature space and no
# dimensionality reduction is applied in this cell.

# List every athlete with the cluster they were assigned to.
for athlete, cluster in zip(data['Athlete'], data['cluster']):
    print(f"Athlete: {athlete}, Cluster: {cluster}")


In [None]:
import pandas as pd
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import silhouette_score

# Read the reduced feature table.
data = pd.read_csv('/content/SelectedData.csv')

# Columns fed into PCA (assumed to be already scaled to [0, 1]).
features = ['RSI.Mean', 'Respiratory.Rate', 'HRV', 'Sleep.Efficiency....', 'Sleep.Consistency', 'Sleep.Disturbances', 'Recovery']

# Project the features onto their first two principal components.
pca = PCA(n_components=2)
data_pca = pca.fit_transform(data[features])

# Candidate numbers of mixture components to evaluate.
candidate_counts = range(2, 8)

# One silhouette score per candidate, in the same order as candidate_counts.
silhouette_scores = []
for n_clusters in candidate_counts:
    # Cluster in the 2-D PCA space and label every row.
    gmm = GaussianMixture(n_components=n_clusters, random_state=42)
    gmm.fit(data_pca)
    data['cluster'] = gmm.predict(data_pca)

    # Score the separation of the labels in that same 2-D space.
    silhouette_avg = silhouette_score(data_pca, data['cluster'])
    silhouette_scores.append(silhouette_avg)
    print(f"Silhouette Score for {n_clusters} clusters: {silhouette_avg}")

# Silhouette score as a function of the number of clusters.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(candidate_counts, silhouette_scores, marker='o')
ax.set_title('Silhouette Scores for Different Numbers of Clusters')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Silhouette Score')
ax.grid(True)
plt.show()

# Refit with the best-scoring component count (first one wins on ties).
best_n_clusters = candidate_counts[silhouette_scores.index(max(silhouette_scores))]
gmm = GaussianMixture(n_components=best_n_clusters, random_state=42)
gmm.fit(data_pca)
data['cluster'] = gmm.predict(data_pca)

# Scatter of the two components, coloured by final cluster.
fig, ax = plt.subplots()
points = ax.scatter(data_pca[:, 0], data_pca[:, 1], c=data['cluster'], cmap='viridis')
ax.set_xlabel('First Principal Component')
ax.set_ylabel('Second Principal Component')
ax.set_title(f'Gaussian Mixture Clustering of Athlete Data with {best_n_clusters} clusters')
fig.colorbar(points, ax=ax, label='Cluster')
plt.show()

# List every athlete with the cluster they were assigned to.
for athlete, cluster in zip(data['Athlete'], data['cluster']):
    print(f"Athlete: {athlete}, Cluster: {cluster}")


In [None]:
import pandas as pd
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import numpy as np
import lime
import lime.lime_tabular
from sklearn.metrics import silhouette_score



# Load the reduced feature table.
data = pd.read_csv('/content/SelectedData.csv')

# Features fed into PCA (assumed to be already scaled to [0, 1]).
features = ['RSI.Mean', 'Respiratory.Rate', 'HRV', 'Sleep.Efficiency....', 'Sleep.Consistency', 'Sleep.Disturbances', 'Recovery']

# Apply PCA for dimensionality reduction
pca = PCA(n_components=2)
data_pca = pca.fit_transform(data[features])

# Everything downstream (GMM, surrogate model, LIME) works on the principal
# components, not on the original columns. The explainer must therefore be
# given one name per component; passing the original feature list would label
# PC1/PC2 with the names of the first two raw features.
pca_feature_names = [f'PC{i + 1}' for i in range(data_pca.shape[1])]

# Apply Gaussian Mixture Clustering on the reduced data
gmm = GaussianMixture(n_components=3, random_state=42)
gmm.fit(data_pca)

# Predict the clusters
data['cluster'] = gmm.predict(data_pca)

# Class labels shown by LIME, derived from the number of mixture components.
class_names = [f'Cluster {k}' for k in range(gmm.n_components)]

# Silhouette score of this clustering, kept in a list like in the sweep cells.
silhouette_scores = []
silhouette_avg = silhouette_score(data_pca, data['cluster'])
silhouette_scores.append(silhouette_avg)

# Set print options for better formatting
np.set_printoptions(precision=3, suppress=True)

# The surrogate classifier learns to reproduce the GMM assignments from the
# PCA coordinates; LIME then explains the surrogate.
surrogate_model = RandomForestClassifier(random_state=42)
surrogate_model.fit(data_pca, data['cluster'])

# Use LIME to explain the surrogate model. random_state pins the perturbation
# sampling so the explanation is reproducible between runs.
explainer = lime.lime_tabular.LimeTabularExplainer(
    data_pca,
    feature_names=pca_feature_names,
    class_names=class_names,
    verbose=True,
    mode='classification',
    random_state=42,
)

# Row (in data / data_pca) of the athlete to explain.
instance_index = 9

# Access the athlete's name from the original dataset using the index
athlete_name = data.loc[instance_index, 'Athlete']

# Print the athlete's name
print(f"Athlete: {athlete_name}")

# Explain the surrogate's prediction for that athlete over all components.
# NOTE(review): no labels/top_labels argument is passed, so LIME's default
# label selection applies -- confirm it is the cluster you want explained.
exp = explainer.explain_instance(
    data_pca[instance_index],
    surrogate_model.predict_proba,
    num_features=len(pca_feature_names),
)
explanations = exp.as_list()

# Print each explanation with its index
for i, explanation in enumerate(explanations, start=1):
    print(f"Explanation {i}: {explanation}")


# Visualize the explanation
exp.show_in_notebook(show_table=True, show_all=False)


In [None]:
import pandas as pd
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import RandomForestClassifier
import lime
import lime.lime_tabular
from sklearn.metrics import silhouette_score

# Load the reduced feature table.
data = pd.read_csv('/content/SelectedData.csv')

# Features used for clustering (assumed to be already scaled to [0, 1]).
features = ['RSI.Mean', 'HRV', 'Recovery','Sleep.Consistency','Sleep.Efficiency....','Respiratory.Rate','Sleep.Disturbances']

# Plain float matrix of the features. LIME feeds bare arrays to the surrogate,
# so the surrogate is fitted on the same representation (fitting on a
# DataFrame makes scikit-learn warn about missing feature names on predict).
feature_matrix = data[features].to_numpy(dtype=float)

silhouette_scores = []
# Apply Gaussian Mixture Clustering on the original data
gmm = GaussianMixture(n_components=3, random_state=42)
gmm.fit(data[features])

# Predict the clusters
data['cluster'] = gmm.predict(data[features])
# Per-cluster view of the data, handy for inspecting cluster membership.
grouped_data = data.groupby('cluster')

# Compute the silhouette score
silhouette_avg = silhouette_score(data[features], data['cluster'])
silhouette_scores.append(silhouette_avg)

# Train the surrogate model on the original features and the GMM cluster assignments
surrogate_model = RandomForestClassifier(random_state=42)
surrogate_model.fit(feature_matrix, data['cluster'])

# Prepare the data for LIME
# Assuming there are no categorical features, but if there are, specify them here
categorical_features = [] # Example: ['feature_name']

# Create the LIME explainer. random_state pins the perturbation sampling so
# the explanation is reproducible between runs.
explainer = lime.lime_tabular.LimeTabularExplainer(
    feature_matrix,
    feature_names=features,
    class_names=['Cluster 0', 'Cluster 1', 'Cluster 2'],
    verbose=True,
    mode='classification',
    categorical_features=categorical_features, # Specify categorical features if any
    random_state=42,
)

# Row position of the athlete to explain.
instance_index = 21
athlete_name = data['Athlete'].iloc[instance_index]

# Print the athlete's name
print(f"Athlete: {athlete_name}")

# A row of the feature matrix is already the 1-D vector LIME expects, so no
# reshape/squeeze round-trip (and no numpy import in this cell) is needed.
instance_1d = feature_matrix[instance_index]
print(instance_1d)

# Explain the surrogate's prediction for that athlete over all features.
exp = explainer.explain_instance(instance_1d, surrogate_model.predict_proba, num_features=len(features))
explanations = exp.as_list()

# Print each explanation with its index
for i, explanation in enumerate(explanations, start=1):
    print(f"Explanation {i}: {explanation}")

# Visualize the explanation
exp.show_in_notebook(show_table=True, show_all=False)


In [None]:
import pandas as pd
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import RandomForestClassifier
import lime
import lime.lime_tabular
from sklearn.metrics import silhouette_score

# Load the reduced feature table.
data = pd.read_csv('/content/SelectedData.csv')

# Features used for clustering.
features = ['RSI.Mean', 'HRV', 'Recovery','Sleep.Consistency','Sleep.Efficiency....','Respiratory.Rate','Sleep.Disturbances']

# Plain float matrix of the features. LIME feeds bare arrays to the surrogate,
# so the surrogate is fitted on the same representation (fitting on a
# DataFrame makes scikit-learn warn about missing feature names on every
# explanation).
feature_matrix = data[features].to_numpy(dtype=float)

silhouette_scores = []
# Apply Gaussian Mixture Clustering on the original data
gmm = GaussianMixture(n_components=3, random_state=42)
gmm.fit(data[features])

# Predict the clusters
data['cluster'] = gmm.predict(data[features])

# Per-cluster view of the data, handy for inspecting cluster membership.
grouped_data = data.groupby('cluster')

# Compute the silhouette score
silhouette_avg = silhouette_score(data[features], data['cluster'])
silhouette_scores.append(silhouette_avg)

# Train the surrogate model on the original features and the GMM cluster assignments
surrogate_model = RandomForestClassifier(random_state=42)
surrogate_model.fit(feature_matrix, data['cluster'])

# Prepare the data for LIME
# Assuming there are no categorical features, but if there are, specify them here
categorical_features = [] # Example: ['feature_name']

# Create the LIME explainer. random_state pins the perturbation sampling so
# the explanations are reproducible between runs.
explainer = lime.lime_tabular.LimeTabularExplainer(
    feature_matrix,
    feature_names=features,
    class_names=['Cluster 0', 'Cluster 1', 'Cluster 2'],
    verbose=True,
    mode='classification',
    categorical_features=categorical_features, # Specify categorical features if any
    random_state=42,
)

# Explain every athlete in turn. Iterating over the float matrix (instead of
# DataFrame.iterrows, whose mixed-type rows come back as object dtype) hands
# LIME a proper 1-D float vector and needs no numpy reshape/squeeze.
for athlete_name, instance_1d in zip(data['Athlete'], feature_matrix):
    # Print the athlete's name
    print(f"Athlete: {athlete_name}")

    # Explain the surrogate's prediction over all features
    exp = explainer.explain_instance(instance_1d, surrogate_model.predict_proba, num_features=len(features))

    # Visualize the explanation
    exp.show_in_notebook(show_table=True, show_all=False)

In [None]:
# Mount Google Drive at /content/drive (the same mount call the earlier cells make).
from google.colab import drive
drive.mount('/content/drive')