# Quant Modeling

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import ParameterGrid
from kneed import KneeLocator
from sklearn.decomposition import PCA
import math
from sklearn.ensemble import RandomForestClassifier
import pickle

In [None]:
# Read the combined table and split it into one DataFrame per value of the
# 'key' column; the 'key' column itself is removed from every piece.
quant_map_loaded = {
    group_key: group_rows.drop(columns=['key'])
    for group_key, group_rows in pd.read_csv('quant_map.csv').groupby('key')
}

# From here on `quant_map` is the per-key dictionary, not the raw table.
quant_map = quant_map_loaded

In [None]:
# Lookup table that is joined on 'player_id' further down; the plotting code
# reads its 'player_name' and 'pos_abbr' columns.
player_id_mapping = pd.read_csv('player_id_mapping.csv')

In [None]:
def get_elbow(pos_mapping):
    """Choose a cluster count per position with the elbow method and plot it.

    For every position a KMeans model is fitted for each candidate k, the
    inertia curve is drawn on its own subplot, and the knee reported by
    KneeLocator is taken as the cluster count.

    Args:
        pos_mapping: Mapping of position name -> DataFrame that contains a
            'player_id' column plus the feature columns to cluster on.

    Returns:
        dict: position name -> {'DataFrame': df, 'Optimal_k': k}, where `df`
        is the input frame (unchanged) and `k` is the detected knee, or 3
        when KneeLocator finds none.

    Raises:
        KeyError: If a DataFrame has no 'player_id' column.
    """
    def elbow(df, name, ax):
        """Draw the inertia curve for `df` on `ax` and return the chosen k."""
        # player_id is an identifier, not a feature, so it stays out of the fit.
        features = df.drop(columns=['player_id'])

        # Candidate k runs from 2 to 14, always below the number of rows.
        cluster_range = range(2, min(len(features), 15))

        inertias = []
        for k in cluster_range:
            kmeans = KMeans(n_clusters=k, random_state=42)
            kmeans.fit(features)
            inertias.append(kmeans.inertia_)

        knee_locator = KneeLocator(cluster_range, inertias, curve="convex", direction="decreasing")
        optimal_k = knee_locator.knee

        # KneeLocator reports None when it finds no knee; fall back to 3.
        if optimal_k is None:
            optimal_k = 3

        ax.plot(cluster_range, inertias, marker='o')
        ax.axvline(x=optimal_k, color="r", linestyle="--", label=f"Optimal k={optimal_k}")
        ax.set_title(f'Elbow Method {name}')
        ax.set_xlabel('Number of Clusters')
        ax.set_ylabel('Inertia')
        ax.legend()

        return optimal_k

    quantitative = {}

    cols = 3
    num_positions = len(pos_mapping)
    # Ceiling division gives exactly enough rows; at least one row is kept so
    # that plt.subplots still receives a valid grid for an empty mapping.
    rows = max(1, math.ceil(num_positions / cols))
    _, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows))
    axes = axes.flatten()

    for ax, (pos, df) in zip(axes, pos_mapping.items()):
        quantitative[pos] = {'DataFrame': df, 'Optimal_k': elbow(df, pos, ax)}

    # Hide the grid cells that did not receive a plot.
    for spare_ax in axes[num_positions:]:
        spare_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

    return quantitative

# Run the elbow analysis on every per-key DataFrame held in quant_map.
quantitative = get_elbow(quant_map)


In [None]:
def select_important_features(lib, top_n=3, random_seed=42):
    """Trim each position's DataFrame down to its `top_n` top-ranked features.

    Features are ranked by the importances of a RandomForestClassifier that
    is fitted against randomly drawn 0/1 labels.

    NOTE(review): the labels are random, so the ranking presumably reflects
    noise rather than real relevance -- confirm this is intended.

    Args:
        lib: Mapping of position -> {'DataFrame': df, 'Optimal_k': k}; each
            df must contain a 'player_id' column.
        top_n: Number of features to keep per position.
        random_seed: Seed for NumPy's global generator (which is reseeded
            here) and for the random forest.

    Returns:
        dict: position -> {'DataFrame': reduced_df, 'Optimal_k': k}, where
        reduced_df holds the selected feature columns plus 'player_id'.
    """
    # Seeds the process-wide NumPy generator once, before any labels are drawn.
    np.random.seed(random_seed)
    reduced = {}

    for pos, entry in lib.items():
        frame = entry['DataFrame']
        ids = frame['player_id']
        features = frame.drop(columns=['player_id'])

        # Random forest to find important features
        dummy_target = np.random.randint(0, 2, size=len(features))
        forest = RandomForestClassifier(n_estimators=100, random_state=random_seed)
        forest.fit(features, dummy_target)

        ranking = pd.DataFrame(
            {'Feature': features.columns, 'Importance': forest.feature_importances_}
        ).sort_values(by='Importance', ascending=False)
        selected_features = ranking['Feature'].iloc[:top_n].tolist()

        reduced[pos] = {
            'DataFrame': frame[selected_features].assign(player_id=ids),
            'Optimal_k': entry['Optimal_k'],
        }

        print(f'Position: {pos} - Selected Features: {selected_features}')

    return reduced

# Keep only the three top-ranked feature columns (plus player_id) per position.
filtered_quantitative = select_important_features(quantitative, top_n=3)

In [None]:
def clustering(lib):
    """Cluster each position's players with KMeans and attach the labels.

    For every position a small grid of KMeans settings (both `init`
    strategies, three `max_iter` values, fixed `random_state`) is tried at
    the stored cluster count, and the labels of the configuration with the
    highest silhouette score are kept.

    Args:
        lib: Mapping of position -> {'DataFrame': df, 'Optimal_k': k}; each
            df must contain a 'player_id' column. The DataFrames are
            modified in place: a 'cluster' column is added.

    Returns:
        dict: position -> DataFrame with the added 'cluster' column, whose
        values are the cluster labels converted to strings.
    """
    def evaluate_kmeans(params, data):
        """Fit KMeans with `params`; return (silhouette score, labels)."""
        labels = KMeans(**params).fit_predict(data)
        return silhouette_score(data, labels), labels

    for pos, value in lib.items():
        df = value['DataFrame']
        k = value['Optimal_k']

        # player_id is an identifier, not a feature.
        features = df.drop(columns=['player_id'])

        param_grid = ParameterGrid({
            'n_clusters': [k],
            'init': ['k-means++', 'random'],
            'max_iter': [50, 100, 300],
            'random_state': [42],
        })

        # Start below every possible silhouette value so the first candidate
        # is always accepted, even when its score is exactly -1.
        best_score = float('-inf')
        best_labels = None

        # Grid search for hyperparameter tuning. The labels of the best run
        # are kept so the winning model does not have to be fitted again
        # (random_state is fixed, so a refit would give the same labels).
        for params in param_grid:
            score, labels = evaluate_kmeans(params, features)
            if score > best_score:
                best_score = score
                best_labels = labels

        # df is the very object stored in lib[pos], so this also updates lib.
        df["cluster"] = best_labels.astype(str)

    return {pos: info['DataFrame'] for pos, info in lib.items()}

# Cluster every position; the result maps position -> DataFrame with a 'cluster' column.
quant_cluster= clustering(filtered_quantitative)

In [None]:
def _cluster_table_columns(df, max_rows_per_column=20):
    """Lay out the cluster membership list as equally long text columns.

    Each cluster contributes a "Cluster <n>" heading, one
    "<player_name> (<pos_abbr>)" line per row and a blank separator line.
    The resulting list is cut into chunks of `max_rows_per_column`, and the
    chunks are padded with empty strings to a common length.
    """
    entries = []
    for cluster, group in df.groupby('cluster', observed=True):
        entries.append(f"Cluster {cluster}")
        entries.extend(
            f"{player_name} ({pos_abbr})"
            for player_name, pos_abbr in zip(group['player_name'], group['pos_abbr'])
        )
        entries.append("")

    columns = [
        entries[start:start + max_rows_per_column]
        for start in range(0, len(entries), max_rows_per_column)
    ]
    tallest = max(len(column) for column in columns)
    return [column + [''] * (tallest - len(column)) for column in columns]


def visualize_clusters_with_table(quantitative, player_id_mapping):
    """Plot each position's clusters in 2-D PCA space with a membership table.

    Args:
        quantitative: Mapping of position -> DataFrame holding the feature
            columns plus 'player_id' and 'cluster' (values convertible to
            int).
        player_id_mapping: DataFrame joined on 'player_id'; its
            'player_name' and 'pos_abbr' columns are used for the table.
            NOTE(review): any further columns it carries are passed to PCA,
            because only 'cluster', 'player_name', 'pos_abbr' and
            'player_id' are excluded -- confirm the mapping has no others.

    Returns:
        None. One figure per position is shown via plt.show().
    """
    for pos, df in quantitative.items():
        # drop=True discards the old row index; without it reset_index adds
        # an 'index' column that would be fed to PCA as if it were a feature.
        df = df.reset_index(drop=True)
        df = df.merge(player_id_mapping, on='player_id', how='left')

        # Ordered integer categories keep the legend and the table sorted
        # numerically.
        df['cluster'] = df['cluster'].astype(int)
        cluster_order = sorted(df['cluster'].unique())
        df['cluster'] = pd.Categorical(df['cluster'], categories=cluster_order, ordered=True)

        pca = PCA(n_components=2)
        pca_features = pca.fit_transform(
            df.drop(columns=['cluster', 'player_name', 'pos_abbr', 'player_id'], errors='ignore')
        )

        # Add PCA features back to the DataFrame
        df['PCA1'] = pca_features[:, 0]
        df['PCA2'] = pca_features[:, 1]

        plt.figure(figsize=(10, 6))
        sns.scatterplot(
            data=df,
            x='PCA1',
            y='PCA2',
            hue='cluster',
            palette='viridis',
            s=100,
            alpha=0.7,
            hue_order=cluster_order
        )

        plt.title(f'Cluster Visualization of {pos} (PCA)', fontsize=16)
        plt.xlabel('PCA Component 1')
        plt.ylabel('PCA Component 2')
        plt.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')

        table_data = _cluster_table_columns(df, max_rows_per_column=20)

        # Extra axes placed below the scatter plot (negative y offset) to
        # hold the membership table.
        table_ax = plt.gcf().add_axes([0.1, -0.4, 0.8, 0.3])
        table_ax.axis('off')
        table = table_ax.table(
            # table_data is a list of columns; the table wants rows.
            cellText=list(zip(*table_data)),
            cellLoc='left',
            loc='center',
        )
        table.auto_set_font_size(False)
        table.set_fontsize(10)
        table.auto_set_column_width(col=list(range(len(table_data))))

        plt.subplots_adjust(bottom=0.12)
        plt.show()



In [None]:
# Show one PCA scatter plot with a membership table per position.
visualize_clusters_with_table(quant_cluster, player_id_mapping)

In [None]:
# Attach the player_id_mapping columns to every position's frame.
# Only existing keys are reassigned, so the dict can be updated while iterating.
# NOTE: re-running this cell merges the mapping columns a second time.
for pos, df in quant_cluster.items():
    quant_cluster[pos] = df.merge(player_id_mapping, how='left', on='player_id')

In [None]:
# Print the position -> DataFrame dictionary for a quick visual check.
print(quant_cluster)

In [None]:
# Persist the position -> DataFrame dictionary as a pickle.
with open("quant_assignments.pkl", "wb") as f:
    f.write(pickle.dumps(quant_cluster))

# Also write one flat CSV in which a 'position' column records the
# dictionary key each row came from.
quant_cluster_df = pd.concat(
    [frame.assign(position=name) for name, frame in quant_cluster.items()]
)
quant_cluster_df.to_csv("quant_assignments_list.csv", index=False)