# Qualitative Modeling

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import ParameterGrid
from kneed import KneeLocator
from sklearn.decomposition import PCA
import math
from textblob import TextBlob
from sklearn.ensemble import RandomForestClassifier
from transformers import pipeline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
# Read the flat qualitative table, then split it into one DataFrame per
# distinct "key" value. The "key" column is dropped from each piece because
# its value becomes the dictionary key instead.
qual_map = pd.read_csv("qual_map.csv")

qual_map_loaded = {}
for key, group in qual_map.groupby("key"):
    qual_map_loaded[key] = group.drop(columns=["key"])

# From here on `qual_map` is the per-key dictionary, not the raw table.
qual_map = qual_map_loaded

In [None]:
# Show which keys the CSV was split into.
print(qual_map.keys())

In [None]:
# Trait labels each scouting-report fragment is scored against. The same
# names later become the per-label score columns.
candidate_labels = [
    'Playing Ability',
    'Competitive',
    'Character',
    'Team Player',
    'Leadership',
    'Passion',
    'Body Language',
    'Selfless',
    'Low Ego',
    'Loyalty',
    'Teamwork',
    'Trustworthy',
    'Dependable',
    'Integrity',
    'Honest',
    'Maturity',
    'Responsible',
    'Positive',
    'Confident',
    'Adaptive',
    'Work Ethic',
    'Driven',
    'Resilient',
    'Effort',
]

In [None]:
# Words and phrases used to break a sentence into smaller fragments.
# NOTE: order matters. analyze_scouting_report walks this list from the top
# and splits on the FIRST entry it finds in a sentence, so earlier entries
# take priority. Several entries appear more than once ('or', 'nor', 'and',
# 'so', 'as', 'than', 'just as', 'the more'); with first-match semantics the
# later duplicates never win.
conjunctions = [
    # coordinating
    'for', 'and', 'nor', 'but', 'or', 'yet', 'so',
    # cause / reason
    'because', 'since', 'as', 'seeing that', 'inasmuch as', 'now that', 'considering that',
    # time
    'when', 'whenever', 'while', 'after', 'before', 'until', 'as soon as', 'as long as', 'once',
    # condition
    'if', 'unless', 'provided that', 'assuming that', 'in case', 'even if', 'supposing that',
    # concession / contrast
    'although', 'though', 'even though', 'whereas', 'despite the fact that',
    # purpose
    'so that', 'in order that', 'lest',
    # comparison / manner
    'just as', 'as if', 'as though', 'than',
    # result
    'such that',
    # correlative pairs, listed as individual pieces
    'either', 'or', 'neither', 'nor', 'both', 'and', 'not only', 'but also', 'whether',
    'just as', 'so', 'as much', 'as', 'no sooner', 'than', 'rather', 'the more', 'the more',
    # conjunctive adverbs: addition
    'moreover', 'furthermore', 'besides', 'in addition', 'not to mention',
    # conjunctive adverbs: contrast
    'however', 'on the other hand', 'nevertheless', 'nonetheless', 'conversely',
    # conjunctive adverbs: consequence
    'therefore', 'consequently', 'thus', 'as a result', 'hence',
    # conjunctive adverbs: sequence
    'then', 'thereafter', 'subsequently', 'meanwhile', 'at the same time',
    # conjunctive adverbs: alternative
    'otherwise',
    # conjunctive adverbs: similarity
    'likewise', 'similarly'
]

In [None]:
def apply_scouting_analysis(nfl_data, candidate_labels, conjunctions):
    """Swap each frame's free-text 'scouting' column for per-label scores.

    Every DataFrame in ``nfl_data`` that has a 'scouting' column gets each
    report passed through ``analyze_scouting_report``; the returned
    dictionaries are expanded into one column per label and appended to the
    frame, and the raw 'scouting' column is removed. Frames without a
    'scouting' column are left untouched.

    Args:
        nfl_data: dict mapping a position key to a DataFrame. The dict is
            updated in place, and the frames that are processed are mutated
            as well (their 'scouting' column is rewritten, then dropped).
        candidate_labels: labels forwarded to ``analyze_scouting_report``.
            Any label missing from the expanded scores is added as a column
            of zeros.
        conjunctions: forwarded unchanged to ``analyze_scouting_report``.

    Returns:
        The same ``nfl_data`` dict that was passed in.
    """
    for pos, frame in nfl_data.items():
        if 'scouting' not in frame.columns:
            continue

        print(f'Applying sentiment analysis for position: {pos} (Data= {len(frame)})')

        # Missing reports become empty strings so every value is a str.
        frame['scouting'] = frame['scouting'].fillna('').astype(str)

        def score_report(text):
            return analyze_scouting_report(text, candidate_labels, conjunctions)

        # One dict per report, then one column per dict key.
        per_report_scores = frame['scouting'].apply(score_report)
        score_columns = per_report_scores.apply(pd.Series)

        absent_labels = [
            label for label in candidate_labels if label not in score_columns.columns
        ]
        for label in absent_labels:
            score_columns[label] = 0

        frame.drop(columns=['scouting'], inplace=True)

        # Replacing the value of an existing key is safe while iterating.
        nfl_data[pos] = pd.concat([frame, score_columns], axis=1)

    return nfl_data



In [None]:
# Lookup table that is later joined onto the cluster results via 'player_id'.
player_id_mapping = pd.read_csv("player_id_mapping.csv")

# Zero-shot classifier: called as classifier(fragment, candidate_labels) in
# analyze_scouting_report to score a fragment against every label.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# First sentiment model; analyze_scouting_report only uses its confidence score.
roberta_analyzer = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")

# Second sentiment model; its POSITIVE/NEGATIVE label decides the sign of the score.
distilbert_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

In [None]:
def analyze_scouting_report(scouting_report, candidate_labels, conjunctions):
    """Score the sentiment of a scouting report for each candidate label.

    Steps:
      1. Split the report into sentences on '.', then split each sentence at
         the first conjunction (in ``conjunctions`` order) that occurs in it
         as a whole word or phrase.
      2. Drop fragments shorter than three words.
      3. Run the module-level zero-shot ``classifier`` on each fragment and
         assign the fragment to every label whose score exceeds 0.2.
      4. For each label, run both module-level sentiment models on its
         fragments and average the signed scores.

    Args:
        scouting_report: the report text. Anything that is not a non-blank
            string yields an all-zero result.
        candidate_labels: labels to score; these are the keys of the result.
        conjunctions: words/phrases to split sentences on. Matching is
            case-sensitive and restricted to word boundaries, so 'for' does
            not split "performer" and 'as' does not split "has".

    Returns:
        dict mapping every candidate label to the mean signed sentiment of
        its fragments, or 0 when no fragment was assigned to that label.
    """
    # Local import: the notebook's import cell does not bring in `re`.
    import re

    if not isinstance(scouting_report, str) or not scouting_report.strip():
        return {label: 0 for label in candidate_labels}

    sentences = [s.strip() for s in scouting_report.split('.') if s.strip()]

    # A plain `conj in sentence` test is a substring match and cuts words in
    # half ('or' inside "work", 'for' inside "before"). Anchor every
    # conjunction on word boundaries instead; compile once per call.
    conjunction_patterns = [
        re.compile(r'\b' + re.escape(conj) + r'\b') for conj in conjunctions
    ]

    def split_by_conjunctions(sentence):
        # Only the first conjunction (in list order) found in the sentence is
        # used as the separator.
        for pattern in conjunction_patterns:
            if pattern.search(sentence):
                return [part.strip() for part in pattern.split(sentence) if part.strip()]
        return [sentence.strip()]

    # fragment text -> {label: relevance score}; identical fragments share one entry.
    relevance_scores = {}

    for sentence in sentences:
        fragments = [
            frag for frag in split_by_conjunctions(sentence) if len(frag.split()) >= 3
        ]

        for fragment in fragments:
            try:
                result = classifier(fragment, candidate_labels)
                relevance_scores[fragment] = dict(zip(result['labels'], result['scores']))
            except ValueError:
                # Best effort: skip fragments the classifier rejects.
                print('Relevance score issue')

    # A fragment may be relevant to several labels at once.
    threshold = 0.2
    grouped_fragments = {label: [] for label in candidate_labels}

    for fragment, scores in relevance_scores.items():
        for label, score in scores.items():
            if score > threshold:
                grouped_fragments[label].append(fragment.strip())

    sentiment_results = {}

    for category, fragments in grouped_fragments.items():
        sentiment_scores = []

        for fragment in fragments:
            try:
                # NOTE(review): only RoBERTa's confidence is read; its own
                # label is ignored and DistilBERT alone sets the sign.
                # Confirm this weighting is intended.
                roberta_confidence = roberta_analyzer(fragment)[0]['score']

                distilbert_sentiment = distilbert_analyzer(fragment)[0]
                distilbert_label = distilbert_sentiment['label']
                distilbert_score = distilbert_sentiment['score']

                if distilbert_label == 'POSITIVE':
                    combined_score = distilbert_score * roberta_confidence
                elif distilbert_label == 'NEGATIVE':
                    combined_score = -distilbert_score * roberta_confidence
                else:
                    combined_score = 0

                sentiment_scores.append(combined_score)

            except Exception:
                # Best effort: a failing fragment is left out of the average.
                print('Issue with sentiment analysis')

        sentiment_results[category] = (
            sum(sentiment_scores) / len(sentiment_scores) if sentiment_scores else 0
        )

    return sentiment_results

In [None]:
# Replace every position's 'scouting' text with per-label sentiment columns.
qual_map = apply_scouting_analysis(qual_map, candidate_labels, conjunctions)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the label columns separately for every position. A position
# is only scaled when its frame carries every candidate label; otherwise the
# frame is stored back unchanged.
for position in qual_map:
    df = qual_map[position]
    has_every_label = all(label in df.columns for label in candidate_labels)
    if has_every_label:
        df[candidate_labels] = MinMaxScaler().fit_transform(df[candidate_labels])

    qual_map[position] = df


In [None]:
# Spot-check the 'QB' frame after scoring and scaling.
print(qual_map['QB'])

# Open Competition

In [None]:
def select_important_features(lib, candidate_labels, top_n=3, random_seed=42):
    """Keep the ``top_n`` highest-ranked feature columns for every position.

    For each position a random forest is fitted on all columns except
    'player_id' and 'cluster', and the columns are ranked by the forest's
    feature importances.

    NOTE(review): the forest is fitted against a randomly generated 0/1
    target, so the ranking is reproducible for a given seed but is not tied
    to any real outcome. Confirm this is intended.
    NOTE(review): ``candidate_labels`` is accepted but never read.

    Args:
        lib: dict mapping position -> {'DataFrame': frame, 'Optimal_k': k}.
            Each frame must contain a 'player_id' column.
        candidate_labels: unused.
        top_n: number of feature columns to keep per position.
        random_seed: seeds NumPy's global RNG (a side effect that outlives
            this call) and the random forest.

    Returns:
        dict mapping position -> {'DataFrame': reduced frame holding the
        selected columns plus 'player_id', 'Optimal_k': carried over}.
    """
    np.random.seed(random_seed)
    selected_by_position = {}

    for pos, entry in lib.items():
        frame = entry['DataFrame']
        ids = frame['player_id']

        # Everything except the identifier (and a cluster column, if present)
        # is treated as a feature.
        features = frame.drop(columns=['player_id', 'cluster'], errors='ignore')

        noise_target = np.random.randint(0, 2, size=len(features))

        forest = RandomForestClassifier(n_estimators=100, random_state=random_seed)
        forest.fit(features, noise_target)

        ranking = pd.DataFrame(
            {'Feature': features.columns, 'Importance': forest.feature_importances_}
        ).sort_values(by='Importance', ascending=False)
        selected_features = ranking['Feature'].iloc[:top_n].tolist()

        reduced = frame[selected_features].copy()
        reduced['player_id'] = ids

        selected_by_position[pos] = {
            'DataFrame': reduced,
            'Optimal_k': entry['Optimal_k'],
        }

        print(f"Position: {pos} - Selected Features: {selected_features}")

    return selected_by_position

In [None]:
def get_elbow(pos_mapping):
    """Estimate a cluster count per position with the elbow method and plot it.

    For every position, KMeans is fitted for k = 2 .. min(n_rows, 15) - 1 on
    all columns except 'player_id'. The knee of the inertia curve, as found
    by ``KneeLocator``, is taken as the cluster count. One elbow plot per
    position is drawn on a three-column grid.

    Args:
        pos_mapping: dict mapping position -> DataFrame. Each frame must
            contain a 'player_id' column; all other columns are fed to KMeans.

    Returns:
        dict mapping position -> {'DataFrame': the input frame,
        'Optimal_k': chosen k}. When no knee is found (or the frame is too
        small to try any k) 'Optimal_k' falls back to 3.
    """
    def elbow(df, name, ax):
        features = df.drop(columns=["player_id"])

        cluster_range = range(2, min(len(features), 15))
        inertias = []
        for k in cluster_range:
            kmeans = KMeans(n_clusters=k, random_state=42)
            kmeans.fit(features)
            inertias.append(kmeans.inertia_)

        # With two rows or fewer there is no k to try; skip the knee search
        # rather than handing KneeLocator empty input.
        optimal_k = None
        if inertias:
            knee_locator = KneeLocator(cluster_range, inertias, curve="convex", direction="decreasing")
            optimal_k = knee_locator.knee

        # If no optimal k set to 3
        if optimal_k is None:
            optimal_k = 3
        optimal_k = int(optimal_k)

        ax.plot(cluster_range, inertias, marker='o')
        ax.axvline(x=optimal_k, color="r", linestyle="--", label=f"Optimal k={optimal_k}")
        ax.set_title(f'Elbow Method {name}')
        ax.set_xlabel('Number of Clusters')
        ax.set_ylabel('Inertia')
        ax.legend()

        return optimal_k

    qualitative = {}

    num_positions = len(pos_mapping)
    cols = 3
    # Ceiling division: exactly as many rows as needed (at least one), so a
    # position count that is a multiple of three gets no blank extra row.
    rows = max(1, -(-num_positions // cols))
    # squeeze=False always yields a 2-D array of axes, whatever the grid size.
    fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows), squeeze=False)
    axes = axes.flatten()

    for i, (pos, df) in enumerate(pos_mapping.items()):
        opt_k = elbow(df, pos, axes[i])
        qualitative[pos] = {'DataFrame': df, 'Optimal_k': opt_k}

    # Hide grid cells that did not receive a plot.
    for spare_ax in axes[num_positions:]:
        spare_ax.axis('off')

    plt.tight_layout()
    plt.show()

    return qualitative

# Pick a cluster count per position and bundle it with that position's frame.
qualitative = get_elbow(qual_map)

In [None]:
# Reduce every position's frame to its three top-ranked feature columns (plus 'player_id').
filtered_qualitative = select_important_features(qualitative, candidate_labels, top_n=3)

In [None]:
def clustering(lib):
    """Cluster every position's frame with KMeans and attach the labels.

    For each position a small grid of KMeans settings (init method and
    iteration cap, with the cluster count fixed to the entry's 'Optimal_k')
    is scored by silhouette score. The labels of the best-scoring
    configuration are written to a new 'cluster' column as strings.

    Args:
        lib: dict mapping position -> {'DataFrame': frame, 'Optimal_k': k}.
            Each frame must contain a 'player_id' column; all other columns
            are used as clustering features. The frames are modified in
            place (a 'cluster' column is added).

    Returns:
        dict mapping position -> that position's frame, now with 'cluster'.
    """
    def evaluate_kmeans(params, data):
        # Return the labels together with the score so the winning
        # configuration does not have to be fitted a second time.
        model = KMeans(**params)
        labels = model.fit_predict(data)
        return silhouette_score(data, labels), labels

    for pos, value in lib.items():
        df = value['DataFrame']
        k = value['Optimal_k']

        features = df.drop(columns=['player_id'])

        search_space = ParameterGrid({
            'n_clusters': [k],
            'init': ['k-means++', 'random'],
            'max_iter': [50, 100, 300],
            'random_state': [42],
        })

        # Start from -inf rather than -1 so that even the lowest possible
        # silhouette score is accepted and a winner is always recorded.
        best_score = float('-inf')
        best_labels = None

        # Grid search; on ties the first configuration wins.
        for params in search_space:
            score, labels = evaluate_kmeans(params, features)
            if score > best_score:
                best_score = score
                best_labels = labels

        # random_state is fixed, so these labels equal what a refit of the
        # best configuration would produce.
        df['cluster'] = best_labels.astype(str)

    return {pos: info['DataFrame'] for pos, info in lib.items()}

# Assign a cluster to every player; result maps position -> frame with a 'cluster' column.
qual_cluster= clustering(filtered_qualitative)

In [None]:
def visualize_clusters_with_table(qualitative, player_id_mapping):
    """Plot each position's clusters in 2-D PCA space with a roster table.

    For every position the feature columns are projected onto two principal
    components and drawn as a scatter plot coloured by cluster. Below the
    plot a table lists, cluster by cluster, the players it contains.

    Args:
        qualitative: dict mapping position -> DataFrame with 'player_id',
            'cluster' and the feature columns. The frames are not modified.
        player_id_mapping: DataFrame joined on 'player_id'; it must provide
            the 'player_name' and 'pos_abbr' columns used in the table.

    Returns:
        None. One figure per position is shown.
    """
    for pos, df in qualitative.items():
        # Only the clustering features go into PCA. They are picked before
        # the merge so neither the row index nor any mapping column (names,
        # draft year, ...) can leak into the projection.
        feature_cols = [col for col in df.columns if col not in ('player_id', 'cluster')]

        df = df.reset_index(drop=True)

        pca = PCA(n_components=2)
        pca_features = pca.fit_transform(df[feature_cols])

        # Add PCA features back to the DataFrame
        df['PCA1'] = pca_features[:, 0]
        df['PCA2'] = pca_features[:, 1]

        df = df.merge(player_id_mapping, on='player_id', how='left')

        # Cluster labels are stored as strings; convert to an ordered
        # categorical so legend and grouping follow numeric order.
        df['cluster'] = df['cluster'].astype(int)
        df['cluster'] = pd.Categorical(df['cluster'], categories=sorted(df['cluster'].unique()), ordered=True)

        sorted_clusters = sorted(df['cluster'].unique())

        plt.figure(figsize=(10, 6))
        sns.scatterplot(
            data=df,
            x='PCA1',
            y='PCA2',
            hue='cluster',
            palette='viridis',
            s=100,
            alpha=0.7,
            hue_order=sorted_clusters
        )
        plt.title(f'Cluster Visualization of {pos} (PCA)', fontsize=16)
        plt.xlabel('PCA Component 1')
        plt.ylabel('PCA Component 2')
        plt.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')

        # One list per cluster: a heading followed by "name (position)" entries.
        cluster_info = []
        for cluster, group in df.groupby('cluster', observed=True):
            cluster_text = [f"Cluster {cluster}"] + [
                f"{row['player_name']} ({row['pos_abbr']})" for _, row in group.iterrows()
            ]
            cluster_info.append(cluster_text)

        # Flatten into a single list, with a blank line after each cluster.
        max_rows_per_column = 20
        flattened_table = []
        for cluster_text in cluster_info:
            flattened_table.extend(cluster_text)
            flattened_table.append('')

        # Cut the list into columns of at most max_rows_per_column entries.
        num_columns = math.ceil(len(flattened_table) / max_rows_per_column)
        table_data = [
            flattened_table[i * max_rows_per_column:(i + 1) * max_rows_per_column]
            for i in range(num_columns)
        ]

        # Pad the columns to equal length so they can be transposed into rows.
        max_col_length = max(len(column) for column in table_data)
        table_data = [
            column + [''] * (max_col_length - len(column)) for column in table_data
        ]

        # Extra axes placed below the scatter plot to hold the table.
        table_ax = plt.gcf().add_axes([0.1, -0.4, 0.8, 0.3])
        table_ax.axis('off')
        table = table_ax.table(
            cellText=list(zip(*table_data)),
            cellLoc='left',
            loc='center',
        )
        table.auto_set_font_size(False)
        table.set_fontsize(10)
        table.auto_set_column_width(col=list(range(len(table_data))))

        # Show the plot
        plt.subplots_adjust(bottom=0.12)
        plt.show()


In [None]:
# Draw one PCA scatter plot plus roster table per position.
visualize_clusters_with_table(qual_cluster, player_id_mapping)

In [None]:
# Attach the player_id_mapping columns to every position's cluster table.
# Entries are replaced inside the existing dict; rows without a match in the
# mapping are kept (left join).
for pos in qual_cluster:
    qual_cluster[pos] = qual_cluster[pos].merge(player_id_mapping, on='player_id', how='left')

In [None]:
import pickle

# Persist the per-position cluster tables exactly as they are in memory.
with open('qual_assignments.pkl', 'wb') as f:
    pickle.dump(qual_cluster, f)

# Stack all positions into one table, tagging every row with its position key.
qual_cluster_df = pd.concat([df.assign(position=pos) for pos, df in qual_cluster.items()])

# Full table with every column. (The original wrote from `quant_cluster_df`,
# a name that is never defined in this notebook.)
qual_cluster_df.to_csv('qual_assignments_list.csv', index=False)

# Slim table: identifiers and the cluster assignment only.
qual_cluster_df = qual_cluster_df[['player_id', 'cluster', 'position', 'player_name', 'draft_year']]

# Save to CSV
qual_cluster_df.to_csv('qual_assignments.csv', index=False)

# 