## Import relevent packages

In [1]:
# First, ensure that any previous installations or upgrades are reflected in the current kernel
# by using the `pip` magic command.

def upgrade_pip():
    """
    Upgrades pip to the latest version to ensure compatibility and access to the latest packages.
    """
    try:
        print("Upgrading pip to the latest version...")
        # Use the pip magic command to upgrade pip
        !{sys.executable} -m pip install --upgrade pip
        print("Successfully upgraded pip.\n")
    except Exception as upgrade_error:
        print(f"Failed to upgrade pip: {upgrade_error}")
        print("Continuing with existing pip version.\n")

def install_packages(package_list):
    """
    Attempts to install the specified packages using pip.
    If pip fails, it retries using pip3.

    Parameters:
        package_list (list): A list of package names to install.
    """
    try:
        print(f"Attempting to install packages {package_list} using pip...")
        # Use the pip magic command to install packages
        !{sys.executable} -m pip install {" ".join(package_list)}
        print(f"Successfully installed packages using pip.\n")
    except Exception as pip_error:
        print(f"pip installation failed: {pip_error}")
        print(f"Attempting to install packages {package_list} using pip3...")
        try:
            # Attempt to install using pip3
            !pip3 install {" ".join(package_list)}
            print(f"Successfully installed packages using pip3.\n")
        except Exception as pip3_error:
            print(f"pip3 installation failed: {pip3_error}")
            print(f"Failed to install packages using both pip and pip3.")
            raise RuntimeError("Package installation failed.") from pip3_error

def verify_imports(import_statements):
    """
    Attempts to import each specified module and function to verify successful installation.

    Parameters:
        import_statements (list): A list of import statements as strings.
    """
    print("Verifying package installations by importing them...")
    for stmt in import_statements:
        try:
            exec(stmt)
            print(f"Successfully executed: {stmt}")
        except ImportError as import_error:
            print(f"Failed to execute '{stmt}': {import_error}")
            raise ImportError(f"Import failed for statement: {stmt}") from import_error
        except Exception as e:
            print(f"An error occurred while executing '{stmt}': {e}")
            raise
    print("All specified imports executed successfully.\n")

def setUpEnvironment():
    # List of packages to install
    packages_to_install = [
        "basketball-reference-scraper",
        "pandas",
        "numpy",
        "scikit-learn",
    ]

    # List of import statements to verify installations
    import_statements = [
        "from basketball_reference_scraper.teams import get_roster, get_team_stats, get_opp_stats, get_roster_stats, get_team_misc",
        "from basketball_reference_scraper.players import get_stats, get_game_logs",
        "import pandas as pd",
        "import sklearn",
        "from datetime import timedelta",
        "from datetime import datetime"
    ]

    # Upgrade pip before attempting installations
    upgrade_pip()

    # Install the required packages
    install_packages(packages_to_install)

    # # Verify installations by executing import statements
    # verify_imports(import_statements)


In [2]:
setUpEnvironment()
from basketball_reference_scraper.players import get_stats, get_game_logs
from basketball_reference_scraper.teams import get_roster, get_team_stats, get_opp_stats, get_roster_stats, get_team_misc
import pandas as pd
from datetime import timedelta, datetime
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler
from tqdm import tqdm
import logging
from contextlib import redirect_stdout, redirect_stderr
from IPython.display import clear_output
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns


RED = "\033[91m"
GREEN = "\033[92m"
YELLOW = "\033[93m"
BLUE = "\033[94m"
MAGENTA = "\033[95m"
CYAN = "\033[96m"
RESET = "\033[0m"


Upgrading pip to the latest version...
zsh:1: parse error near `-m'
Successfully upgraded pip.

Attempting to install packages ['basketball-reference-scraper', 'pandas', 'numpy', 'scikit-learn'] using pip...
zsh:1: parse error near `-m'
Successfully installed packages using pip.



## Example import statement for a players game by game stats for a given season

In [45]:
def clear_jupyter():
    clear_output(wait=True)


import pandas as pd

def get_player_season(player_name, season_year: int, include_playoffs: bool, position: int):
    # Scrape for data
    df = pd.DataFrame(get_game_logs(player_name, season_year, playoffs=include_playoffs))
    clear_jupyter()
    df.to_csv("tester.csv", index=False)
    
    # Choose the columns you are interested in
    columns_to_keep = ["DATE", "PTS", "AST", "TRB"]
    df_selected = df.loc[:, columns_to_keep]
    
    # Convert PTS, AST, TRB to numeric, coercing errors to NaN
    df_numeric = df_selected[['PTS', 'AST', 'TRB']].apply(pd.to_numeric, errors='coerce')
    
    # Create a mask where all PTS, AST, TRB are not NaN (i.e., are numeric)
    mask = df_numeric.notnull().all(axis=1)
    
    # Apply the mask to filter out rows with non-numeric values
    df_cleaned = df_selected[mask].reset_index(drop=True)
    
    # Add the player's name and position using .loc
    df_cleaned.loc[:, 'Player'] = player_name
    df_cleaned.loc[:, 'Position'] = position
    
    return df_cleaned


In [4]:
def collect_player_stats(players, target_year, include_playoffs=False, output_file="MVP_stats.csv"):
    """
    Collects season statistics for a list of players with logging and progress indicators,
    and saves the combined data to a CSV file.

    Parameters:
    - players (list of str): List of player names to fetch stats for.
    - target_year (int): The target year for which to fetch stats.
    - include_playoffs (bool): Whether to include playoff stats. Default is False.
    - output_file (str): The filename for the output CSV. Default is "MVP_stats.csv".

    Returns:
    - pd.DataFrame: Combined DataFrame containing all players' stats.
    """
    
    # ANSI escape codes for green color
    GREEN = "\033[92m"
    RESET = "\033[0m"
    
    # Define a custom bar format with green color for the progress bar
    custom_bar_format = (
        "{l_bar}"
        f"{GREEN}"  # Start green color
        "{bar}"
        f"{RESET}"  # Reset color
        "| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]"
    )
    
    # Initialize a list to store individual player DataFrames
    mvp_stats_list = []

    # Iterate over each player with a green progress bar
    for intTicker, player in tqdm(
        enumerate(players, start=1),
        total=len(players),
        desc="Fetching Player Stats",
        bar_format=custom_bar_format
    ):
        try:
            # Fetch the player's season statistics
            player_stats = get_player_season(player, target_year, include_playoffs, intTicker)
            # Append the player's stats to the list
            mvp_stats_list.append(player_stats)
            
            # Log the successful fetch
            logging.info(f"Successfully fetched stats for {player}.")
        
        except Exception as e:
            # Log any errors encountered
            logging.error(f"Error fetching stats for {player}: {e}")
            continue  # Skip to the next player in case of an error

    # Concatenate all player stats into a single DataFrame
    if mvp_stats_list:
        try:
            mvp_stats = pd.concat(mvp_stats_list, ignore_index=True)
            # Save the combined DataFrame to a CSV file
            mvp_stats.to_csv(output_file, index=False)
            logging.info(f"Combined statistics saved to {output_file}.")
        except Exception as e:
            logging.error(f"Error saving DataFrame to CSV: {e}")
            mvp_stats = pd.DataFrame()  # Return an empty DataFrame in case of failure
    else:
        mvp_stats = pd.DataFrame()
        logging.warning("No player stats were fetched. Returning an empty DataFrame.")

    return mvp_stats


In [5]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler
import pandas as pd
from tqdm import tqdm

def normalize_player_stats(df, method='min-max'):
    """
    Normalizes the 'PTS', 'AST', and 'TRB' columns in the DataFrame.

    Parameters:
    - df (pd.DataFrame): The input DataFrame containing player statistics.
    - method (str): Normalization method - 'min-max', 'z-score', or 'max-abs'.

    Returns:
    - pd.DataFrame: DataFrame with normalized 'PTS', 'AST', and 'TRB' columns.
    """
    # ANSI escape codes for blue color
    BLUE = "\033[94m"
    RESET = "\033[0m"
    
    # Define a custom bar format with blue color for the progress bar
    custom_bar_format = (
        "{l_bar}"
        f"{GREEN}"  # Start blue color
        "{bar}"
        f"{RESET}"  # Reset color
        " | {n_fmt}/{total_fmt} [{elapsed}<{remaining}]"
    )
    
    # Define the columns to normalize
    columns_to_normalize = ['PTS', 'AST', 'TRB']
    
    # Check if the necessary columns exist in the DataFrame
    for col in columns_to_normalize:
        if col not in df.columns:
            raise KeyError(f"Column '{col}' not found in DataFrame.")
    
    # Choose the normalization method
    if method == 'min-max':
        scaler = MinMaxScaler()
    elif method == 'z-score':
        scaler = StandardScaler()
    elif method == 'max-abs':
        scaler = MaxAbsScaler()
    else:
        raise ValueError("Unsupported normalization method. Choose 'min-max', 'z-score', or 'max-abs'.")
    
    # Iterate over each column with a blue progress bar
    for col in tqdm(
        columns_to_normalize, 
        desc="Normalizing Columns", 
        unit="column",
        bar_format=custom_bar_format
    ):
        # Reshape the data for the scaler and overwrite the column with normalized values
        df[col] = scaler.fit_transform(df[[col]])
    
    return df


In [6]:
import os
from datetime import timedelta
import pandas as pd
from tqdm import tqdm

def separate_by_date(df, save_paths=None):
    """
    Separates the DataFrame into three equal-duration date ranges, prints the divisions,
    and optionally saves each division to specified file paths with a progress bar.

    Parameters:
    df (pd.DataFrame): The input DataFrame with a 'DATE' column.
    save_paths (list of str, optional): A list of three file paths to save the divisions.
                                        The list should contain exactly three strings.
                                        Example: ['division1.csv', 'division2.csv', 'division3.csv']

    Returns:
    tuple: A tuple containing three DataFrames corresponding to the three date divisions.
    """
    
    # ANSI escape codes for red color
    RED = "\e[0;36m"
    RESET = "\033[0m"
    
    # Define a custom bar format with red color for the progress bar
    custom_bar_format = (
        "{l_bar}"
        f"{GREEN}"  # Start red color
        "{bar}"
        f"{RESET}"  # Reset color
        "| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]"
    )
    
    # Ensure the 'DATE' column is in datetime format
    df['DATE'] = pd.to_datetime(df['DATE'])
    
    # Find the minimum and maximum dates in the DataFrame
    min_date = df['DATE'].min()
    max_date = df['DATE'].max()
    
    # Calculate the total number of days in the range
    total_days = (max_date - min_date).days + 1  # +1 to include both start and end dates
    
    # Calculate the number of days per division
    days_per_division = total_days // 3
    remainder_days = total_days % 3  # To handle cases where total_days is not perfectly divisible by 3
    
    # Define the end dates for each division
    first_end = min_date + timedelta(days=days_per_division - 1)
    second_end = first_end + timedelta(days=days_per_division)
    
    # Distribute the remainder_days
    if remainder_days == 1:
        second_end += timedelta(days=1)
    elif remainder_days == 2:
        first_end += timedelta(days=1)
        second_end += timedelta(days=1)
    
    # Define the three date ranges
    division1 = (df['DATE'] >= min_date) & (df['DATE'] <= first_end)
    division2 = (df['DATE'] > first_end) & (df['DATE'] <= second_end)
    division3 = (df['DATE'] > second_end) & (df['DATE'] <= max_date)
    
    # Create separate DataFrames for each division
    df_division1 = df[division1].reset_index(drop=True)
    df_division2 = df[division2].reset_index(drop=True)
    df_division3 = df[division3].reset_index(drop=True)
    
    # Print the divisions
    print("Date Divisions:")
    print(f"Division 1: {min_date.date()} to {first_end.date()}")
    print(f"Division 2: {first_end.date() + timedelta(days=1)} to {second_end.date()}")
    print(f"Division 3: {second_end.date() + timedelta(days=1)} to {max_date.date()}")
    
    # If save_paths is provided, save each division to the respective path with a progress bar
    if save_paths:
        if not isinstance(save_paths, list):
            raise TypeError("save_paths must be a list of three file path strings.")
        if len(save_paths) != 3:
            raise ValueError("save_paths must contain exactly three file path strings.")
        
        # Initialize tqdm progress bar for saving divisions with red color
        print("\nSaving Divisions to CSV Files:")
        for i, (division_df, path) in enumerate(tqdm(
            zip([df_division1, df_division2, df_division3], save_paths), 
            total=3, 
            desc="Saving Divisions", 
            unit="file",
            bar_format=custom_bar_format
        ), start=1):
            division_df.to_csv(path, index=False)
            print(f"Division {i} saved to {path}")
    
    return df_division1, df_division2, df_division3


  RED = "\e[0;36m"


In [7]:
# ### 2024 example

# players = [
#     "Nikola Jokić",
#     "Shai Gilgeous-Alexander",
#     "Luka Dončić",
#     "Giannis Antetokounmpo",
#     "Jalen Brunson",
#     "Jayson Tatum",
#     "Anthony Edwards",
#     "Domantas Sabonis",
#     "Kevin Durant"
# ]

# target_year = 2024
# include_playoffs = False

# ###

# mvp_stats = collect_player_stats(players, target_year, include_playoffs, output_file="MVP_stats_DIRTY.csv")
# mvp_stats = normalize_player_stats(mvp_stats, 'min-max')
# temp1, temp2 , temp3 = separate_by_date(mvp_stats, ["first.csv", "second.csv", "third.csv"])



Fetching Player Stats: 100%|[92m██████████[0m| 9/9 [00:58<00:00]
Normalizing Columns: 100%|[92m██████████[0m | 3/3 [00:00<00:00]


Date Divisions:
Division 1: 2023-10-24 to 2023-12-20
Division 2: 2023-12-21 to 2024-02-16
Division 3: 2024-02-17 to 2024-04-14

Saving Divisions to CSV Files:


Saving Divisions: 100%|[92m██████████[0m| 3/3 [00:00<00:00]

Division 1 saved to first.csv
Division 2 saved to second.csv
Division 3 saved to third.csv





In [8]:
# # Define features and target
# X = mvp_stats[['PTS', 'AST', 'TRB']]
# y = mvp_stats['Position']

# # Split the data
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.3, random_state=42, stratify=y
# )

# # Initialize the Logistic Regression model
# model = LogisticRegression(multi_class='multinomial', solver='lbfgs')

# # Train the model
# model.fit(X_train, y_train)


In [9]:
# # Make predictions
# y_pred = model.predict(X_test)

# # Calculate accuracy
# accuracy = accuracy_score(y_test, y_pred)
# print(f"\nModel Accuracy: {accuracy:.2f}")

# # Confusion Matrix
# conf_matrix = confusion_matrix(y_test, y_pred)
# print("\nConfusion Matrix:")
# print(conf_matrix)

# # Classification Report
# class_report = classification_report(y_test, y_pred)
# print("\nClassification Report:")
# print(class_report)



Model Accuracy: 0.41

Confusion Matrix:
[[ 6  1  0  2  1  0  0 14  0]
 [ 0  1  2  1 11  2  3  0  3]
 [ 2  0 14  1  2  0  1  1  0]
 [ 5  0  1  5  1  2  1  7  0]
 [ 0  0  1  0 19  0  2  1  0]
 [ 0  0  1  3  3  7  4  0  4]
 [ 0  0  0  4  5  0 10  0  5]
 [ 0  0  1  1  0  1  1 21  0]
 [ 1  2  2  0  5  1  8  1  2]]

Classification Report:
              precision    recall  f1-score   support

           1       0.43      0.25      0.32        24
           2       0.25      0.04      0.07        23
           3       0.64      0.67      0.65        21
           4       0.29      0.23      0.26        22
           5       0.40      0.83      0.54        23
           6       0.54      0.32      0.40        22
           7       0.33      0.42      0.37        24
           8       0.47      0.84      0.60        25
           9       0.14      0.09      0.11        22

    accuracy                           0.41       206
   macro avg       0.39      0.41      0.37       206
weighted avg  

In [10]:
# # After training the model, extract the weights (coefficients)
# weights = model.coef_
# intercepts = model.intercept_

# # Print the weights and intercepts
# print("Weights (coefficients):", weights)
# print("Intercepts:", intercepts)

# # Calculate the sum of the absolute values of the coefficients for each feature
# feature_importance = abs(weights).sum(axis=0)
# importance_dict = dict(zip(['PTS', 'AST', 'TRB'], feature_importance))

# # Display the importance of each feature
# print("Feature importance:")
# for feature, importance in importance_dict.items():
#     print(f"{feature}: {importance}")


# # Calculate the average of the absolute values of the coefficients for each feature
# feature_importance_avg = abs(weights).mean(axis=0)
# importance_avg_dict = dict(zip(['PTS', 'AST', 'TRB'], feature_importance_avg))

# # Display the average importance of each feature
# print("Average feature importance:")
# for feature, importance in importance_avg_dict.items():
#     print(f"{feature}: {importance}")

# # Normalize the importance to get percentages
# total_importance = feature_importance.sum()
# feature_importance_percentage = {
#     feature: (importance / total_importance) * 100
#     for feature, importance in importance_dict.items()
# }

# # Display the percentage contribution of each feature
# print("Feature importance as percentages:")
# for feature, percentage in feature_importance_percentage.items():
#     print(f"{feature}: {percentage:.2f}%")



Weights (coefficients): [[-0.05057482  2.12475742  3.16140349]
 [ 0.91066737 -0.24337766 -2.3828792 ]
 [ 2.27328011  2.95990346  0.37550413]
 [ 1.26566077 -0.80514859  2.59703798]
 [ 0.40833222  0.2199028  -4.58580151]
 [-0.6149669  -2.11328301  0.09396716]
 [-0.39411159 -1.12377577 -2.72326125]
 [-3.73065206  1.04004159  4.47757914]
 [-0.0676351  -2.05902022 -1.01354993]]
Intercepts: [-1.82471956  0.57598704 -2.19922444 -0.97901659  1.18644618  1.07884647
  1.52360808 -0.56611492  1.20418774]
Feature importance:
PTS: 9.715880930050394
AST: 12.689210524295387
TRB: 21.41098380270971
Average feature importance:
PTS: 1.079542325561155
AST: 1.4099122804772652
TRB: 2.3789982003010786
Feature importance as percentages:
PTS: 22.17%
AST: 28.96%
TRB: 48.87%


In [11]:

def scale_and_sum_stats(df, weights):
    """
    Scales the 'PTS', 'AST', and 'TRB' columns by the provided percentage weights and creates
    a new column with the weighted sum.

    Parameters:
    - df (pd.DataFrame): DataFrame containing 'PTS', 'AST', 'TRB', and other columns.
    - weights (dict): A dictionary with keys 'PTS', 'AST', and 'TRB' and their corresponding weights (in percentage).

    Returns:
    - pd.DataFrame: DataFrame with an additional column 'Weighted_Sum'.
    """
    # Ensure the weights are in decimal form (e.g., 0.2 for 20%)
    weights = {k: v / 100 for k, v in weights.items()}

    # Scale each column by its respective weight
    df['PTS_Weighted'] = df['PTS'] * weights['PTS']
    df['AST_Weighted'] = df['AST'] * weights['AST']
    df['TRB_Weighted'] = df['TRB'] * weights['TRB']
    
    # Create a new column that is the sum of the weighted stats
    df['Weighted_Sum'] = df['PTS_Weighted'] + df['AST_Weighted'] + df['TRB_Weighted']

    return df


In [12]:
# scaled_mvp_stats = scale_and_sum_stats(mvp_stats, feature_importance_percentage)
# scaled_mvp_stats.to_csv("scaled_mvp_values.csv", index=False)

In [13]:
# firstSection, secondSection , thirdSection = separate_by_date(scaled_mvp_stats, ["first_adjusted.csv", "second_adjusted.csv", "third_adjusted.csv"])

Date Divisions:
Division 1: 2023-10-24 to 2023-12-20
Division 2: 2023-12-21 to 2024-02-16
Division 3: 2024-02-17 to 2024-04-14

Saving Divisions to CSV Files:


Saving Divisions: 100%|[92m██████████[0m| 3/3 [00:00<00:00]

Division 1 saved to first_adjusted.csv
Division 2 saved to second_adjusted.csv
Division 3 saved to third_adjusted.csv





In [28]:
import pandas as pd

def summarize_player_performance(data: pd.DataFrame, sort_by_weighted_sum: bool = False, ascending: bool = False) -> pd.DataFrame:
    """
    Summarizes player performance by calculating the average Weighted_Sum for each player.
    Optionally sorts the summarized data by Weighted_Sum.
    
    Parameters:
    - data (pd.DataFrame): Input DataFrame with columns ['player', 'Position', 'Weighted_Sum']
    - sort_by_weighted_sum (bool): Whether to sort the summarized data by 'Weighted_Sum'. Default is False.
    - ascending (bool): Sort order. True for ascending, False for descending. Default is False.
    
    Returns:
    - pd.DataFrame: Summarized DataFrame with columns ['Player', 'Position', 'Weighted_Sum']
      Sorted by 'Weighted_Sum' if sort_by_weighted_sum is True.
    """
    # Validate that necessary columns exist
    required_columns = {'Player', 'Position', 'Weighted_Sum'}
    if not required_columns.issubset(data.columns):
        missing = required_columns - set(data.columns)
        raise ValueError(f"The following required columns are missing from the input data: {missing}")
    
    # Handle missing values in key columns
    data_clean = data.dropna(subset=['Player', 'Position', 'Weighted_Sum'])
    
    # Ensure Weighted_Sum is numeric
    data_clean['Weighted_Sum'] = pd.to_numeric(data_clean['Weighted_Sum'], errors='coerce')
    data_clean = data_clean.dropna(subset=['Weighted_Sum'])
    
    # Group by player and Position to compute the average Weighted_Sum
    summary = data_clean.groupby(['Player', 'Position'], as_index=False)['Weighted_Sum'].mean()
    
    # Rename 'player' to 'Player' to match desired output
    summary.rename(columns={'Player': 'Player'}, inplace=True)
    
    # Sort the summarized DataFrame if requested
    if sort_by_weighted_sum:
        summary = summary.sort_values(by='Weighted_Sum', ascending=ascending).reset_index(drop=True)
    
    return summary


def process_three_sections(
    section1: pd.DataFrame,
    section2: pd.DataFrame,
    section3: pd.DataFrame,
    ascending: bool = False
) -> tuple:
    """
    Processes three player performance DataFrames by summarizing and sorting each.

    Parameters:
    - section1 (pd.DataFrame): DataFrame for the first season section.
    - section2 (pd.DataFrame): DataFrame for the second season section.
    - section3 (pd.DataFrame): DataFrame for the third season section.
    - ascending (bool): Sort order for Weighted_Sum. True for ascending, False for descending. Default is False.

    Returns:
    - tuple: A tuple containing three summarized and sorted DataFrames (summary1, summary2, summary3).
    """
    summary1 = summarize_player_performance(
        data=section1,
        sort_by_weighted_sum=True,
        ascending=ascending
    )
    
    summary2 = summarize_player_performance(
        data=section2,
        sort_by_weighted_sum=True,
        ascending=ascending
    )
    
    summary3 = summarize_player_performance(
        data=section3,
        sort_by_weighted_sum=True,
        ascending=ascending
    )
    
    return summary1, summary2, summary3


In [33]:
# ### 2024 example

# players = [
#     "Nikola Jokić",
#     "Shai Gilgeous-Alexander",
#     "Luka Dončić",
#     "Giannis Antetokounmpo",
#     "Jalen Brunson",
#     "Jayson Tatum",
#     "Anthony Edwards",
#     "Domantas Sabonis",
#     "Kevin Durant"
# ]

# target_year = 2024
# include_playoffs = False

# ### everythin above are inputs
# ###
# ### everything below is the function

# mvp_stats = collect_player_stats(players, target_year, include_playoffs, output_file="MVP_stats_DIRTY.csv")
# mvp_stats = normalize_player_stats(mvp_stats, 'min-max')
# temp1, temp2 , temp3 = separate_by_date(mvp_stats, ["first.csv", "second.csv", "third.csv"])



# # Define features and target
# X = mvp_stats[['PTS', 'AST', 'TRB']]
# y = mvp_stats['Position']

# # Split the data
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.3, random_state=42, stratify=y
# )

# # Initialize the Logistic Regression model
# model = LogisticRegression(multi_class='multinomial', solver='lbfgs')

# # Train the model
# model.fit(X_train, y_train)


# # Make predictions
# y_pred = model.predict(X_test)

# # Calculate accuracy
# accuracy = accuracy_score(y_test, y_pred)
# print(f"\nModel Accuracy: {accuracy:.2f}")

# # Confusion Matrix
# conf_matrix = confusion_matrix(y_test, y_pred)
# print("\nConfusion Matrix:")
# print(conf_matrix)

# # Classification Report
# class_report = classification_report(y_test, y_pred)
# print("\nClassification Report:")
# print(class_report)

# # After training the model, extract the weights (coefficients)
# weights = model.coef_
# intercepts = model.intercept_

# # Print the weights and intercepts
# print("Weights (coefficients):", weights)
# print("Intercepts:", intercepts)

# # Calculate the sum of the absolute values of the coefficients for each feature
# feature_importance = abs(weights).sum(axis=0)
# importance_dict = dict(zip(['PTS', 'AST', 'TRB'], feature_importance))

# # Display the importance of each feature
# print("Feature importance:")
# for feature, importance in importance_dict.items():
#     print(f"{feature}: {importance}")


# # Calculate the average of the absolute values of the coefficients for each feature
# feature_importance_avg = abs(weights).mean(axis=0)
# importance_avg_dict = dict(zip(['PTS', 'AST', 'TRB'], feature_importance_avg))

# # Display the average importance of each feature
# print("Average feature importance:")
# for feature, importance in importance_avg_dict.items():
#     print(f"{feature}: {importance}")

# # Normalize the importance to get percentages
# total_importance = feature_importance.sum()
# feature_importance_percentage = {
#     feature: (importance / total_importance) * 100
#     for feature, importance in importance_dict.items()
# }

# # Display the percentage contribution of each feature
# print("Feature importance as percentages:")
# for feature, percentage in feature_importance_percentage.items():
#     print(f"{feature}: {percentage:.2f}%")




# scaled_mvp_stats = scale_and_sum_stats(mvp_stats, feature_importance_percentage)
# scaled_mvp_stats.to_csv("scaled_mvp_values.csv", index=False)

# firstSection, secondSection , thirdSection = separate_by_date(scaled_mvp_stats, ["first_adjusted.csv", "second_adjusted.csv", "third_adjusted.csv"])

# reducedFirst, reducedSecond, reducedThird = process_three_sections(firstSection, secondSection, thirdSection)

# print("\n\n Averaged per game score per player:\n",reducedFirst,"\n\n", reducedSecond,"\n\n" ,reducedThird)

Fetching Player Stats: 100%|[92m██████████[0m| 9/9 [00:58<00:00]
Normalizing Columns: 100%|[92m██████████[0m | 3/3 [00:00<00:00]


Date Divisions:
Division 1: 2023-10-24 to 2023-12-20
Division 2: 2023-12-21 to 2024-02-16
Division 3: 2024-02-17 to 2024-04-14

Saving Divisions to CSV Files:


Saving Divisions: 100%|[92m██████████[0m| 3/3 [00:00<00:00]


Division 1 saved to first.csv
Division 2 saved to second.csv
Division 3 saved to third.csv

Model Accuracy: 0.41

Confusion Matrix:
[[ 6  1  0  2  1  0  0 14  0]
 [ 0  1  2  1 11  2  3  0  3]
 [ 2  0 14  1  2  0  1  1  0]
 [ 5  0  1  5  1  2  1  7  0]
 [ 0  0  1  0 19  0  2  1  0]
 [ 0  0  1  3  3  7  4  0  4]
 [ 0  0  0  4  5  0 10  0  5]
 [ 0  0  1  1  0  1  1 21  0]
 [ 1  2  2  0  5  1  8  1  2]]

Classification Report:
              precision    recall  f1-score   support

           1       0.43      0.25      0.32        24
           2       0.25      0.04      0.07        23
           3       0.64      0.67      0.65        21
           4       0.29      0.23      0.26        22
           5       0.40      0.83      0.54        23
           6       0.54      0.32      0.40        22
           7       0.33      0.42      0.37        24
           8       0.47      0.84      0.60        25
           9       0.14      0.09      0.11        22

    accuracy                   

Saving Divisions: 100%|[92m██████████[0m| 3/3 [00:00<00:00]


Division 1 saved to first_adjusted.csv
Division 2 saved to second_adjusted.csv
Division 3 saved to third_adjusted.csv


 Averaged per game score per player:
                     Player  Position  Weighted_Sum
0             Nikola Jokić         1      0.463385
1              Luka Dončić         3      0.404270
2         Domantas Sabonis         8      0.402250
3    Giannis Antetokounmpo         4      0.387814
4             Jayson Tatum         6      0.312866
5  Shai Gilgeous-Alexander         2      0.302348
6             Kevin Durant         9      0.298921
7          Anthony Edwards         7      0.259820
8            Jalen Brunson         5      0.246055 

                     Player  Position  Weighted_Sum
0         Domantas Sabonis         8      0.483016
1              Luka Dončić         3      0.446423
2             Nikola Jokić         1      0.432856
3    Giannis Antetokounmpo         4      0.425768
4             Jayson Tatum         6      0.327738
5  Shai Gilgeous-Alexan

In [38]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report
)

# Assuming the following helper functions are defined elsewhere:
# collect_player_stats, normalize_player_stats, separate_by_date,
# scale_and_sum_stats, process_three_sections

def generate_reduced_sections(players, target_year, include_playoffs):
    """
    Processes player statistics to generate reduced datasets for each season section.
    
    Parameters:
    - players (list): List of player names.
    - target_year (int): The target season year (e.g., 2024).
    - include_playoffs (bool): Whether to include playoff statistics.
    
    Returns:
    - tuple: A tuple containing three DataFrames (reducedFirst, reducedSecond, reducedThird).
    """
    # Step 1: Collect Player Statistics
    print("Collecting player statistics...")
    mvp_stats = collect_player_stats(
        players,
        target_year,
        include_playoffs,
        output_file="MVP_stats_DIRTY.csv"
    )
    print("Player statistics collected.\n")
    
    # Step 2: Normalize Player Statistics
    print("Normalizing player statistics using Min-Max scaling...")
    mvp_stats = normalize_player_stats(mvp_stats, 'min-max')
    print("Normalization complete.\n")
    
    # Step 3: Separate Data by Date into Three Sections
    print("Separating data into three sections based on dates...")
    temp1, temp2, temp3 = separate_by_date(
        mvp_stats,
        ["first.csv", "second.csv", "third.csv"]
    )
    print("Data separation complete.\n")
    
    # Step 4: Define Features and Target
    print("Defining features and target variable...")
    X = mvp_stats[['PTS', 'AST', 'TRB']]
    y = mvp_stats['Position']
    print("Features and target defined.\n")
    
    # Step 5: Split the Data into Training and Testing Sets
    print("Splitting data into training and testing sets...")
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.3,
        random_state=42,
        stratify=y
    )
    print("Data splitting complete.\n")
    
    # Step 6: Initialize and Train the Logistic Regression Model
    print("Initializing and training the Logistic Regression model...")
    model = LogisticRegression(
        multi_class='multinomial',
        solver='lbfgs',
        max_iter=1000  # Increased for convergence
    )
    model.fit(X_train, y_train)
    print("Model training complete.\n")
    
    # Step 7: Make Predictions
    print("Making predictions on the test set...")
    y_pred = model.predict(X_test)
    print("Predictions complete.\n")
    
    # Step 8: Evaluate Model Performance
    print("Evaluating model performance...")
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\nModel Accuracy: {accuracy:.2f}\n")
    
    conf_matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:")
    print(conf_matrix, "\n")
    
    class_report = classification_report(y_test, y_pred)
    print("Classification Report:")
    print(class_report)
    
    # Step 9: Extract Weights (Coefficients) and Intercepts
    print("Extracting model coefficients and intercepts...")
    weights = model.coef_
    intercepts = model.intercept_
    print("Weights (coefficients):", weights)
    print("Intercepts:", intercepts, "\n")
    
    # Step 10: Calculate Feature Importance
    print("Calculating feature importance based on coefficients...")
    feature_importance = abs(weights).sum(axis=0)
    importance_dict = dict(zip(['PTS', 'AST', 'TRB'], feature_importance))
    
    print("Feature importance (sum of absolute coefficients):")
    for feature, importance in importance_dict.items():
        print(f"{feature}: {importance}")
    print()
    
    feature_importance_avg = abs(weights).mean(axis=0)
    importance_avg_dict = dict(zip(['PTS', 'AST', 'TRB'], feature_importance_avg))
    
    print("Average feature importance (mean of absolute coefficients):")
    for feature, importance in importance_avg_dict.items():
        print(f"{feature}: {importance}")
    print()
    
    # Normalize the importance to get percentages
    total_importance = feature_importance.sum()
    feature_importance_percentage = {
        feature: (importance / total_importance) * 100
        for feature, importance in importance_dict.items()
    }
    
    print("Feature importance as percentages:")
    for feature, percentage in feature_importance_percentage.items():
        print(f"{feature}: {percentage:.2f}%")
    print()
    
    # Step 11: Scale and Sum Statistics Based on Feature Importance
    print("Scaling and summing statistics based on feature importance percentages...")
    scaled_mvp_stats = scale_and_sum_stats(mvp_stats, feature_importance_percentage)
    scaled_mvp_stats.to_csv("scaled_mvp_values.csv", index=False)
    print("Scaling and summing complete. Saved to 'scaled_mvp_values.csv'.\n")
    
    # Step 12: Separate Scaled Data by Date into Three Adjusted Sections
    print("Separating scaled data into three adjusted sections based on dates...")
    firstSection, secondSection, thirdSection = separate_by_date(
        scaled_mvp_stats,
        ["first_adjusted.csv", "second_adjusted.csv", "third_adjusted.csv"]
    )
    print("Data separation into adjusted sections complete.\n")
    
    # Step 13: Process the Three Adjusted Sections
    print("Processing the three adjusted sections to obtain reduced datasets...")
    reducedFirst, reducedSecond, reducedThird = process_three_sections(
        firstSection,
        secondSection,
        thirdSection
    )
    print("Processing complete.\n")
    
    # Optional: Display the reduced datasets
    print("\nAveraged per game score per player for Section 1:")
    print(reducedFirst, "\n")
    
    print("Averaged per game score per player for Section 2:")
    print(reducedSecond, "\n")
    
    print("Averaged per game score per player for Section 3:")
    print(reducedThird, "\n")
    
    # Return the reduced datasets
    return reducedFirst, reducedSecond, reducedThird


In [39]:
players = [
    "Nikola Jokić",
    "Shai Gilgeous-Alexander",
    "Luka Dončić",
    "Giannis Antetokounmpo",
    "Jalen Brunson",
    "Jayson Tatum",
    "Anthony Edwards",
    "Domantas Sabonis",
    "Kevin Durant"
]

target_year = 2024
include_playoffs = False


TwentyFourfirst,TwentyFoursecond,TwentyFourthird = generate_reduced_sections(players, target_year, include_playoffs)

Fetching Player Stats: 100%|[92m██████████[0m| 9/9 [01:01<00:00]


Player statistics collected.

Normalizing player statistics using Min-Max scaling...


Normalizing Columns: 100%|[92m██████████[0m | 3/3 [00:00<00:00]


Normalization complete.

Separating data into three sections based on dates...
Date Divisions:
Division 1: 2023-10-24 to 2023-12-20
Division 2: 2023-12-21 to 2024-02-16
Division 3: 2024-02-17 to 2024-04-14

Saving Divisions to CSV Files:


Saving Divisions: 100%|[92m██████████[0m| 3/3 [00:00<00:00]


Division 1 saved to first.csv
Division 2 saved to second.csv
Division 3 saved to third.csv
Data separation complete.

Defining features and target variable...
Features and target defined.

Splitting data into training and testing sets...
Data splitting complete.

Initializing and training the Logistic Regression model...
Model training complete.

Making predictions on the test set...
Predictions complete.

Evaluating model performance...

Model Accuracy: 0.41

Confusion Matrix:
[[ 6  1  0  2  1  0  0 14  0]
 [ 0  1  2  1 11  2  3  0  3]
 [ 2  0 14  1  2  0  1  1  0]
 [ 5  0  1  5  1  2  1  7  0]
 [ 0  0  1  0 19  0  2  1  0]
 [ 0  0  1  3  3  7  4  0  4]
 [ 0  0  0  4  5  0 10  0  5]
 [ 0  0  1  1  0  1  1 21  0]
 [ 1  2  2  0  5  1  8  1  2]] 

Classification Report:
              precision    recall  f1-score   support

           1       0.43      0.25      0.32        24
           2       0.25      0.04      0.07        23
           3       0.64      0.67      0.65        21
    

Saving Divisions: 100%|[92m██████████[0m| 3/3 [00:00<00:00]

Division 1 saved to first_adjusted.csv
Division 2 saved to second_adjusted.csv
Division 3 saved to third_adjusted.csv
Data separation into adjusted sections complete.

Processing the three adjusted sections to obtain reduced datasets...
Processing complete.


Averaged per game score per player for Section 1:
                    Player  Position  Weighted_Sum
0             Nikola Jokić         1      0.463385
1              Luka Dončić         3      0.404270
2         Domantas Sabonis         8      0.402250
3    Giannis Antetokounmpo         4      0.387814
4             Jayson Tatum         6      0.312866
5  Shai Gilgeous-Alexander         2      0.302348
6             Kevin Durant         9      0.298921
7          Anthony Edwards         7      0.259820
8            Jalen Brunson         5      0.246055 

Averaged per game score per player for Section 2:
                    Player  Position  Weighted_Sum
0         Domantas Sabonis         8      0.483016
1              Luka Dončić




In [42]:
def generate_reduced_sections_for_all_years(players_dict, include_playoffs):
    """
    Processes player statistics to generate reduced datasets for each season section for all years.
    
    Parameters:
    - players_dict (dict): Dictionary where keys are years and values are lists of player names.
    - include_playoffs (bool): Whether to include playoff statistics.
    
    Returns:
    - dict: A dictionary containing reduced sections for all years.
            The keys will be the years, and the values will be tuples (reducedFirst, reducedSecond, reducedThird).
    """
    all_years_data = {}
    
    # Iterate over each year and its corresponding player list
    for year, player_list in players_dict.items():
        print(f"\nProcessing data for the year {year}...\n")
        
        # Generate the reduced sections for this year
        reducedFirst, reducedSecond, reducedThird = generate_reduced_sections(player_list, year, include_playoffs)
        
        # Store the results in a dictionary
        all_years_data[year] = (reducedFirst, reducedSecond, reducedThird)
        
    return all_years_data


In [49]:
def create_sliding_windows(data: dict, window_size: int = 3) -> dict:
    """
    Creates a dictionary of sliding windows from the input data.

    Parameters:
    - data (dict): Dictionary with years as keys.
    - window_size (int): Size of the sliding window (default is 3).

    Returns:
    - dict: Dictionary of dictionaries representing sliding windows.
    """
    # Sort the years in descending order
    sorted_years = sorted(data.keys(), reverse=True)
    n = len(sorted_years)
    windows = {}

    for i in range(n):
        window = {}
        
        # Handle the first window (use pair if not enough left)
        if i == 0:
            end = i + window_size if (i + window_size) <= n else n
            window_years = sorted_years[i:i + window_size]
        # Handle the last window (use pair if not enough right)
        elif i == n - 1:
            start = i - window_size + 1 if (i - window_size + 1) >= 0 else 0
            window_years = sorted_years[start:i + 1]
        else:
            # For middle windows, attempt to take window_size elements
            window_years = sorted_years[i:i + window_size]
            # If not enough elements to the right, adjust to include more from the left
            if len(window_years) < window_size and i >= window_size - 1:
                start = i - window_size + 1
                window_years = sorted_years[start:i + 1]
        
        # Ensure window_years has at least two years
        if len(window_years) < 2:
            # Attempt to include one more year from the left or right
            if i > 0:
                window_years = sorted_years[i - 1:i + 1]
            elif i < n - 1:
                window_years = sorted_years[i:i + 2]
        
        # Create the window dictionary
        for year in window_years:
            window[year] = data[year]
        
        # Use the first year in the window as the key
        windows[window_years[0]] = window

    return windows

# Data set

In [51]:
players = {2024: [
    "Nikola Jokić",
    "Shai Gilgeous-Alexander",
    "Luka Dončić",
    "Giannis Antetokounmpo",
    "Jalen Brunson",
    "Jayson Tatum",
    "Anthony Edwards",
    "Domantas Sabonis",
    "Kevin Durant"
]
,
2023: [
    "Joel Embiid",
    "Nikola Jokić",
    "Giannis Antetokounmpo",
    "Jayson Tatum",
    "Shai Gilgeous-Alexander",
    "Donovan Mitchell",
    "Domantas Sabonis",
    "Luka Dončić",
    "Stephen Curry",
    "Jimmy Butler",
    "De'Aaron Fox",
    "Jalen Brunson",
    "Ja Morant"
],
2022: [
    "Nikola Jokić",
    "Joel Embiid",
    "Giannis Antetokounmpo",
    "Devin Booker",
    "Luka Dončić",
    "Jayson Tatum",
    "Ja Morant",
    "Stephen Curry",
    "Chris Paul",
    "DeMar DeRozan",
    "Kevin Durant",
    "LeBron James"
], 
}

windows = create_sliding_windows(players, 3, )
for key, window in windows.items():
    print(f"{key}: {window}")

# # Example usage:
# all_reduced_sections = generate_reduced_sections_for_all_years(players, include_playoffs=False)

# # Optional: Display the results
# for year, sections in all_reduced_sections.items():
#     print(f"\nYear: {year}")
#     print("Reduced Section 1:\n", sections[0])
#     print("Reduced Section 2:\n", sections[1])
#     print("Reduced Section 3:\n", sections[2])


2024: {2024: ['Nikola Jokić', 'Shai Gilgeous-Alexander', 'Luka Dončić', 'Giannis Antetokounmpo', 'Jalen Brunson', 'Jayson Tatum', 'Anthony Edwards', 'Domantas Sabonis', 'Kevin Durant'], 2023: ['Joel Embiid', 'Nikola Jokić', 'Giannis Antetokounmpo', 'Jayson Tatum', 'Shai Gilgeous-Alexander', 'Donovan Mitchell', 'Domantas Sabonis', 'Luka Dončić', 'Stephen Curry', 'Jimmy Butler', "De'Aaron Fox", 'Jalen Brunson', 'Ja Morant'], 2022: ['Nikola Jokić', 'Joel Embiid', 'Giannis Antetokounmpo', 'Devin Booker', 'Luka Dončić', 'Jayson Tatum', 'Ja Morant', 'Stephen Curry', 'Chris Paul', 'DeMar DeRozan', 'Kevin Durant', 'LeBron James']}
2023: {2023: ['Joel Embiid', 'Nikola Jokić', 'Giannis Antetokounmpo', 'Jayson Tatum', 'Shai Gilgeous-Alexander', 'Donovan Mitchell', 'Domantas Sabonis', 'Luka Dončić', 'Stephen Curry', 'Jimmy Butler', "De'Aaron Fox", 'Jalen Brunson', 'Ja Morant'], 2022: ['Nikola Jokić', 'Joel Embiid', 'Giannis Antetokounmpo', 'Devin Booker', 'Luka Dončić', 'Jayson Tatum', 'Ja Morant'