# Data Processing

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
RAW_DATA_DIR = Path('C:\\Users\\loicc\\OneDrive - Efrei\\Bureau\\COURS\\M2\\S9\\Machine Learning in Production\\Data Pipeline\\mlops-nba')

# Remplacez 'nom_du_fichier.csv' par le nom de votre fichier CSV
filename = RAW_DATA_DIR / '2023-2024 NBA Player Stats - Regular.csv'
print(f"Running on file: {filename}")
players_df = pd.read_csv(filename, sep=',', encoding='Windows-1252')
players_csv = pd.read_csv(filename, sep=',', encoding='Windows-1252')

Running on file: C:\Users\loicc\OneDrive - Efrei\Bureau\COURS\M2\S9\Machine Learning in Production\Data Pipeline\mlops-nba\2023-2024 NBA Player Stats - Regular.csv


In [3]:
def save_dataframe_as_parquet(dataframe, destination_dir, parquet_filename):
    # Vérifier si le DataFrame est vide
    if dataframe.empty:
        raise ValueError("Le DataFrame est vide.")

    # Créer le chemin du répertoire de destination s'il n'existe pas
    Path(destination_dir).mkdir(parents=True, exist_ok=True)

    # Définir le chemin complet du fichier de destination
    destination_file = Path(destination_dir) / parquet_filename

    # Sauvegarder le DataFrame au format Parquet
    dataframe.to_parquet(destination_file, index=False)

    print(f"DataFrame saved as Parquet: {destination_file}")

# Exemple d'utilisation
# save_dataframe_as_parquet(players, 'C:\\chemin\\de\\destination', 'nom_du_fichier.parquet')


In [4]:
save_dataframe_as_parquet(players_df, 'C:\\Users\\loicc\\OneDrive - Efrei\\Bureau\\COURS\\M2\\S9\\Machine Learning in Production\\Data Pipeline\\mlops-nba\\stockage', 'players_raw.parquet')

DataFrame saved as Parquet: C:\Users\loicc\OneDrive - Efrei\Bureau\COURS\M2\S9\Machine Learning in Production\Data Pipeline\mlops-nba\stockage\players_raw.parquet


In [5]:


def process_players_data(players):
    # Assurer que le DataFrame n'est pas vide
    if players.empty:
        raise ValueError("Le DataFrame 'players' est vide.")

    # Ajouter la colonne EFF
    players["EFF"] = players.PTS + players.TRB + players.AST + players.STL + players.BLK - \
                     (players.FGA - players.FG) - (players.FTA - players.FT) - players.TOV

    # Calculer les statistiques d'âge et de points
    ages = players.Age.describe().round(decimals=1)
    points = players.PTS.describe().round(decimals=1)

    # Définir les critères pour les futurs superstars
    young_age = ages["25%"]
    futur_super_star_def = f"(EFF >= 12) & (PTS >= 15) & (Age <= {young_age})"
    players.query(futur_super_star_def).sort_values("EFF", ascending=False).sort_values(["Age", "EFF"], ascending=True)

    # Calculer le pourcentage de tir
    players['TS%'] = np.where((2 * (players['FGA'] + 0.44 * players['FTA'])) != 0, 
                              players['PTS'] / (2 * (players['FGA'] + 0.44 * players['FTA'])), 0)

    # Mapper les positions des joueurs
    players["position"] = players.Pos.map({"PG": "Backcourt", "SG": "Backcourt", 
                                           "SF": "Wing", "SF-PF": "Wing", 
                                           "PF": "Big", "C": "Big"})

    # Retourner le DataFrame traité
    return players

# Utilisation de la fonction
# players = pd.read_csv('chemin_du_fichier.csv', sep=',', encoding='Windows-1252')
# players_processed = process_players_data(players)


In [6]:
players_df = process_players_data(players_df)

In [7]:
players_df

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,TRB,AST,STL,BLK,TOV,PF,PTS,EFF,TS%,position
0,1,Precious Achiuwa,C-PF,24,TOT,30,0,16.6,2.8,6.2,...,5.1,1.5,0.6,0.5,1.0,1.5,6.9,9.6,0.506162,
1,1,Precious Achiuwa,C,24,TOR,25,0,17.5,3.1,6.8,...,5.4,1.8,0.6,0.5,1.2,1.6,7.7,10.4,0.510069,Big
2,1,Precious Achiuwa,PF,24,NYK,5,0,12.0,1.2,3.4,...,3.6,0.4,0.2,0.6,0.4,1.0,2.6,4.8,0.382353,Big
3,2,Bam Adebayo,C,26,MIA,27,27,34.4,8.0,15.7,...,10.4,4.0,1.1,1.0,2.5,2.4,22.1,26.8,0.580235,Big
4,3,Ochai Agbaji,SG,23,UTA,39,10,20.3,2.4,5.3,...,2.5,1.0,0.6,0.5,0.7,1.5,6.1,7.0,0.556976,Backcourt
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
541,518,Thaddeus Young,PF,35,TOR,10,2,8.7,1.6,2.6,...,1.6,1.4,0.7,0.1,0.7,0.8,3.4,5.3,0.622255,Big
542,519,Trae Young,PG,25,ATL,34,34,36.6,8.6,20.0,...,3.0,11.0,1.4,0.2,4.3,2.0,27.8,26.5,0.585510,Backcourt
543,520,Omer Yurtseven,C,25,UTA,22,6,11.3,1.6,3.4,...,4.4,0.6,0.2,0.5,1.0,1.5,3.7,6.4,0.511050,Big
544,521,Cody Zeller,C,31,NOP,28,0,9.0,0.7,1.8,...,2.8,1.1,0.1,0.1,0.4,1.3,2.1,4.2,0.451031,Big


stockage

In [8]:
save_dataframe_as_parquet(players_df, 'C:\\Users\\loicc\\OneDrive - Efrei\\Bureau\\COURS\\M2\\S9\\Machine Learning in Production\\Data Pipeline\\mlops-nba\\stockage', 'players_final.parquet')

DataFrame saved as Parquet: C:\Users\loicc\OneDrive - Efrei\Bureau\COURS\M2\S9\Machine Learning in Production\Data Pipeline\mlops-nba\stockage\players_final.parquet


# API request

In [9]:
from nba_api.stats.static import players
from nba_api.stats.endpoints import playergamelog
import pandas as pd

# Récupérer tous les joueurs
players_list = players.get_players()

# Choisissez un joueur pour l'exemple, ici on prend LeBron James
lebron = [player for player in players_list if player['full_name'] == 'LeBron James'][0]
lebron_id = lebron['id']

# Récupérer les logs de jeu pour une saison spécifique
gamelog = playergamelog.PlayerGameLog(player_id=lebron_id, season='2023-24')
df = gamelog.get_data_frames()[0]

df.head()


Unnamed: 0,SEASON_ID,Player_ID,Game_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,FGA,FG_PCT,...,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,VIDEO_AVAILABLE
0,22023,2544,22300605,"JAN 21, 2024",LAL vs. POR,W,31,8,16,0.5,...,5,5,5,0,0,3,2,28,28,1
1,22023,2544,22300591,"JAN 19, 2024",LAL vs. BKN,L,34,9,22,0.409,...,6,11,5,0,0,4,1,24,-13,2
2,22023,2544,22300577,"JAN 17, 2024",LAL vs. DAL,W,33,10,19,0.526,...,7,8,8,1,0,3,1,25,20,1
3,22023,2544,22300565,"JAN 15, 2024",LAL vs. OKC,W,39,12,20,0.6,...,7,7,6,0,0,5,1,25,2,1
4,22023,2544,22300531,"JAN 11, 2024",LAL vs. PHX,L,24,3,11,0.273,...,5,5,9,1,0,4,1,10,-25,1


In [10]:
from nba_api.stats.endpoints import leagueleaders
import pandas as pd

# Créer une instance de LeagueLeaders
league_leaders = leagueleaders.LeagueLeaders(season='2023-24')

# Convertir en DataFrame
leaders_df = league_leaders.get_data_frames()[0]

# Afficher les premières lignes du DataFrame
leaders_df.head()

Unnamed: 0,PLAYER_ID,RANK,PLAYER,TEAM_ID,TEAM,GP,MIN,FGM,FGA,FG_PCT,...,REB,AST,STL,BLK,TOV,PF,PTS,EFF,AST_TOV,STL_TOV
0,203507,1,Giannis Antetokounmpo,1610612749,MIL,41,1444,473,784,0.603,...,472,251,56,46,149,119,1278,1487,1.69,0.38
1,1628983,2,Shai Gilgeous-Alexander,1610612760,OKC,41,1415,452,823,0.549,...,230,257,91,32,82,105,1274,1392,3.13,1.11
2,1629029,3,Luka Doncic,1610612742,DAL,36,1327,412,854,0.482,...,307,334,51,21,141,65,1208,1266,2.37,0.36
3,203954,4,Joel Embiid,1610612755,PHI,32,1096,386,716,0.539,...,370,188,37,60,116,92,1156,1321,1.62,0.32
4,203999,5,Nikola Jokic,1610612743,DEN,43,1443,437,744,0.587,...,513,391,50,40,124,113,1122,1641,3.15,0.4


In [11]:
print(leaders_df.columns)


Index(['PLAYER_ID', 'RANK', 'PLAYER', 'TEAM_ID', 'TEAM', 'GP', 'MIN', 'FGM',
       'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT',
       'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'EFF',
       'AST_TOV', 'STL_TOV'],
      dtype='object')


In [12]:
players_csv.head()  

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1,Precious Achiuwa,C-PF,24,TOT,30,0,16.6,2.8,6.2,...,0.571,2.0,3.1,5.1,1.5,0.6,0.5,1.0,1.5,6.9
1,1,Precious Achiuwa,C,24,TOR,25,0,17.5,3.1,6.8,...,0.571,2.0,3.4,5.4,1.8,0.6,0.5,1.2,1.6,7.7
2,1,Precious Achiuwa,PF,24,NYK,5,0,12.0,1.2,3.4,...,0.0,2.0,1.6,3.6,0.4,0.2,0.6,0.4,1.0,2.6
3,2,Bam Adebayo,C,26,MIA,27,27,34.4,8.0,15.7,...,0.782,2.2,8.1,10.4,4.0,1.1,1.0,2.5,2.4,22.1
4,3,Ochai Agbaji,SG,23,UTA,39,10,20.3,2.4,5.3,...,0.733,0.8,1.7,2.5,1.0,0.6,0.5,0.7,1.5,6.1


In [13]:
df.columns

Index(['SEASON_ID', 'Player_ID', 'Game_ID', 'GAME_DATE', 'MATCHUP', 'WL',
       'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA',
       'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
       'PTS', 'PLUS_MINUS', 'VIDEO_AVAILABLE'],
      dtype='object')

In [14]:
players_df.columns

Index(['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
       'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'EFF',
       'TS%', 'position'],
      dtype='object')

In [15]:
players_csv.columns

Index(['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
       'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS'],
      dtype='object')

In [16]:
# Exemple d'utilisation de leagueleaders
from nba_api.stats.endpoints import leagueleaders
import pandas as pd

# Créer une instance de LeagueLeaders
league_leaders = leagueleaders.LeagueLeaders(season='2023-24')

# Convertir en DataFrame
basic_stats_df = league_leaders.get_data_frames()[0]

# Exemple d'utilisation de commonplayerinfo
from nba_api.stats.endpoints import commonplayerinfo

# Vous devrez probablement boucler sur les identifiants des joueurs pour obtenir leurs informations
player_info = commonplayerinfo.CommonPlayerInfo(player_id='2544')
additional_info_df = player_info.get_data_frames()[0]
additional_info_df['PLAYER_ID'] = additional_info_df['PERSON_ID']
# Fusionner les DataFrames sur une clé commune, comme l'ID du joueur
combined_df = pd.merge(basic_stats_df, additional_info_df, on='PLAYER_ID')

combined_df['eFG%'] = (combined_df['FGM'] + 0.5 * combined_df['FG3M']) / combined_df['FGA']


In [21]:
combined_df.columns

Index(['PLAYER_ID', 'RANK', 'PLAYER', 'TEAM_ID_x', 'TEAM', 'GP', 'MIN', 'FGM',
       'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT',
       'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'EFF',
       'AST_TOV', 'STL_TOV', 'PERSON_ID', 'FIRST_NAME', 'LAST_NAME',
       'DISPLAY_FIRST_LAST', 'DISPLAY_LAST_COMMA_FIRST', 'DISPLAY_FI_LAST',
       'PLAYER_SLUG', 'BIRTHDATE', 'SCHOOL', 'COUNTRY', 'LAST_AFFILIATION',
       'HEIGHT', 'WEIGHT', 'SEASON_EXP', 'JERSEY', 'POSITION', 'ROSTERSTATUS',
       'GAMES_PLAYED_CURRENT_SEASON_FLAG', 'TEAM_ID_y', 'TEAM_NAME',
       'TEAM_ABBREVIATION', 'TEAM_CODE', 'TEAM_CITY', 'PLAYERCODE',
       'FROM_YEAR', 'TO_YEAR', 'DLEAGUE_FLAG', 'NBA_FLAG', 'GAMES_PLAYED_FLAG',
       'DRAFT_YEAR', 'DRAFT_ROUND', 'DRAFT_NUMBER', 'GREATEST_75_FLAG',
       'eFG%'],
      dtype='object')

In [24]:
from fuzzywuzzy import fuzz, process

# Define the target column names from players_csv
target_columns = [
    'Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%',
    '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
    'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS'
]

# Initialize a dictionary to store the mapping of old column names to new ones
column_mapping = {}

# Iterate through the target columns and find the closest matching column in combined_df
for target_column in target_columns:
    best_match, score = process.extractOne(target_column, combined_df.columns)
    
    # Check if the similarity score is above a threshold (adjust this threshold as needed)
    if score >= 80:
        column_mapping[best_match] = target_column

# Rename the columns in combined_df using the mapping
combined_df.rename(columns=column_mapping, inplace=True)

# Now, combined_df should have columns matching the players_csv columns as closely as possible
# Define the columns from players_csv
target_columns = [
    'Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%',
    '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
    'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS'
]

# Select only the columns that match between combined_df and target_columns
common_columns = combined_df.columns.intersection(target_columns)

# Create a new dataframe with only the common columns
filtered_df = combined_df[common_columns]

# Now, filtered_df contains only the columns that match with players_csv


In [25]:
filtered_df

Unnamed: 0,Player,GS,FG%,FGA,FT%,FTA,ORB,DRB,AST,STL,BLK,TOV,PF,PTS,Pos,eFG%
0,LeBron James,40,370,712,165,223,37,252,297,53,23,134,43,991,Forward,0.580056
