# Data Processing

In [31]:
import pandas as pd
import numpy as np
from pathlib import Path

import seaborn as sns
import matplotlib.pyplot as plt

In [32]:
RAW_DATA_DIR = Path('C:\\Users\\loicc\\OneDrive - Efrei\\Bureau\\COURS\\M2\\S9\\Machine Learning in Production\\Data Pipeline\\mlops-nba')

# Remplacez 'nom_du_fichier.csv' par le nom de votre fichier CSV
filename = RAW_DATA_DIR / '2023-2024 NBA Player Stats - Regular.csv'
print(f"Running on file: {filename}")
players = pd.read_csv(filename, sep=',', encoding='Windows-1252')

Running on file: C:\Users\loicc\OneDrive - Efrei\Bureau\COURS\M2\S9\Machine Learning in Production\Data Pipeline\mlops-nba\2023-2024 NBA Player Stats - Regular.csv


In [33]:
def save_dataframe_as_parquet(dataframe, destination_dir, parquet_filename):
    # Vérifier si le DataFrame est vide
    if dataframe.empty:
        raise ValueError("Le DataFrame est vide.")

    # Créer le chemin du répertoire de destination s'il n'existe pas
    Path(destination_dir).mkdir(parents=True, exist_ok=True)

    # Définir le chemin complet du fichier de destination
    destination_file = Path(destination_dir) / parquet_filename

    # Sauvegarder le DataFrame au format Parquet
    dataframe.to_parquet(destination_file, index=False)

    print(f"DataFrame saved as Parquet: {destination_file}")

# Exemple d'utilisation
# save_dataframe_as_parquet(players, 'C:\\chemin\\de\\destination', 'nom_du_fichier.parquet')


In [34]:
save_dataframe_as_parquet(players, 'C:\\Users\\loicc\\OneDrive - Efrei\\Bureau\\COURS\\M2\\S9\\Machine Learning in Production\\Data Pipeline\\mlops-nba\\stockage', 'players_raw.parquet')

DataFrame saved as Parquet: C:\Users\loicc\OneDrive - Efrei\Bureau\COURS\M2\S9\Machine Learning in Production\Data Pipeline\mlops-nba\stockage\players_raw.parquet


In [35]:


def process_players_data(players):
    # Assurer que le DataFrame n'est pas vide
    if players.empty:
        raise ValueError("Le DataFrame 'players' est vide.")

    # Ajouter la colonne EFF
    players["EFF"] = players.PTS + players.TRB + players.AST + players.STL + players.BLK - \
                     (players.FGA - players.FG) - (players.FTA - players.FT) - players.TOV

    # Calculer les statistiques d'âge et de points
    ages = players.Age.describe().round(decimals=1)
    points = players.PTS.describe().round(decimals=1)

    # Définir les critères pour les futurs superstars
    young_age = ages["25%"]
    futur_super_star_def = f"(EFF >= 12) & (PTS >= 15) & (Age <= {young_age})"
    players.query(futur_super_star_def).sort_values("EFF", ascending=False).sort_values(["Age", "EFF"], ascending=True)

    # Calculer le pourcentage de tir
    players['TS%'] = np.where((2 * (players['FGA'] + 0.44 * players['FTA'])) != 0, 
                              players['PTS'] / (2 * (players['FGA'] + 0.44 * players['FTA'])), 0)

    # Mapper les positions des joueurs
    players["position"] = players.Pos.map({"PG": "Backcourt", "SG": "Backcourt", 
                                           "SF": "Wing", "SF-PF": "Wing", 
                                           "PF": "Big", "C": "Big"})

    # Retourner le DataFrame traité
    return players

# Utilisation de la fonction
# players = pd.read_csv('chemin_du_fichier.csv', sep=',', encoding='Windows-1252')
# players_processed = process_players_data(players)


In [36]:
players = process_players_data(players)

In [37]:
players

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,TRB,AST,STL,BLK,TOV,PF,PTS,EFF,TS%,position
0,1,Precious Achiuwa,C-PF,24,TOT,30,0,16.6,2.8,6.2,...,5.1,1.5,0.6,0.5,1.0,1.5,6.9,9.6,0.506162,
1,1,Precious Achiuwa,C,24,TOR,25,0,17.5,3.1,6.8,...,5.4,1.8,0.6,0.5,1.2,1.6,7.7,10.4,0.510069,Big
2,1,Precious Achiuwa,PF,24,NYK,5,0,12.0,1.2,3.4,...,3.6,0.4,0.2,0.6,0.4,1.0,2.6,4.8,0.382353,Big
3,2,Bam Adebayo,C,26,MIA,27,27,34.4,8.0,15.7,...,10.4,4.0,1.1,1.0,2.5,2.4,22.1,26.8,0.580235,Big
4,3,Ochai Agbaji,SG,23,UTA,39,10,20.3,2.4,5.3,...,2.5,1.0,0.6,0.5,0.7,1.5,6.1,7.0,0.556976,Backcourt
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
541,518,Thaddeus Young,PF,35,TOR,10,2,8.7,1.6,2.6,...,1.6,1.4,0.7,0.1,0.7,0.8,3.4,5.3,0.622255,Big
542,519,Trae Young,PG,25,ATL,34,34,36.6,8.6,20.0,...,3.0,11.0,1.4,0.2,4.3,2.0,27.8,26.5,0.585510,Backcourt
543,520,Omer Yurtseven,C,25,UTA,22,6,11.3,1.6,3.4,...,4.4,0.6,0.2,0.5,1.0,1.5,3.7,6.4,0.511050,Big
544,521,Cody Zeller,C,31,NOP,28,0,9.0,0.7,1.8,...,2.8,1.1,0.1,0.1,0.4,1.3,2.1,4.2,0.451031,Big


stockage

In [38]:
save_dataframe_as_parquet(players, 'C:\\Users\\loicc\\OneDrive - Efrei\\Bureau\\COURS\\M2\\S9\\Machine Learning in Production\\Data Pipeline\\mlops-nba\\stockage', 'players_final.parquet')

DataFrame saved as Parquet: C:\Users\loicc\OneDrive - Efrei\Bureau\COURS\M2\S9\Machine Learning in Production\Data Pipeline\mlops-nba\stockage\players_final.parquet


# API request

In [39]:
from nba_api.stats.static import players
from nba_api.stats.endpoints import playergamelog
import pandas as pd

# Récupérer tous les joueurs
players_list = players.get_players()

# Choisissez un joueur pour l'exemple, ici on prend LeBron James
lebron = [player for player in players_list if player['full_name'] == 'LeBron James'][0]
lebron_id = lebron['id']

# Récupérer les logs de jeu pour une saison spécifique
gamelog = playergamelog.PlayerGameLog(player_id=lebron_id, season='2023-24')
df = gamelog.get_data_frames()[0]

print(df.head())


  SEASON_ID  Player_ID     Game_ID     GAME_DATE      MATCHUP WL  MIN  FGM   
0     22023       2544  0022300605  JAN 21, 2024  LAL vs. POR  W   31    8  \
1     22023       2544  0022300591  JAN 19, 2024  LAL vs. BKN  L   34    9   
2     22023       2544  0022300577  JAN 17, 2024  LAL vs. DAL  W   33   10   
3     22023       2544  0022300565  JAN 15, 2024  LAL vs. OKC  W   39   12   
4     22023       2544  0022300531  JAN 11, 2024  LAL vs. PHX  L   24    3   

   FGA  FG_PCT  ...  DREB  REB  AST  STL  BLK  TOV  PF  PTS  PLUS_MINUS   
0   16   0.500  ...     5    5    5    0    0    3   2   28          28  \
1   22   0.409  ...     6   11    5    0    0    4   1   24         -13   
2   19   0.526  ...     7    8    8    1    0    3   1   25          20   
3   20   0.600  ...     7    7    6    0    0    5   1   25           2   
4   11   0.273  ...     5    5    9    1    0    4   1   10         -25   

   VIDEO_AVAILABLE  
0                1  
1                2  
2                