# Data Processing

In [14]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [15]:
# Directory that holds the raw CSV export. Set the NBA_RAW_DATA_DIR environment
# variable to point at another location instead of editing this cell; the
# original local path is kept as the fallback so existing runs are unaffected.
RAW_DATA_DIR = Path(
    os.environ.get(
        "NBA_RAW_DATA_DIR",
        'C:\\Users\\loicc\\OneDrive - Efrei\\Bureau\\COURS\\M2\\S9\\Machine Learning in Production\\Data Pipeline\\mlops-nba',
    )
)

# Change the file name below to load a different CSV export.
filename = RAW_DATA_DIR / '2023-2024 NBA Player Stats - Regular.csv'
print(f"Running on file: {filename}")
# The file is decoded as Windows-1252 and split on commas.
players = pd.read_csv(filename, sep=',', encoding='Windows-1252')

Running on file: C:\Users\loicc\OneDrive - Efrei\Bureau\COURS\M2\S9\Machine Learning in Production\Data Pipeline\mlops-nba\2023-2024 NBA Player Stats - Regular.csv


In [16]:
# Parquet files are written to the "stockage" sub-folder of the raw-data
# directory. Deriving the folder from RAW_DATA_DIR keeps a single definition
# of the base path instead of repeating the absolute path in every cell.
destination_dir = RAW_DATA_DIR / 'stockage'

# Make sure the destination folder exists (no error if it is already there).
destination_dir.mkdir(parents=True, exist_ok=True)

# Store the DataFrame exactly as loaded, before any derived column is added.
destination_file = destination_dir / 'players_raw.parquet'
players.to_parquet(destination_file, index=False)

print(f"DataFrame saved as Parquet: {destination_file}")

DataFrame saved as Parquet: C:\Users\loicc\OneDrive - Efrei\Bureau\COURS\M2\S9\Machine Learning in Production\Data Pipeline\mlops-nba\stockage\players_raw.parquet


In [17]:
# Preview: first five rows once the frame is ordered alphabetically by player name.
players.sort_values("Player").head()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
176,165,A.J. Green,SG,24,MIL,23,0,7.2,1.1,2.3,...,1.0,0.2,0.6,0.7,0.6,0.0,0.0,0.0,0.8,3.1
272,261,A.J. Lawson,SG,23,DAL,16,0,10.3,1.8,3.7,...,0.714,0.5,0.9,1.4,0.6,0.6,0.1,0.5,0.8,4.8
182,171,AJ Griffin,SF,20,ATL,13,0,9.0,0.8,2.8,...,1.0,0.2,0.8,0.9,0.3,0.1,0.0,0.5,0.5,2.5
172,161,Aaron Gordon,PF,28,DEN,33,33,31.5,5.5,10.2,...,0.636,2.8,3.8,6.6,3.2,0.9,0.6,1.3,1.7,13.8
210,199,Aaron Holiday,PG,27,HOU,34,1,18.0,2.5,5.7,...,0.917,0.3,1.7,2.0,1.7,0.5,0.1,0.8,1.6,6.9


In [18]:
# Guard: every later computation assumes a fully populated frame.
# The message is what is shown when the check FAILS, so it must report that
# null values were found.
assert players.isnull().sum().sum() == 0, "There are null values in the dataset"

In [19]:
# EFF: sum of points, rebounds, assists, steals and blocks, minus missed field
# goals (FGA - FG), missed free throws (FTA - FT) and turnovers.
players["EFF"] = (
    players["PTS"]
    + players["TRB"]
    + players["AST"]
    + players["STL"]
    + players["BLK"]
    - (players["FGA"] - players["FG"])
    - (players["FTA"] - players["FT"])
    - players["TOV"]
)

In [20]:
# Summary statistics rounded to one decimal place.
# `ages` supplies the first quartile ("25%") that defines what a young player is.
ages = players["Age"].describe().round(1)
points = players["PTS"].describe().round(1)

In [21]:
# A player counts as "young" when their age is at or below the first age
# quartile. A young player is flagged as special when they reach at least the
# efficiency and scoring thresholds below. The resulting filter is used to keep
# only those special players from the current player base.
# NOTE(review): the original comment referred to a graph and a "<23y" cutoff
# that are not shown in this section - confirm the thresholds against it.
MIN_STAR_EFF = 12  # minimum EFF to qualify
MIN_STAR_PTS = 15  # minimum PTS to qualify

young_age = ages["25%"]
futur_super_star_def = f"(EFF >= {MIN_STAR_EFF}) & (PTS >= {MIN_STAR_PTS}) & (Age <= {young_age})"

# One sort is enough: ordering by Age then EFF fully determines the row order,
# so a preliminary sort on EFF alone would just be overwritten.
players.query(futur_super_star_def).sort_values(["Age", "EFF"], ascending=True)

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,EFF
442,423,Shaedon Sharpe,SG,20,POR,31,25,33.5,5.6,13.5,...,1.3,3.7,5.0,3.0,0.9,0.4,2.3,2.7,16.2,14.7
515,492,Victor Wembanyama,PF,20,SAS,32,32,28.9,7.1,15.8,...,2.1,8.1,10.2,3.1,1.2,3.2,3.1,2.3,19.2,24.2
179,168,Jalen Green,SG,21,HOU,36,36,30.4,5.8,14.3,...,0.4,4.3,4.6,3.3,0.6,0.3,2.2,1.3,17.3,14.5
27,24,Paolo Banchero,PF,21,ORL,37,37,35.0,8.1,17.7,...,1.2,5.9,7.1,4.8,1.1,0.6,3.3,2.2,22.9,21.4
214,203,Chet Holmgren,C,21,OKC,36,36,30.1,6.7,12.1,...,1.5,5.9,7.4,2.7,0.6,2.6,1.8,2.7,17.9,23.3
437,418,Alperen ?engün,C,21,HOU,36,36,32.3,8.5,15.8,...,2.6,6.4,9.0,5.0,1.2,0.7,2.5,3.4,21.6,26.1
474,453,Cam Thomas,SG,22,BRK,28,20,28.5,7.5,17.4,...,0.3,2.2,2.5,2.1,0.6,0.3,1.6,1.9,20.3,13.5
526,503,Jalen Williams,PF,22,OKC,33,33,32.1,7.0,12.9,...,0.4,3.7,4.1,4.4,1.1,0.5,1.9,2.7,18.2,19.9
113,104,Cade Cunningham,PG,22,DET,36,36,34.5,8.5,19.0,...,0.4,3.6,4.1,7.3,1.0,0.3,3.8,2.8,22.8,20.6
502,479,Franz Wagner,SF,22,ORL,34,34,33.4,7.7,16.5,...,1.1,4.8,5.9,3.9,1.1,0.4,1.9,2.3,20.9,20.8


In [22]:
# TS%: points divided by twice (FGA + 0.44 * FTA).
# Rows whose denominator is zero get 0 instead of the division result.
ts_denominator = 2 * (players['FGA'] + 0.44 * players['FTA'])
players['TS%'] = np.where(ts_denominator != 0, players['PTS'] / ts_denominator, 0)

In [23]:
# Collapse the detailed position codes into three broad groups.
# NOTE: Series.map yields NaN for any Pos value missing from this mapping.
position_groups = {
    "PG": "Backcourt",
    "SG": "Backcourt",
    "SF": "Wing",
    "SF-PF": "Wing",
    "PF": "Big",
    "C": "Big",
}
players["position"] = players["Pos"].map(position_groups)

In [24]:
# Parquet files are written to the "stockage" sub-folder of the raw-data
# directory. Deriving the folder from RAW_DATA_DIR keeps a single definition
# of the base path instead of repeating the absolute path in every cell.
destination_dir = RAW_DATA_DIR / 'stockage'

# Make sure the destination folder exists (no error if it is already there).
destination_dir.mkdir(parents=True, exist_ok=True)

# Store the DataFrame in its current state, including the columns added to
# it by the preceding cells.
destination_file = destination_dir / 'players_final.parquet'
players.to_parquet(destination_file, index=False)

print(f"DataFrame saved as Parquet: {destination_file}")


DataFrame saved as Parquet: C:\Users\loicc\OneDrive - Efrei\Bureau\COURS\M2\S9\Machine Learning in Production\Data Pipeline\mlops-nba\stockage\players_final.parquet


# API request