In [None]:
# Dataset loading
from nba_longevity.infrastructure.dataset.csv_dataset_loader import CsvDatasetLoader

# Preprocessing
from nba_longevity.infrastructure.preprocessing.pandas_preprocessing_adapter import (
    PandasPreprocessingAdapter
)

# Feature engineering (option B)
from nba_longevity.infrastructure.feature_engineering.pandas_feature_engineering_adapter import (
    PandasFeatureEngineeringAdapter
)
from nba_longevity.infrastructure.feature_engineering.pandas_feature_selection_adapter import (
    PandasFeatureSelectionAdapter
)

# Feature spaces (Domain)
from nba_longevity.domain.features.feature_spaces import (
    FEATURE_SPACE_MINIMAL,
    FEATURE_SPACE_EXTENDED
)
from pathlib import Path
from nba_longevity.infrastructure.config.settings import load_infra_config
from nba_longevity.infrastructure.system_utils.root_finder import get_repository_root

# Initialisation de l'environnement
ROOT_PATH = get_repository_root(add_to_sys_path=False) 

full_path = f"{ROOT_PATH}/config/infra.yaml"
config = load_infra_config(Path(full_path))

# 1. Chargement des donn√©es
loader = CsvDatasetLoader(path=config.paths.raw_data)
dataset = loader.load()

# 2. Preprocessing (nettoyage)
preprocessor = PandasPreprocessingAdapter()
clean_dataset = preprocessor.preprocess(dataset)

# 3. Feature engineering (ajout uniquement)
feature_engineering_base = PandasFeatureEngineeringAdapter()
enriched_dataset = feature_engineering_base.add_features(clean_dataset)

# 4. Feature selection (projection ML)
feature_selector = PandasFeatureSelectionAdapter(
    feature_space=FEATURE_SPACE_MINIMAL  # ou FEATURE_SPACE_EXTENDED
)
feature_dataset = feature_selector.select_features(enriched_dataset)


# Le Domain consomme un flux
for i, row in enumerate(feature_dataset):
    # row est un dict-like
    print(row)
    print(len(row))
    if i >= 1:
        break  # Arr√™te la boucle si l'index est 1 ou plus (apr√®s avoir imprim√© 'row')

[32m2025-12-14 20:04:35[0m | [1mINFO[0m | [36mnba_career_longevity[0m | [35mlocal[0m | csv_dataset_loader:load:21 | Loading CSV dataset from path: /Users/surelmanda/Downloads/nba-career-longevity-mlops/data/raw/nba_players.csv
[32m2025-12-14 20:04:35[0m | [1mINFO[0m | [36mnba_career_longevity[0m | [35mlocal[0m | csv_dataset_loader:load:25 | CSV loaded successfully | rows=1340 | cols=21
{'PointsPerMinute': 0.27007298284405173, 'FieldGoalEfficiency': 0.34210521814405026, 'ThreePointRate': 0.2763157531163483, 'FreeThrowRate': 0.08394160277585391, 'AssistToTurnoverRatio': 1.4615373372789713, 'ReboundRate': 0.14963503103521783, 'DefensiveImpact': 0.8, 'MinutesPerGame': 27.4, 'GamesPlayed': 36, 'Target5Years': 0}
10
{'PointsPerMinute': 0.2676579826149449, 'FieldGoalEfficiency': 0.2985074181332212, 'ThreePointRate': 0.4179103853865096, 'FreeThrowRate': 0.1263940473459462, 'AssistToTurnoverRatio': 2.3124985546884034, 'ReboundRate': 0.08921932753831496, 'DefensiveImpact': 1.6, '

In [None]:
from nba_longevity.application.training.run_training_pipeline import run_training

model = run_training(
    feature_space="minimal",
    model_type="catboost",  # ou "catboost"
)


[32m2025-12-14 21:43:16[0m | [1mINFO[0m | [36mnba_career_longevity[0m | [35mlocal[0m | run_training_pipeline:run_training:49 | üöÄ Starting training pipeline | model=catboost | feature_space=minimal
[32m2025-12-14 21:43:16[0m | [1mINFO[0m | [36mnba_career_longevity[0m | [35mlocal[0m | run_training_pipeline:run_training:67 | Using feature space with 9 features
[32m2025-12-14 21:43:16[0m | [1mINFO[0m | [36mnba_career_longevity[0m | [35mlocal[0m | run_training_pipeline:run_training:72 | üì• Loading raw dataset
[32m2025-12-14 21:43:16[0m | [1mINFO[0m | [36mnba_career_longevity[0m | [35mlocal[0m | csv_dataset_loader:load:21 | Loading CSV dataset from path: /Users/surelmanda/Downloads/nba-career-longevity-mlops/data/raw/nba_players.csv
[32m2025-12-14 21:43:16[0m | [1mINFO[0m | [36mnba_career_longevity[0m | [35mlocal[0m | csv_dataset_loader:load:25 | CSV loaded successfully | rows=1340 | cols=21
[32m2025-12-14 21:43:16[0m | [1mINFO[0m | [36mnba_c

In [None]:
 from nba_longevity.domain.ports.predictor_port import XGBoostPredictor

# Train
# model = run_training(...)

# Predictor
predictor = XGBoostPredictor(model)

# Validation inference
valid_proba = predictor.predict_proba(
    rows=valid_rows,
    feature_columns=FEATURE_COLUMNS,
)

# Metrics
metrics = compute_classification_metrics(
    y_true=[row["TARGET_5YRS"] for row in valid_rows],
    y_proba=valid_proba,
)


In [None]:
# Dataset loading
from nba_longevity.infrastructure.dataset.csv_dataset_loader import CsvDatasetLoader

# Preprocessing
from nba_longevity.infrastructure.preprocessing.pandas_preprocessing_adapter import (
    PandasPreprocessingAdapter
)

# Feature engineering (option B)
from nba_longevity.infrastructure.feature_engineering.pandas_feature_engineering_adapter import (
    PandasFeatureEngineeringAdapter
)
from nba_longevity.infrastructure.feature_engineering.pandas_feature_selection_adapter import (
    PandasFeatureSelectionAdapter
)

# Feature spaces (Domain)
from nba_longevity.domain.features.feature_spaces import (
    FEATURE_SPACE_MINIMAL,
    FEATURE_SPACE_EXTENDED,
    TARGET_COLUMN
)

# Split Dataset
from nba_longevity.application.splitting.pandas_split import (split_train_valid_pandas)

from pathlib import Path
from nba_longevity.infrastructure.config.settings import load_infra_config
from nba_longevity.infrastructure.system_utils.root_finder import get_repository_root
from nba_longevity.infrastructure.training.xgboost_trainer import XGBoostTrainer
from nba_longevity.infrastructure.training.catboost_trainer import CatBoostTrainer

# Initialisation de l'environnement
ROOT_PATH = get_repository_root(add_to_sys_path=False) 

full_path = f"{ROOT_PATH}/config/infra.yaml"
config = load_infra_config(Path(full_path))

# 1. Chargement des donn√©es
loader = CsvDatasetLoader(path=config.paths.raw_data)
dataset = loader.load()

# 2. Preprocessing (nettoyage)
preprocessor = PandasPreprocessingAdapter()
clean_dataset = preprocessor.preprocess(dataset)

# 3. Feature engineering (ajout uniquement)
feature_engineering_base = PandasFeatureEngineeringAdapter()
enriched_dataset = feature_engineering_base.add_features(clean_dataset)

# 4. Feature selection (projection ML)
feature_selector = PandasFeatureSelectionAdapter(
    feature_space=FEATURE_SPACE_MINIMAL  # ou FEATURE_SPACE_EXTENDED
)
feature_dataset = feature_selector.select_features(enriched_dataset)

# 5. Split
train_rows, valid_rows = split_train_valid_pandas(
    rows=list(feature_dataset),
    target_column=TARGET_COLUMN,
    valid_size=0.2,
    seed=42
)


# 6. train xgboost model
trainer = XGBoostTrainer()  # ou CatBoostTrainer()

model = trainer.train(
    train_rows=train_rows,
    valid_rows=valid_rows,
    feature_columns=FEATURE_SPACE_MINIMAL,
    target_column=TARGET_COLUMN,
    params={
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "max_depth": 4,
        "eta": 0.05,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "num_boost_round": 1000,
        "early_stopping_rounds": 50,
    }
)


[32m2025-12-14 21:27:34[0m | [1mINFO[0m | [36mnba_career_longevity[0m | [35mlocal[0m | csv_dataset_loader:load:21 | Loading CSV dataset from path: /Users/surelmanda/Downloads/nba-career-longevity-mlops/data/raw/nba_players.csv
[32m2025-12-14 21:27:34[0m | [1mINFO[0m | [36mnba_career_longevity[0m | [35mlocal[0m | csv_dataset_loader:load:25 | CSV loaded successfully | rows=1340 | cols=21
[32m2025-12-14 21:27:34[0m | [1mINFO[0m | [36mnba_career_longevity[0m | [35mlocal[0m | pandas_feature_engineering_adapter:add_features:19 | D√©marrage du feature engineering (Pandas)
[32m2025-12-14 21:27:34[0m | [1mINFO[0m | [36mnba_career_longevity[0m | [35mlocal[0m | pandas_feature_engineering_adapter:add_features:30 | Cr√©ation des features d'usage et d'efficacit√©
[32m2025-12-14 21:27:34[0m | [1mINFO[0m | [36mnba_career_longevity[0m | [35mlocal[0m | pandas_feature_engineering_adapter:add_features:40 | Cr√©ation des features d'impact collectif
[32m2025-12-14 21

In [None]:
# Train
model = run_training(...)

# Predictor
predictor = XGBoostPredictor(model)

# Validation inference
valid_proba = predictor.predict_proba(
    rows=valid_rows,
    feature_columns=FEATURE_COLUMNS,
)

# Metrics
metrics = compute_classification_metrics(
    y_true=[row["TARGET_5YRS"] for row in valid_rows],
    y_proba=valid_proba,
)


In [None]:
from pandas import DataFrame, read_csv
from nba_longevity.application.bootstrap import app_logger

app_logger.info("Analyse Summarize")

def summarize_dataset(df: DataFrame, max_examples: int = 5) -> DataFrame:
    """
    Fournit une synth√®se d√©taill√©e du dataset :
    - Nom de colonne
    - Type de donn√©es
    - Nombre et pourcentage de valeurs manquantes
    - Cardinalit√© (nombre de valeurs uniques)
    - Exemples repr√©sentatifs

    Args:
        df (pd.DataFrame): Le DataFrame √† analyser.
        max_examples (int): Nombre maximum d'exemples √† afficher par colonne.

    Returns:
        pd.DataFrame: Tableau r√©capitulatif des colonnes et de leurs caract√©ristiques.
    """
    if df is None or df.empty:
        app_logger.warning("Aucun dataset fourni ou dataset vide.")
        return DataFrame(columns=["Column", "Type", "Missing", "% Missing", "Cardinality", "Examples"])

    app_logger.info("R√©sum√© d√©taill√© du dataset en cours d'analyse...")
    total_rows = len(df)
    column_details = []

    for col in df.columns:
        col_type = df[col].dtype

        # Valeurs manquantes
        missing_count = df[col].isna().sum()
        missing_pct = round((missing_count / total_rows) * 100, 2)

        # Cardinalit√©
        cardinality = df[col].nunique(dropna=True)

        # Exemples repr√©sentatifs
        unique_values = df[col].dropna().unique()
        if df[col].dtype == "object" or df[col].dtype.name == "category":
            examples = unique_values[:max_examples]
        else:
            examples = sorted(unique_values[:max_examples])

        column_details.append([
            col,
            col_type,
            missing_count,
            missing_pct,
            cardinality,
            examples
        ])

    # Cr√©ation du DataFrame r√©capitulatif
    summary_df = DataFrame(
        column_details,
        columns=["Column", "Type", "Missing", "% Missing", "Cardinality", "Examples"]
    ).sort_values(by="% Missing", ascending=False)

    # Logs synth√©tiques
    app_logger.info(f"R√©sum√© termin√© : {len(summary_df)} colonnes, {total_rows} lignes.")
    app_logger.debug(
        "Colonnes les plus incompl√®tes :\n%s",
        summary_df[["Column", "Missing", "% Missing"]].head(10)
    )

    return summary_df




full_path = f"{ROOT_PATH}/data/raw/nba_players.csv"


data = read_csv(full_path) 

display(data.head())

display(data.describe())

# En supposant que 'data' est votre DataFrame
display(data.dtypes)

summarize_data = summarize_dataset(data)

display(summarize_data)

[32m2025-12-14 17:39:16[0m | [1mINFO[0m | [36mnba_career_longevity[0m | [35mlocal[0m | 1236696816:<module>:4 | Analyse Summarize


Unnamed: 0,PlayerName,GamesPlayed,MinutesPerGame,PointsPerGame,FieldGoalsMade,FieldGoalsAttempted,FieldGoalPct,ThreePointersMade,ThreePointersAttempted,ThreePointerPct,...,FreeThrowsAttempted,FreeThrowPct,OffensiveRebounds,DefensiveRebounds,TotalRebounds,Assists,Steals,Blocks,Turnovers,Target5Years
0,Brandon Ingram,36,27.4,7.4,2.6,7.6,34.7,0.5,2.1,25.0,...,2.3,69.9,0.7,3.4,4.1,1.9,0.4,0.4,1.3,0.0
1,Andrew Harrison,35,26.9,7.2,2.0,6.7,29.6,0.7,2.8,23.5,...,3.4,76.5,0.5,2.0,2.4,3.7,1.1,0.5,1.6,0.0
2,JaKarr Sampson,74,15.3,5.2,2.0,4.7,42.2,0.4,1.7,24.4,...,1.3,67.0,0.5,1.7,2.2,1.0,0.5,0.3,1.0,0.0
3,Malik Sealy,58,11.6,5.7,2.3,5.5,42.6,0.1,0.5,22.6,...,1.3,68.9,1.0,0.9,1.9,0.8,0.6,0.1,1.0,1.0
4,Matt Geiger,48,11.5,4.5,1.6,3.0,52.4,0.0,0.1,0.0,...,1.9,67.4,1.0,1.5,2.5,0.3,0.3,0.4,0.8,1.0


Unnamed: 0,GamesPlayed,MinutesPerGame,PointsPerGame,FieldGoalsMade,FieldGoalsAttempted,FieldGoalPct,ThreePointersMade,ThreePointersAttempted,ThreePointerPct,FreeThrowsMade,FreeThrowsAttempted,FreeThrowPct,OffensiveRebounds,DefensiveRebounds,TotalRebounds,Assists,Steals,Blocks,Turnovers,Target5Years
count,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0,1329.0,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0
mean,60.414179,17.624627,6.801493,2.629104,5.885299,44.169403,0.247612,0.779179,19.308126,1.297687,1.82194,70.300299,1.009403,2.025746,3.034478,1.550522,0.618507,0.368582,1.193582,0.620149
std,17.433992,8.307964,4.357545,1.683555,3.593488,6.137679,0.383688,1.061847,16.022916,0.987246,1.322984,10.578479,0.777119,1.360008,2.057774,1.471169,0.409759,0.429049,0.722541,0.485531
min,11.0,3.1,0.7,0.3,0.8,23.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.3,0.0,0.0,0.0,0.1,0.0
25%,47.0,10.875,3.7,1.4,3.3,40.2,0.0,0.0,0.0,0.6,0.9,64.7,0.4,1.0,1.5,0.6,0.3,0.1,0.7,0.0
50%,63.0,16.1,5.55,2.1,4.8,44.1,0.1,0.3,22.4,1.0,1.5,71.25,0.8,1.7,2.5,1.1,0.5,0.2,1.0,1.0
75%,77.0,22.9,8.8,3.4,7.5,47.9,0.4,1.2,32.5,1.6,2.3,77.6,1.4,2.6,4.0,2.0,0.8,0.5,1.5,1.0
max,82.0,40.9,28.2,10.2,19.8,73.7,2.3,6.5,100.0,7.7,10.2,100.0,5.3,9.6,13.9,10.6,2.5,3.9,4.4,1.0


PlayerName                 object
GamesPlayed                 int64
MinutesPerGame            float64
PointsPerGame             float64
FieldGoalsMade            float64
FieldGoalsAttempted       float64
FieldGoalPct              float64
ThreePointersMade         float64
ThreePointersAttempted    float64
ThreePointerPct           float64
FreeThrowsMade            float64
FreeThrowsAttempted       float64
FreeThrowPct              float64
OffensiveRebounds         float64
DefensiveRebounds         float64
TotalRebounds             float64
Assists                   float64
Steals                    float64
Blocks                    float64
Turnovers                 float64
Target5Years              float64
dtype: object

[32m2025-12-14 17:39:16[0m | [1mINFO[0m | [36mnba_career_longevity[0m | [35mlocal[0m | 1236696816:summarize_dataset:26 | R√©sum√© d√©taill√© du dataset en cours d'analyse...
[32m2025-12-14 17:39:16[0m | [1mINFO[0m | [36mnba_career_longevity[0m | [35mlocal[0m | 1236696816:summarize_dataset:63 | R√©sum√© termin√© : 21 colonnes, 1340 lignes.


Unnamed: 0,Column,Type,Missing,% Missing,Cardinality,Examples
9,ThreePointerPct,float64,11,0.82,254,"[0.0, 22.6, 23.5, 24.4, 25.0]"
0,PlayerName,object,0,0.0,1294,"[Brandon Ingram, Andrew Harrison, JaKarr Samps..."
11,FreeThrowsAttempted,float64,0,0.0,76,"[0.5, 1.3, 1.9, 2.3, 3.4]"
19,Turnovers,float64,0,0.0,41,"[0.7, 0.8, 1.0, 1.3, 1.6]"
18,Blocks,float64,0,0.0,28,"[0.0, 0.1, 0.3, 0.4, 0.5]"
17,Steals,float64,0,0.0,26,"[0.3, 0.4, 0.5, 0.6, 1.1]"
16,Assists,float64,0,0.0,77,"[0.3, 0.8, 1.0, 1.9, 3.7]"
15,TotalRebounds,float64,0,0.0,101,"[1.9, 2.2, 2.4, 2.5, 4.1]"
14,DefensiveRebounds,float64,0,0.0,74,"[0.9, 1.5, 1.7, 2.0, 3.4]"
13,OffensiveRebounds,float64,0,0.0,44,"[0.2, 0.5, 0.7, 0.8, 1.0]"


In [4]:
from pathlib import Path
from nba_longevity.infrastructure.dataset.csv_dataset_loader import CsvDatasetLoader
from nba_longevity.infrastructure.config.settings import load_infra_config

full_path = f"{ROOT_PATH}/config/infra.yaml"
config = load_infra_config(Path(full_path))

loader = CsvDatasetLoader(path=config.paths.raw_data)

dataset = loader.load()

# Le Domain consomme un flux
for i, row in enumerate(dataset):
    # row est un dict-like
    print(row)
    print(len(row))
    if i >= 1:
        break  # Arr√™te la boucle si l'index est 1 ou plus (apr√®s avoir imprim√© 'row')

[32m2025-12-14 17:22:47[0m | [1mINFO[0m | [36mnba_career_longevity[0m | [35mlocal[0m | csv_dataset_loader:load:21 | Loading CSV dataset from path: /Users/surelmanda/Downloads/nba-career-longevity-mlops/data/raw/nba_players.csv
[32m2025-12-14 17:22:48[0m | [1mINFO[0m | [36mnba_career_longevity[0m | [35mlocal[0m | csv_dataset_loader:load:25 | CSV loaded successfully | rows=1340 | cols=21
{'PlayerName': 'Brandon Ingram', 'GamesPlayed': 36, 'MinutesPerGame': 27.4, 'PointsPerGame': 7.4, 'FieldGoalsMade': 2.6, 'FieldGoalsAttempted': 7.6, 'FieldGoalPct': 34.7, 'ThreePointersMade': 0.5, 'ThreePointersAttempted': 2.1, 'ThreePointerPct': 25.0, 'FreeThrowsMade': 1.6, 'FreeThrowsAttempted': 2.3, 'FreeThrowPct': 69.9, 'OffensiveRebounds': 0.7, 'DefensiveRebounds': 3.4, 'TotalRebounds': 4.1, 'Assists': 1.9, 'Steals': 0.4, 'Blocks': 0.4, 'Turnovers': 1.3, 'Target5Years': 0.0}
21
{'PlayerName': 'Andrew Harrison', 'GamesPlayed': 35, 'MinutesPerGame': 26.9, 'PointsPerGame': 7.2, 'FieldGo

In [5]:
from pathlib import Path
from nba_longevity.infrastructure.config.settings import load_infra_config

full_path = f"{ROOT_PATH}/config/infra.yaml"
config = load_infra_config(Path(full_path))

print("=== PROJECT ===")
print(f"Name        : {config.project.name}")
print(f"Environment : {config.project.environment}")

print("\n=== PATHS ===")
print(f"Project root : {config.paths.data_dir.parent}")
print(f"Data dir     : {config.paths.data_dir}")
print(f"Raw data     : {config.paths.raw_data}")
print(f"Train data   : {config.paths.train_data}")
print(f"Test data    : {config.paths.test_data}")
print(f"Artifacts    : {config.paths.artifacts_dir}")
print(f"Logs         : {config.paths.logs_dir}")

print("\n=== MLFLOW ===")
print(f"Tracking URI : {config.mlflow.tracking_uri}")
print(f"Experiment   : {config.mlflow.experiment_name}")

print("\n=== RUNTIME ===")
print(f"Random state : {config.runtime.random_state}")
print(f"Log level    : {config.runtime.log_level}")


=== PROJECT ===
Name        : nba_career_longevity
Environment : local

=== PATHS ===
Project root : /Users/surelmanda/Downloads/nba-career-longevity-mlops
Data dir     : /Users/surelmanda/Downloads/nba-career-longevity-mlops/data
Raw data     : /Users/surelmanda/Downloads/nba-career-longevity-mlops/data/raw/nba_players.csv
Train data   : /Users/surelmanda/Downloads/nba-career-longevity-mlops/data/processed/train.csv
Test data    : /Users/surelmanda/Downloads/nba-career-longevity-mlops/data/processed/test.csv
Artifacts    : /Users/surelmanda/Downloads/nba-career-longevity-mlops/artifacts
Logs         : /Users/surelmanda/Downloads/nba-career-longevity-mlops/logs

=== MLFLOW ===
Tracking URI : http://localhost:5000
Experiment   : nba_career_longevity

=== RUNTIME ===
Random state : 42
Log level    : INFO


In [4]:
config

InfraConfig(project=ProjectConfig(name='nba_career_longevity', environment='local'), paths=PathsConfig(data_dir=PosixPath('/Users/surelmanda/Downloads/nba-career-longevity-mlops/data'), raw_data=PosixPath('/Users/surelmanda/Downloads/nba-career-longevity-mlops/data/raw/nba_players.csv'), train_data=PosixPath('/Users/surelmanda/Downloads/nba-career-longevity-mlops/data/processed/train.csv'), test_data=PosixPath('/Users/surelmanda/Downloads/nba-career-longevity-mlops/data/processed/test.csv'), artifacts_dir=PosixPath('/Users/surelmanda/Downloads/nba-career-longevity-mlops/artifacts'), logs_dir=PosixPath('/Users/surelmanda/Downloads/nba-career-longevity-mlops/logs')), mlflow=MLflowConfig(tracking_uri='http://localhost:5000', experiment_name='nba_career_longevity'), runtime=RuntimeConfig(random_state=42, log_level='INFO'))

In [2]:
from pathlib import Path
from nba_longevity.infrastructure.config.settings import load_infra_config

def print_section(title: str, data: dict):
    print(f"\n=== {title.upper()} ===")
    for key, value in data.items():
        print(f"{key:<15}: {value}")

full_path = "/Users/surelmanda/Downloads/nba-career-longevity-mlops/config/infra.yaml"
config = load_infra_config(Path(full_path))

print_section("Project", config.project.model_dump())
print_section("Paths", config.paths.model_dump())
print_section("MLflow", config.mlflow.model_dump())
print_section("Runtime", config.runtime.model_dump())


=== PROJECT ===
name           : nba_career_longevity
environment    : local

=== PATHS ===
data_dir       : /Users/surelmanda/Downloads/nba-career-longevity-mlops/data
raw_data       : /Users/surelmanda/Downloads/nba-career-longevity-mlops/data/raw/nba_players.csv
train_data     : /Users/surelmanda/Downloads/nba-career-longevity-mlops/data/processed/train.csv
test_data      : /Users/surelmanda/Downloads/nba-career-longevity-mlops/data/processed/test.csv
artifacts_dir  : /Users/surelmanda/Downloads/nba-career-longevity-mlops/artifacts
logs_dir       : /Users/surelmanda/Downloads/nba-career-longevity-mlops/logs

=== MLFLOW ===
tracking_uri   : http://localhost:5000
experiment_name: nba_career_longevity

=== RUNTIME ===
random_state   : 42
log_level      : INFO


In [1]:
from nba_longevity.application.bootstrap import app_logger

app_logger.info("Starting inference pipeline")


[32m2025-12-14 14:43:39[0m | [1mINFO[0m | [36mnba_career_longevity[0m | [35mlocal[0m | 2517548038:<module>:3 | Starting inference pipeline


In [None]:
model = run_training(
    csv_path="/Users/surelmanda/Downloads/nba-career-longevity-mlops/data/nba_logreg.csv",
    model_type="xgboost",  # ou "catboost"
)


In [None]:
loader = SparkDatasetLoader(
    spark_session=spark,
    path="nba.csv"
)

dataset = loader.load()

preprocessor = SparkPreprocessingAdapter()
clean_dataset = preprocessor.preprocess(dataset)

for row in clean_dataset:
    print(row)
    break



"""
USE_SPARK = True

if USE_SPARK:
    loader = SparkDatasetLoader(
        spark_session=spark,
        path="hdfs:///data/nba.csv"
    )
else:
    loader = CsvDatasetLoader(
        path="data/nba.csv"
    )

dataset = loader.load()

for row in dataset:
    process(row)

"""