In [None]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline, Transformer
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.classification import GBTClassifier, RandomForestClassifier, LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.sql.functions import col, expr
import json

# ---------- Winsorizer ----------
class Winsorizer(Transformer):
    """Clip numeric columns to approximate quantile bounds (winsorization).

    For every column in ``inputCols`` the ``lower`` and ``upper`` quantiles
    are estimated with ``DataFrame.approxQuantile``. Values below the lower
    bound are replaced by that bound and values above the upper bound are
    replaced by the upper bound; null values are left as they are.

    The bounds are recomputed from whatever DataFrame is passed to
    ``transform`` on each call; nothing is learned or stored between calls.
    The settings are kept as plain Python attributes.

    Args:
        inputCols: Names of the numeric columns to clip. ``None`` or an
            empty list makes ``transform`` return its input unchanged.
        lower: Lower quantile probability, in ``[0, 1]``.
        upper: Upper quantile probability, in ``[lower, 1]``.
        relativeError: Relative error passed to ``approxQuantile``
            (``0`` requests exact quantiles, at a higher cost). The default
            of ``0.01`` is as large as the default tail probabilities, so
            pass a smaller value when tighter bounds matter.

    Raises:
        ValueError: If the probabilities are not ordered within ``[0, 1]``
            or ``relativeError`` is negative.
    """

    def __init__(self, inputCols=None, lower=0.01, upper=0.99, relativeError=0.01):
        super(Winsorizer, self).__init__()
        if not 0.0 <= lower <= upper <= 1.0:
            raise ValueError(
                f"lower and upper must satisfy 0 <= lower <= upper <= 1, "
                f"got lower={lower}, upper={upper}"
            )
        if relativeError < 0:
            raise ValueError(f"relativeError must be >= 0, got {relativeError}")
        self.inputCols = inputCols
        self.lower = lower
        self.upper = upper
        self.relativeError = relativeError

    def _transform(self, df):
        """Return ``df`` with each configured column clipped to its bounds."""
        columns = list(self.inputCols or [])
        if not columns:
            return df

        # A single multi-column call estimates all bounds in one pass over
        # the data instead of launching one Spark job per column.
        all_bounds = df.approxQuantile(
            columns, [self.lower, self.upper], self.relativeError
        )

        for c, bounds in zip(columns, all_bounds):
            # approxQuantile yields an empty list when a column has no
            # usable values (e.g. empty DataFrame): nothing to clip then.
            if len(bounds) != 2:
                continue
            low, up = bounds
            df = df.withColumn(
                c,
                expr(
                    f"CASE WHEN `{c}` < {low} THEN {low} "
                    f"WHEN `{c}` > {up} THEN {up} ELSE `{c}` END"
                )
            )
        return df

# ---------- Spark ----------
spark = SparkSession.builder.appName("ML_Models_Test").getOrCreate()

# Everything that needs the session runs inside try/finally so the session
# is stopped even when loading, training or saving raises.
try:
    # ---------- Columns ----------
    feature_numeric_cols = [
        "Days for shipment (scheduled)", "Benefit per order", "Sales per customer",
        "Order Item Discount", "Order Item Discount Rate", "Order Item Product Price",
        "Order Item Profit Ratio", "Order Item Quantity", "Sales", "Order Profit Per Order"
    ]

    feature_categorical_cols = [
        "Type", "Shipping Mode", "Market", "Customer Segment",
        "Order Region", "Category Name"
    ]

    # Binary target column, shared by the data selection and the classifier.
    label_col = "Late_delivery_risk"

    # ---------- Data ----------
    # Read the CSV with an inferred schema, drop cancelled shipments, keep
    # only the modelling columns, then discard rows containing any null.
    df = (
        spark.read.option("header", True).option("inferSchema", True)
        .csv("../data/DataCoSupplyChainDataset.csv")
        .filter(col("Delivery Status") != "Shipping canceled")
        .select(*(feature_numeric_cols + feature_categorical_cols + [label_col]))
    ).dropna()

    # ---------- Apply Winsorizer BEFORE pipeline ----------
    # NOTE(review): the clipping bounds are estimated on the full dataset,
    # before the train/test split, and this step is not one of the saved
    # pipeline stages, so data passed to the reloaded pipeline is not
    # clipped — confirm this is intended.
    winsor = Winsorizer(inputCols=feature_numeric_cols, lower=0.01, upper=0.99)
    df = winsor.transform(df)

    # ---------- Preprocessing ----------
    # One indexer per categorical column, writing to "<column>_idx".
    indexers = [
        StringIndexer(inputCol=c, outputCol=c+"_idx", handleInvalid="keep")
        for c in feature_categorical_cols
    ]

    # Numeric columns are assembled and standardised on their own.
    assembler_num = VectorAssembler(inputCols=feature_numeric_cols, outputCol="num_features")

    scaler = StandardScaler(inputCol="num_features", outputCol="num_features_scaled",
                            withMean=True, withStd=True)

    # NOTE(review): the "*_idx" columns are assembled as-is (no one-hot
    # encoding), so the logistic regression receives category indices as
    # numeric values — confirm this is intended.
    assembler_final = VectorAssembler(
        inputCols=["num_features_scaled"] + [c+"_idx" for c in feature_categorical_cols],
        outputCol="features"
    )

    # ---------- Models ----------
    model_name = "LogisticRegression"
    models = {
        "LogisticRegression": LogisticRegression(featuresCol="features", labelCol=label_col, maxIter=50),
    }

    # ---------- Train/Test ----------
    # NOTE(review): `test` is not used anywhere in this cell.
    train, test = df.randomSplit([0.8, 0.2], seed=42)

    # ---------- Training ----------
    classifier = models[model_name]
    pipeline = Pipeline(stages=indexers + [assembler_num, scaler, assembler_final, classifier])
    best_model = pipeline.fit(train)

    # ---------- Save the fitted pipeline ----------
    model_path = "../models/logistic_regression_pipeline"
    best_model.write().overwrite().save(model_path)
    print("✓ Pipeline sauvegardé avec succès!")

    # ---------- Save metadata ----------
    metadata = {
        "model_type": model_name,
        "feature_numeric_cols": feature_numeric_cols,
        "feature_categorical_cols": feature_categorical_cols
    }

    # Explicit encoding so the file does not depend on the platform default.
    with open("../models/model_metadata.json", "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=4)

    print("✓ Métadonnées sauvegardées !")
finally:
    spark.stop()


✓ Pipeline sauvegardé avec succès!
✓ Métadonnées sauvegardées !
