In [None]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import col

# --- Spark ---
spark = SparkSession.builder.appName("count_test").getOrCreate()

# --- Colonnes ---
feature_numeric_cols = [
    "Days for shipment (scheduled)", "Benefit per order", "Sales per customer",
    "Order Item Discount", "Order Item Discount Rate", "Order Item Product Price",
    "Order Item Profit Ratio", "Order Item Quantity", "Sales", "Order Profit Per Order"
]

feature_categorical_cols = [
    "Type", "Shipping Mode", "Market", "Customer Segment",
    "Order Region", "Category Name"
]

# --- Charger donn√©es ---
df = (
    spark.read.option("header", True).option("inferSchema", True)
    .csv("../data/DataCoSupplyChainDataset.csv")
    .filter(col("Delivery Status") != "Shipping canceled")
    .select(*(feature_numeric_cols + feature_categorical_cols + ["Late_delivery_risk"]))
).dropna()


for c in feature_categorical_cols:
    print(c, df.select(c).distinct().count())


In [12]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import col

# üîπ Cr√©er SparkSession
spark = SparkSession.builder.appName("RF_test").getOrCreate()

# üîπ Colonnes
feature_numeric_cols = [
    "Days for shipment (scheduled)", "Benefit per order", "Sales per customer",
    "Order Item Discount", "Order Item Discount Rate", "Order Item Product Price",
    "Order Item Profit Ratio", "Order Item Quantity", "Sales", "Order Profit Per Order"
]

feature_categorical_cols = [
    "Type", "Shipping Mode", "Market", "Customer Segment",
    "Order Region", "Category Name"
]

# üîπ Charger les donn√©es
df = (
    spark.read.option("header", True).option("inferSchema", True)
    .csv("../data/DataCoSupplyChainDataset.csv")
    .filter(col("Delivery Status") != "Shipping canceled")
    .select(*(feature_numeric_cols + feature_categorical_cols + ["Late_delivery_risk"]))
).dropna()

# üîπ Pr√©traitement
indexers = [StringIndexer(inputCol=c, outputCol=c+"_indexed", handleInvalid="keep") for c in feature_categorical_cols]

assembler_num = VectorAssembler(inputCols=feature_numeric_cols, outputCol="num_features")
scaler = StandardScaler(inputCol="num_features", outputCol="num_features_scaled", withMean=True, withStd=True)
assembler_final = VectorAssembler(inputCols=["num_features_scaled"] + [c+"_indexed" for c in feature_categorical_cols], outputCol="features")

# üîπ Mod√®le
rf = RandomForestClassifier(featuresCol="features", labelCol="Late_delivery_risk", maxBins=2000)

pipeline = Pipeline(stages=indexers + [assembler_num, scaler, assembler_final, rf])

# üîπ Train/Test split
train, test = df.randomSplit([0.8, 0.2], seed=42)

# üîπ Entra√Ænement et pr√©diction
model = pipeline.fit(train)
pred = model.transform(test)

# üîπ √âvaluation
evaluator = BinaryClassificationEvaluator(labelCol="Late_delivery_risk", rawPredictionCol="rawPrediction")
print("Random Forest - AUC =", evaluator.evaluate(pred))

# üîπ Stop Spark pour lib√©rer m√©moire
spark.stop()


Random Forest - AUC = 0.7284820922900427


In [11]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import col

# --- Spark ---
spark = SparkSession.builder.appName("GBT_test").getOrCreate()

# --- Colonnes ---
feature_numeric_cols = [
    "Days for shipment (scheduled)", "Benefit per order", "Sales per customer",
    "Order Item Discount", "Order Item Discount Rate", "Order Item Product Price",
    "Order Item Profit Ratio", "Order Item Quantity", "Sales", "Order Profit Per Order"
]

feature_categorical_cols = [
    "Type", "Shipping Mode", "Market", "Customer Segment",
     "Order Region", "Category Name"
]

# --- Charger donn√©es ---
df = (
    spark.read.option("header", True).option("inferSchema", True)
    .csv("../data/DataCoSupplyChainDataset.csv")
    .filter(col("Delivery Status") != "Shipping canceled")
    .select(*(feature_numeric_cols + feature_categorical_cols + ["Late_delivery_risk"]))
).dropna()

# --- Encodage simple ---
indexers = [StringIndexer(inputCol=c, outputCol=c+"_idx", handleInvalid="keep") for c in feature_categorical_cols]

# --- Assemblage simple ---
assembler = VectorAssembler(
    inputCols=feature_numeric_cols + [c+"_idx" for c in feature_categorical_cols],
    outputCol="features"
)

# --- Mod√®le seul ---
gbt = GBTClassifier(
    featuresCol="features",
    labelCol="Late_delivery_risk",
    maxIter=30,
    maxDepth=5,
    maxBins=2000
    
)

pipeline = Pipeline(stages=indexers + [assembler, gbt])

# --- Train/Test ---
train, test = df.randomSplit([0.8, 0.2], seed=42)

# --- Train ---
model = pipeline.fit(train)

# --- Pr√©diction ---
pred = model.transform(test)

# --- AUC ---
evaluator = BinaryClassificationEvaluator(
    labelCol="Late_delivery_risk",
    rawPredictionCol="rawPrediction"
)

print("GBT AUC =", evaluator.evaluate(pred))

spark.stop()


GBT AUC = 0.7437711704542939


Type 4
Shipping Mode 4
Market 5
Customer Segment 3
Order Region 23
Category Name 50


In [14]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import GBTClassifier, RandomForestClassifier, LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.sql.functions import col

# --- Spark ---
spark = SparkSession.builder.appName("ML_Models_Test").getOrCreate()

# --- Colonnes num√©riques et cat√©gorielles ---
feature_numeric_cols = [
    "Days for shipment (scheduled)", "Benefit per order", "Sales per customer",
    "Order Item Discount", "Order Item Discount Rate", "Order Item Product Price",
    "Order Item Profit Ratio", "Order Item Quantity", "Sales", "Order Profit Per Order"
]

feature_categorical_cols = [
    "Type", "Shipping Mode", "Market", "Customer Segment",
    "Order Region", "Category Name"   # Exclu "Order State" qui avait trop de valeurs
]

# --- Charger donn√©es ---
df = (
    spark.read.option("header", True).option("inferSchema", True)
    .csv("../data/DataCoSupplyChainDataset.csv")
    .filter(col("Delivery Status") != "Shipping canceled")
    .select(*(feature_numeric_cols + feature_categorical_cols + ["Late_delivery_risk"]))
).dropna()

# --- Encodage simple ---
indexers = [StringIndexer(inputCol=c, outputCol=c+"_idx", handleInvalid="keep") for c in feature_categorical_cols]

# --- Assemblage ---
assembler = VectorAssembler(
    inputCols=feature_numeric_cols + [c+"_idx" for c in feature_categorical_cols],
    outputCol="features"
)

# --- Train/Test ---
train, test = df.randomSplit([0.8, 0.2], seed=42)

# --- Liste des mod√®les √† tester ---
models = {
    "GBTClassifier": GBTClassifier(featuresCol="features", labelCol="Late_delivery_risk", maxIter=30, maxDepth=5, maxBins=2000),
    "RandomForest": RandomForestClassifier(featuresCol="features", labelCol="Late_delivery_risk", numTrees=50, maxDepth=5, maxBins=2000),
    "LogisticRegression": LogisticRegression(featuresCol="features", labelCol="Late_delivery_risk", maxIter=50)
}

# --- Fonction d'√©valuation ---
def evaluate_model(model_name, pipeline_model):
    pred = pipeline_model.transform(test)
    
    evaluator_acc = MulticlassClassificationEvaluator(labelCol="Late_delivery_risk", predictionCol="prediction", metricName="accuracy")
    evaluator_f1 = MulticlassClassificationEvaluator(labelCol="Late_delivery_risk", predictionCol="prediction", metricName="f1")
    evaluator_prec = MulticlassClassificationEvaluator(labelCol="Late_delivery_risk", predictionCol="prediction", metricName="weightedPrecision")
    evaluator_recall = MulticlassClassificationEvaluator(labelCol="Late_delivery_risk", predictionCol="prediction", metricName="weightedRecall")
    evaluator_auc = BinaryClassificationEvaluator(labelCol="Late_delivery_risk", rawPredictionCol="rawPrediction")
    
    print(f"\n=== {model_name} ===")
    print("Accuracy =", evaluator_acc.evaluate(pred))
    print("F1-score =", evaluator_f1.evaluate(pred))
    print("Precision =", evaluator_prec.evaluate(pred))
    print("Recall =", evaluator_recall.evaluate(pred))
    print("AUC =", evaluator_auc.evaluate(pred))

# --- Entra√Ænement et √©valuation mod√®le par mod√®le ---
for name, classifier in models.items():
    pipeline = Pipeline(stages=indexers + [assembler, classifier])
    model = pipeline.fit(train)
    evaluate_model(name, model)

# --- Stop Spark ---
spark.stop()



=== GBTClassifier ===
Accuracy = 0.697117903930131
F1-score = 0.6936591440259341
Precision = 0.7463293681199681
Recall = 0.697117903930131
AUC = 0.7437733536319574

=== RandomForest ===
Accuracy = 0.6940902474526929
F1-score = 0.6923440477551527
Precision = 0.7323106600794665
Recall = 0.6940902474526929
AUC = 0.728468255710722

=== LogisticRegression ===
Accuracy = 0.6939737991266376
F1-score = 0.6922342061251481
Precision = 0.7321398439841479
Recall = 0.6939737991266376
AUC = 0.7177301771967515
