In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC # 🚕 Projet Taxi NYC - Analyse Comparative
# MAGIC ## Statistiques Inférentielles vs Big Data
# MAGIC 
# MAGIC **Entreprise:** DATACO  
# MAGIC **Période:** 2022-2025  
# MAGIC **Binôme:** [Vos noms]  
# MAGIC **Date:** 26-30 janvier 2026

# COMMAND ----------

# MAGIC %md
# MAGIC ## üì¶ 1. Configuration & Imports

# COMMAND ----------

# Imports PySpark
from pyspark.sql.functions import (
    col, count, mean, stddev, sum as spark_sum, min as spark_min, max as spark_max,
    hour, dayofweek, dayofmonth, month, year, weekofyear,
    unix_timestamp, percentile_approx, when, lit
)
from pyspark.sql.types import *
from pyspark.sql.window import Window

# Imports Python scientifique
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import t, norm

# Imports visualisation
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
# Plot defaults applied to every figure of the notebook:
# seaborn "whitegrid" theme and a 12x6 default matplotlib canvas.
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

print("‚úÖ Configuration et imports termin√©s")

# COMMAND ----------

In [0]:
# COMMAND ----------

# MAGIC %md
# MAGIC ## üìÇ 2. Chargement des Donn√©es

# COMMAND ----------

from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType, LongType

# Data locations
PATH_POPULATION = "/Volumes/workspace/trips/population/"
PATH_SAMPLE = "/Volumes/workspace/trips/sample/"

# ============================================
# SAMPLE (CSV)
# ============================================
print("üîÑ Chargement de l'√©chantillon...")
df_sample = spark.read.csv(
    PATH_SAMPLE + "yellowtaxisample1pct_hybrid_stratified.csv",
    header=True,
    inferSchema=True
)
nb_sample = df_sample.count()
print(f"‚úÖ √âchantillon charg√© : {nb_sample:,} courses")

# ============================================
# POPULATION (Parquet) - loaded file by file
# ============================================
print("\nüîÑ Chargement de la population compl√®te...")

# Every parquet file found directly under the population folder
files = [f.path for f in dbutils.fs.ls(PATH_POPULATION) if f.path.endswith('.parquet')]
print(f"üìÇ {len(files)} fichiers trouv√©s")

# Columns that may be stored as INT64 in some files and DOUBLE in others.
# They are cast to double in each file so that the union below sees one type.
# Defined once, outside the loop, because the list never changes.
columns_to_convert = [
    "passenger_count", "trip_distance", "RatecodeID",
    "fare_amount", "extra", "mta_tax", "tip_amount",
    "tolls_amount", "improvement_surcharge", "total_amount",
    "congestion_surcharge", "airport_fee"
]

dfs = []
failed_files = []  # names of files that could not be loaded (kept for the final report)
for i, file_path in enumerate(files, 1):
    file_name = file_path.split('/')[-1]
    try:
        df_temp = spark.read.parquet(file_path)

        for col_name in columns_to_convert:
            if col_name in df_temp.columns:
                df_temp = df_temp.withColumn(col_name, col(col_name).cast(DoubleType()))

        dfs.append(df_temp)
        print(f"‚úÖ [{i}/{len(files)}] {file_name}")

    except Exception as e:
        # Best effort: keep loading the other files, but remember the failure so
        # it is not lost in the per-file log.
        failed_files.append(file_name)
        print(f"‚ö†Ô∏è Erreur sur {file_name}: {str(e)}")

if not dfs:
    raise Exception("‚ùå Aucun fichier n'a pu √™tre charg√©")

from functools import reduce
from pyspark.sql import DataFrame

# Merge all per-file DataFrames; columns absent from a file are filled with nulls
print("\nüîÑ Fusion de tous les fichiers...")
df_population = reduce(lambda df1, df2: df1.unionByName(df2, allowMissingColumns=True), dfs)

nb_population = df_population.count()
print(f"‚úÖ Population totale charg√©e : {nb_population:,} courses")

# A skipped file means every "population" figure computed later is only partial:
# say so explicitly instead of leaving it buried in the loop output.
if failed_files:
    print(
        f"\nATTENTION : {len(failed_files)} fichier(s) non chargé(s), "
        f"la population est incomplète : {failed_files}"
    )

# Sample / population ratio (guarded: the files may load but contain no rows)
ratio = (nb_sample / nb_population) * 100 if nb_population else float("nan")
print(f"\nüìä Ratio √©chantillon/population : {ratio:.2f}%")

# COMMAND ----------

# Final schema check: print the column names and types of both datasets
for schema_title, schema_frame in (
    ("=== SCH√âMA POPULATION ===", df_population),
    ("\n=== SCH√âMA √âCHANTILLON ===", df_sample),
):
    print(schema_title)
    schema_frame.printSchema()

In [0]:


# MAGIC %md
# MAGIC ## üîç 3. EDA - Exploration des Donn√©es

# COMMAND ----------

# MAGIC %md
# MAGIC ### 3.1 Structure des donn√©es

# COMMAND ----------

# Structure of both datasets (population first, then sample).
# NOTE(review): the data loading cell already prints these two schemas.
schema_titles = ["=== SCH√âMA POPULATION ===", "\n=== SCH√âMA √âCHANTILLON ==="]
schema_frames = [df_population, df_sample]
for schema_title, schema_frame in zip(schema_titles, schema_frames):
    print(schema_title)
    schema_frame.printSchema()

# COMMAND ----------

# MAGIC %md
# MAGIC ### 3.2 Aper√ßu des donn√©es

# COMMAND ----------

# Show the first 5 rows of each dataset with the Databricks rich renderer
population_preview = df_population.limit(5)
sample_preview = df_sample.limit(5)

print("=== APER√áU POPULATION (5 premi√®res lignes) ===")
display(population_preview)

print("\n=== APER√áU √âCHANTILLON (5 premi√®res lignes) ===")
display(sample_preview)

# COMMAND ----------

# MAGIC %md
# MAGIC ### 3.3 Statistiques descriptives de base

# COMMAND ----------

# Monetary, distance and passenger columns summarised with describe()
# for both datasets.
describe_columns = [
    "fare_amount", "trip_distance", "tip_amount",
    "tolls_amount", "total_amount", "passenger_count",
]

# Population
print("=== STATISTIQUES DESCRIPTIVES - POPULATION ===")
stats_pop = df_population.select(*describe_columns).describe()
display(stats_pop)

# Sample
print("\n=== STATISTIQUES DESCRIPTIVES - √âCHANTILLON ===")
stats_sample = df_sample.select(*describe_columns).describe()
display(stats_sample)

# COMMAND ----------

# MAGIC %md
# MAGIC ### 3.4 V√©rification des valeurs manquantes

# COMMAND ----------
from pyspark.sql.functions import col, sum as spark_sum, isnan, when, count
# Fonction pour compter valeurs manquantes
def count_nulls(df, dataset_name):
    """Print and return the NULL count and NULL percentage of every column.

    The per-column NULL counts and the total row count are computed in a single
    Spark aggregation, so the DataFrame is scanned once.

    Parameters
    ----------
    df : Spark DataFrame to inspect.
    dataset_name : label inserted in the printed header.

    Returns
    -------
    pandas.DataFrame indexed by column name with columns ``Nb_Nulls`` and
    ``Pct_Nulls`` (rounded to 2 decimals), limited to the columns that contain
    at least one NULL and sorted by ``Nb_Nulls`` descending. Empty when no
    column has NULLs.
    """
    print(f"\n=== VALEURS MANQUANTES - {dataset_name} ===")

    # Alias for the row counter; extended until it cannot clash with a real column
    total_alias = "__total_rows__"
    while total_alias in df.columns:
        total_alias += "_"

    aggregated = df.select(
        [spark_sum(col(c).isNull().cast("int")).alias(c) for c in df.columns]
        + [count("*").alias(total_alias)]
    ).toPandas()

    total_rows = int(aggregated[total_alias].iloc[0])

    # One row per source column for a more readable display
    null_df = aggregated.drop(columns=[total_alias]).T
    null_df.columns = ['Nb_Nulls']
    # SUM over zero rows is NULL in Spark: treat it as "no NULL found"
    null_df['Nb_Nulls'] = null_df['Nb_Nulls'].fillna(0).astype("int64")

    if total_rows > 0:
        null_df['Pct_Nulls'] = (null_df['Nb_Nulls'] / total_rows * 100).round(2)
    else:
        null_df['Pct_Nulls'] = 0.0

    null_df = null_df[null_df['Nb_Nulls'] > 0].sort_values('Nb_Nulls', ascending=False)

    if len(null_df) > 0:
        print(null_df)
    else:
        print("‚úÖ Aucune valeur manquante d√©tect√©e")

    return null_df

# Missing-value report for the population, then for the sample
nulls_pop, nulls_sample = (
    count_nulls(frame_to_check, frame_label)
    for frame_to_check, frame_label in (
        (df_population, "POPULATION"),
        (df_sample, "√âCHANTILLON"),
    )
)

# COMMAND ----------

# MAGIC %md
# MAGIC ### 3.5 D√©tection pr√©liminaire des outliers

# COMMAND ----------

# Preliminary outlier detection on fare_amount (IQR rule, population).
#
# approxQuantile's third argument is the tolerated error on the quantile RANK.
# With 0.01 the "1st percentile" may be any value between the minimum and the
# 2nd percentile (same for the 99th), so a 10x tighter tolerance is used to keep
# the tail percentiles meaningful.
FARE_QUANTILE_REL_ERROR = 0.001

quantiles_fare = df_population.approxQuantile(
    "fare_amount", [0.01, 0.25, 0.50, 0.75, 0.99], FARE_QUANTILE_REL_ERROR
)
Q1, Q3 = quantiles_fare[1], quantiles_fare[3]
IQR = Q3 - Q1

print("=== ANALYSE FARE_AMOUNT ===")
print(f"Q1 (25%): ${Q1:.2f}")
print(f"M√©diane (50%): ${quantiles_fare[2]:.2f}")
print(f"Q3 (75%): ${Q3:.2f}")
print(f"IQR: ${IQR:.2f}")
print(f"1er percentile: ${quantiles_fare[0]:.2f}")
print(f"99e percentile: ${quantiles_fare[4]:.2f}")

# Outlier fences: 1.5 x IQR below Q1 / above Q3
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
print(f"\nLimites outliers (IQR ¬±1.5):")
print(f"Limite inf√©rieure: ${lower_bound:.2f}")
print(f"Limite sup√©rieure: ${upper_bound:.2f}")

# Rows outside the fences (rows with a NULL fare match neither comparison)
nb_outliers = df_population.filter(
    (col("fare_amount") < lower_bound) | (col("fare_amount") > upper_bound)
).count()
pct_outliers = (nb_outliers / nb_population) * 100

print(f"\nüìä Outliers d√©tect√©s: {nb_outliers:,} ({pct_outliers:.2f}%)")

# COMMAND ----------

In [0]:
# MAGIC %md
# MAGIC ## üìà 4. Analyse par Statistiques Inf√©rentielles (√âchantillon)

# COMMAND ----------
import numpy as np
import pandas as pd
from scipy.stats import t, norm
import scipy.stats as stats
from pyspark.sql.functions import col, count, mean, stddev
from pyspark.sql.functions import unix_timestamp

# MAGIC %md
# MAGIC ### 4.1 Prix moyen avec intervalle de confiance

# COMMAND ----------

# Mean fare of the sample with a 95% Student-t confidence interval.
fare_sample_pd = df_sample.select("fare_amount").toPandas()['fare_amount']

# n is the full sample size (later cells read it); n_fare is the number of
# non-missing fares. pandas mean()/std() skip NaN, so the standard error and the
# degrees of freedom must be based on n_fare, not on the raw length.
n = len(fare_sample_pd)
n_fare = int(fare_sample_pd.count())
mean_fare = fare_sample_pd.mean()
std_fare = fare_sample_pd.std()
se_fare = std_fare / np.sqrt(n_fare)

# Two-sided 95% interval
confidence_level = 0.95
alpha = 1 - confidence_level
df_freedom = n_fare - 1
t_critical = t.ppf(1 - alpha/2, df_freedom)

margin_error = t_critical * se_fare
ic_inf = mean_fare - margin_error
ic_sup = mean_fare + margin_error

print("=== PRIX MOYEN (FARE_AMOUNT) - INF√âRENCE ===")
print(f"Taille √©chantillon: {n_fare:,}")
print(f"Prix moyen estim√©: ${mean_fare:.2f}")
print(f"√âcart-type: ${std_fare:.2f}")
print(f"Erreur standard: ${se_fare:.4f}")
print(f"t-critique (95%): {t_critical:.3f}")
print(f"Marge d'erreur: ${margin_error:.2f}")
print(f"\nüéØ IC 95%: [${ic_inf:.2f}, ${ic_sup:.2f}]")

# Kept for the comparison with the population figures
results_inferential = {
    'mean_fare': mean_fare,
    'ic_fare_inf': ic_inf,
    'ic_fare_sup': ic_sup
}

# COMMAND ----------

# MAGIC %md
# MAGIC ### 4.2 Distance moyenne avec IC

# COMMAND ----------

# Mean trip distance of the sample with its confidence interval
# (same confidence level as the fare interval, through `alpha`).
distance_sample_pd = df_sample.select("trip_distance").toPandas()['trip_distance']

# Use this column's own count of non-missing values: pandas mean()/std() skip
# NaN, and the fare column's size is not guaranteed to match.
n_distance = int(distance_sample_pd.count())
mean_distance = distance_sample_pd.mean()
std_distance = distance_sample_pd.std()
se_distance = std_distance / np.sqrt(n_distance)
t_critical_dist = t.ppf(1 - alpha/2, n_distance - 1)
margin_error_dist = t_critical_dist * se_distance

ic_dist_inf = mean_distance - margin_error_dist
ic_dist_sup = mean_distance + margin_error_dist

print("=== DISTANCE MOYENNE (TRIP_DISTANCE) - INF√âRENCE ===")
print(f"Distance moyenne estim√©e: {mean_distance:.2f} miles")
print(f"√âcart-type: {std_distance:.2f}")
print(f"üéØ IC 95%: [{ic_dist_inf:.2f}, {ic_dist_sup:.2f}] miles")

results_inferential['mean_distance'] = mean_distance
results_inferential['ic_dist_inf'] = ic_dist_inf
results_inferential['ic_dist_sup'] = ic_dist_sup

# COMMAND ----------

# MAGIC %md
# MAGIC ### 4.3 Dur√©e moyenne avec IC

# COMMAND ----------

# Mean trip duration (minutes) of the sample with its confidence interval.
# Duration = (dropoff - pickup) in seconds, divided by 60.
# NOTE(review): cast("long") yields epoch seconds only if the CSV reader inferred
# these two columns as timestamps; if they were read as strings the result is
# NULL - confirm with df_sample.printSchema().
df_sample_duration = df_sample.withColumn(
    "duration_minutes",
    (col("tpep_dropoff_datetime").cast("long")
     - col("tpep_pickup_datetime").cast("long")) / 60
)

duration_sample_pd = df_sample_duration.select("duration_minutes").toPandas()['duration_minutes']

# Use this column's own count of non-missing values: pandas mean()/std() skip
# NaN, and the fare column's size is not guaranteed to match.
n_duration = int(duration_sample_pd.count())
mean_duration = duration_sample_pd.mean()
std_duration = duration_sample_pd.std()
se_duration = std_duration / np.sqrt(n_duration)
t_critical_dur = t.ppf(1 - alpha/2, n_duration - 1)
margin_error_dur = t_critical_dur * se_duration

ic_dur_inf = mean_duration - margin_error_dur
ic_dur_sup = mean_duration + margin_error_dur

print("=== DUR√âE MOYENNE - INF√âRENCE ===")
print(f"Dur√©e moyenne estim√©e: {mean_duration:.2f} minutes")
print(f"√âcart-type: {std_duration:.2f}")
print(f"üéØ IC 95%: [{ic_dur_inf:.2f}, {ic_dur_sup:.2f}] minutes")

results_inferential['mean_duration'] = mean_duration
results_inferential['ic_dur_inf'] = ic_dur_inf
results_inferential['ic_dur_sup'] = ic_dur_sup

# COMMAND ----------

# MAGIC %md
# MAGIC ### 4.4 Proportion avec tip > 0 et IC

# COMMAND ----------

# Share of sampled trips with a strictly positive tip, with a
# normal-approximation confidence interval clipped to [0, 1].
tipped_trips = df_sample.where(col("tip_amount") > 0)
nb_with_tip = tipped_trips.count()
p_sample = nb_with_tip / n

# Standard normal critical value for the two-sided interval
z_critical = norm.ppf(1 - alpha/2)
se_prop = np.sqrt(p_sample * (1 - p_sample) / n)
margin_error_prop = z_critical * se_prop

ic_prop_inf = max(0, p_sample - margin_error_prop)
ic_prop_sup = min(1, p_sample + margin_error_prop)

print("=== PROPORTION AVEC TIP > 0 - INF√âRENCE ===")
print(f"Nombre avec tip: {nb_with_tip:,}")
print(f"Proportion estim√©e: {p_sample:.2%}")
print(f"z-critique (95%): {z_critical:.3f}")
print(f"üéØ IC 95%: [{ic_prop_inf:.2%}, {ic_prop_sup:.2%}]")

# Kept for the comparison with the population figures
results_inferential.update({
    'prop_tip': p_sample,
    'ic_prop_inf': ic_prop_inf,
    'ic_prop_sup': ic_prop_sup,
})

# COMMAND ----------

# MAGIC %md
# MAGIC ### 4.5 Ratio tip/fare moyen par type de paiement

# COMMAND ----------

# Tip/fare ratio on the sample, restricted to rows with a strictly positive fare
# and a non-negative tip.
valid_tip_rows = (col("fare_amount") > 0) & (col("tip_amount") >= 0)
df_sample_ratio = (
    df_sample
    .filter(valid_tip_rows)
    .withColumn("tip_ratio", col("tip_amount") / col("fare_amount"))
)

# Per payment type: number of trips, mean and standard deviation of the ratio,
# most frequent payment type first
ratio_by_payment = (
    df_sample_ratio
    .groupBy("payment_type")
    .agg(
        count("*").alias("nb_courses"),
        mean("tip_ratio").alias("ratio_tip_fare_moyen"),
        stddev("tip_ratio").alias("std_ratio"),
    )
    .orderBy(col("nb_courses").desc())
)

print("=== RATIO TIP/FARE PAR TYPE DE PAIEMENT - INF√âRENCE ===")
display(ratio_by_payment)

# 95% t-interval half-width of the mean ratio, per payment type
ratio_pd = df_sample_ratio.select("payment_type", "tip_ratio").toPandas()
for payment_type in ratio_pd['payment_type'].unique():
    data = ratio_pd.loc[ratio_pd['payment_type'] == payment_type, 'tip_ratio']
    if len(data) <= 30:
        # Groups of 30 observations or fewer are skipped
        continue
    n_r = len(data)
    mean_r = data.mean()
    std_r = data.std()
    se_r = std_r / np.sqrt(n_r)
    me_r = t.ppf(0.975, n_r-1) * se_r
    print(f"Payment {payment_type}: {mean_r:.2%} ¬± {me_r:.2%}")

# COMMAND ----------

# MAGIC %md
# MAGIC ### 4.6 Test d'hypothèse : Comparaison Carte vs Cash

# COMMAND ----------

from pyspark.sql.functions import col
import scipy.stats as stats

def _positive_fares(payment_code):
    """Return the sample's strictly positive fares for one payment_type code, as a pandas Series."""
    selection = (col("payment_type") == payment_code) & (col("fare_amount") > 0)
    return df_sample.filter(selection).select("fare_amount").toPandas()['fare_amount']


# payment_type 1 is handled as card and 2 as cash
card_fares = _positive_fares(1)
cash_fares = _positive_fares(2)

# Welch t-test (independent samples, unequal variances), run only when both
# groups have more than 30 observations
if len(card_fares) <= 30 or len(cash_fares) <= 30:
    print("‚ö†Ô∏è Taille d‚Äô√©chantillon insuffisante pour le test")
else:
    t_stat, p_value = stats.ttest_ind(card_fares, cash_fares, equal_var=False)

    print("=== TEST D'HYPOTH√àSE : CARTE vs CASH ===")
    print(f"Carte - Prix moyen: ${card_fares.mean():.2f} (n={len(card_fares)})")
    print(f"Cash  - Prix moyen: ${cash_fares.mean():.2f} (n={len(cash_fares)})")
    print(f"\nt-statistique: {t_stat:.3f}")
    print(f"p-value: {p_value:.6f}")

    if not (p_value < 0.05):
        print("\n‚ùå Diff√©rence NON significative (p ‚â• 0.05)")
        print("üëâ Aucune preuve d‚Äôun effet du mode de paiement")
    else:
        print("\n‚úÖ Diff√©rence SIGNIFICATIVE (p < 0.05)")
        print("üëâ Le mode de paiement influence le prix moyen de la course")


# COMMAND ----------

In [0]:
# MAGIC %md
# MAGIC ## üíæ 5. Analyse Big Data (Population Compl√®te)

# COMMAND ----------

# MAGIC %md
# MAGIC ### 5.1 Calcul des m√©triques EXACTES

# COMMAND ----------

import time

from pyspark.sql.functions import col, mean

# Exact metrics on the full population, with the wall-clock time they take.
start_time = time.time()

print("=== CALCUL DES M√âTRIQUES EXACTES (POPULATION) ===")
print("üîÑ Calcul en cours...")

# Exact mean fare
mean_fare_exact = df_population.agg(mean("fare_amount")).collect()[0][0]
print(f"‚úÖ Prix moyen EXACT: ${mean_fare_exact:.2f}")

# Exact mean distance
mean_distance_exact = df_population.agg(mean("trip_distance")).collect()[0][0]
print(f"‚úÖ Distance moyenne EXACTE: {mean_distance_exact:.2f} miles")

# Exact mean duration.
# Step 1: duration in minutes, derived from df_population. The frame cannot be
# built from itself: on a fresh kernel it does not exist yet.
df_population_duration = df_population.withColumn(
    "duration_minutes",
    (col("tpep_dropoff_datetime").cast("long") - col("tpep_pickup_datetime").cast("long")) / 60
)

# Step 2: quick visual check of the new column
df_population_duration.select("duration_minutes").show(5)

# Step 3: exact mean
mean_duration_exact = df_population_duration.agg(
    mean("duration_minutes")
).collect()[0][0]

print(f"Dur√©e moyenne exacte (population) : {mean_duration_exact:.2f} minutes")


# Exact share of trips with a strictly positive tip
nb_with_tip_exact = df_population.filter(col("tip_amount") > 0).count()
prop_tip_exact = nb_with_tip_exact / nb_population
print(f"‚úÖ Proportion EXACTE avec tip: {prop_tip_exact:.2%}")

# Elapsed time for everything above (includes the show(5) check)
elapsed_time = time.time() - start_time
print(f"\n‚è±Ô∏è Temps de calcul: {elapsed_time:.2f} secondes")

# Kept for the comparison with the sample-based estimates
results_bigdata = {
    'mean_fare': mean_fare_exact,
    'mean_distance': mean_distance_exact,
    'mean_duration': mean_duration_exact,
    'prop_tip': prop_tip_exact,
    'compute_time': elapsed_time
}

# COMMAND ----------

# MAGIC %md
# MAGIC ### 5.2 Distribution temporelle - Heures de pointe

# COMMAND ----------

# Calendar columns derived from the pickup timestamp.
# The pickup column is `tpep_pickup_datetime`, the name used by the duration
# computations on both datasets; `pickup_datetime` is not referenced anywhere else.
PICKUP_TS = "tpep_pickup_datetime"
df_population_time = df_population.withColumn("hour", hour(PICKUP_TS)) \
                                   .withColumn("dayofweek", dayofweek(PICKUP_TS)) \
                                   .withColumn("day", dayofmonth(PICKUP_TS)) \
                                   .withColumn("month", month(PICKUP_TS))

# Trips, mean fare and mean distance per pickup hour
courses_par_heure = df_population_time.groupBy("hour").agg(
    count("*").alias("nb_courses"),
    mean("fare_amount").alias("fare_moyen"),
    mean("trip_distance").alias("distance_moyenne")
).orderBy("hour")

print("=== DISTRIBUTION PAR HEURE ===")
display(courses_par_heure)

# Peak hours: the 5 hours with the most trips
print("\n=== TOP 5 HEURES DE POINTE ===")
heures_pointe = courses_par_heure.orderBy(col("nb_courses").desc()).limit(5)
display(heures_pointe)

# COMMAND ----------

# MAGIC %md
# MAGIC ### 5.3 Distribution par jour de la semaine

# COMMAND ----------

from itertools import chain

from pyspark.sql.functions import create_map, lit

# Day-of-week number -> French day name (1 = Sunday ... 7 = Saturday)
jours_mapping = {
    1: "Dimanche", 2: "Lundi", 3: "Mardi", 4: "Mercredi",
    5: "Jeudi", 6: "Vendredi", 7: "Samedi"
}

# Spark map literal built from the dict (key1, value1, key2, value2, ...),
# used as a lookup so the day names actually appear in the result.
jour_lookup = create_map(*[lit(x) for x in chain.from_iterable(jours_mapping.items())])

# Trips and mean fare per day of the week, with the day name added
courses_par_jour = (
    df_population_time
    .groupBy("dayofweek")
    .agg(
        count("*").alias("nb_courses"),
        mean("fare_amount").alias("fare_moyen")
    )
    .withColumn("jour", jour_lookup[col("dayofweek")])
    .orderBy("dayofweek")
)

print("=== DISTRIBUTION PAR JOUR DE LA SEMAINE ===")
display(courses_par_jour)

# COMMAND ----------

# MAGIC %md
# MAGIC ### 5.4 Analyse g√©ographique par boroughs

# COMMAND ----------

# Per pickup borough: trips, mean fare, mean distance, mean tip, and the mean tip
# expressed as a percentage of the mean fare; busiest borough first.
# NOTE(review): `pickup_borough` / `dropoff_borough` are not created anywhere in
# the visible notebook - confirm the population files contain them.
borough_metrics = [
    count("*").alias("nb_courses"),
    mean("fare_amount").alias("fare_moyen"),
    mean("trip_distance").alias("distance_moyenne"),
    mean("tip_amount").alias("tip_moyen"),
    (mean("tip_amount") / mean("fare_amount") * 100).alias("tip_pct_fare"),
]
analyse_boroughs = (
    df_population
    .groupBy("pickup_borough")
    .agg(*borough_metrics)
    .orderBy(col("nb_courses").desc())
)

print("=== ANALYSE PAR BOROUGH (PICKUP) ===")
display(analyse_boroughs)

# The 10 most frequent (pickup borough, dropoff borough) pairs
routes_grouped = df_population.groupBy("pickup_borough", "dropoff_borough")
top_routes = (
    routes_grouped
    .agg(
        count("*").alias("nb_courses"),
        mean("fare_amount").alias("fare_moyen"),
        mean("trip_distance").alias("distance_moyenne"),
    )
    .orderBy(col("nb_courses").desc())
    .limit(10)
)

print("\n=== TOP 10 ROUTES (ORIGINE ‚Üí DESTINATION) ===")
display(top_routes)

# COMMAND ----------

# MAGIC %md
# MAGIC ### 5.5 D√©tection et analyse des outliers

# COMMAND ----------

# Outlier analysis on fare_amount (population): percentiles, IQR fences,
# outlier count and the most extreme fares.
print("=== ANALYSE DES OUTLIERS ===")

# approxQuantile's third argument is the tolerated error on the quantile RANK.
# With 0.01 the 1st / 99th percentiles may land anywhere within one percentile
# of the requested rank, so a 10x tighter tolerance is used.
OUTLIER_QUANTILE_REL_ERROR = 0.001

# Fare amount
quantiles_fare_full = df_population.approxQuantile(
    "fare_amount",
    [0.01, 0.05, 0.25, 0.50, 0.75, 0.95, 0.99],
    OUTLIER_QUANTILE_REL_ERROR,
)
Q1_fare, Q3_fare = quantiles_fare_full[2], quantiles_fare_full[4]
IQR_fare = Q3_fare - Q1_fare
lower_bound_fare = Q1_fare - 1.5 * IQR_fare
upper_bound_fare = Q3_fare + 1.5 * IQR_fare

print(f"\n--- FARE_AMOUNT ---")
print(f"1er percentile: ${quantiles_fare_full[0]:.2f}")
print(f"5e percentile: ${quantiles_fare_full[1]:.2f}")
print(f"95e percentile: ${quantiles_fare_full[5]:.2f}")
print(f"99e percentile: ${quantiles_fare_full[6]:.2f}")
print(f"Limites IQR: [${lower_bound_fare:.2f}, ${upper_bound_fare:.2f}]")

# Rows outside the IQR fences
outliers_fare = df_population.filter(
    (col("fare_amount") < lower_bound_fare) | (col("fare_amount") > upper_bound_fare)
)
nb_outliers_fare = outliers_fare.count()
pct_outliers_fare = (nb_outliers_fare / nb_population) * 100

print(f"Outliers: {nb_outliers_fare:,} ({pct_outliers_fare:.2f}%)")

# The 10 highest fares above the 99th percentile. Sorting before limit() makes
# the selection deterministic and shows the actual extremes instead of 10
# arbitrary rows.
print("\n--- COURSES AVEC PRIX EXTR√äMES ---")
extremes_high = df_population.filter(col("fare_amount") > quantiles_fare_full[6]).select(
    "fare_amount", "trip_distance", "pickup_borough", "dropoff_borough"
).orderBy(col("fare_amount").desc()).limit(10)
display(extremes_high)

# COMMAND ----------

# MAGIC %md
# MAGIC ### 5.6 Ratio tip/fare par type de paiement (population)

# COMMAND ----------

# Tip/fare ratio on the whole population, restricted to rows with a strictly
# positive fare and a non-negative tip.
valid_population_rows = (col("fare_amount") > 0) & (col("tip_amount") >= 0)
df_population_ratio = (
    df_population
    .filter(valid_population_rows)
    .withColumn("tip_ratio", col("tip_amount") / col("fare_amount"))
)

# Per payment type: trips, mean and standard deviation of the ratio, mean tip
# and mean fare; most frequent payment type first
payment_metrics = [
    count("*").alias("nb_courses"),
    mean("tip_ratio").alias("ratio_tip_fare_moyen"),
    stddev("tip_ratio").alias("std_ratio"),
    mean("tip_amount").alias("tip_moyen"),
    mean("fare_amount").alias("fare_moyen"),
]
ratio_by_payment_exact = (
    df_population_ratio
    .groupBy("payment_type")
    .agg(*payment_metrics)
    .orderBy(col("nb_courses").desc())
)

print("=== RATIO TIP/FARE PAR TYPE DE PAIEMENT - POPULATION EXACTE ===")
display(ratio_by_payment_exact)

# COMMAND ----------