# In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
 
# Load the seaborn "tips" example dataset.
df = sns.load_dataset("tips")

# Preview: first rows, then column dtypes and non-null counts.
print(df.head())
# DataFrame.info() writes its report itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None".
df.info()

# Drop rows containing missing values.
df.dropna(inplace=True)

# Identify the variable types used by the rest of the analysis.
# The names must match the DataFrame columns exactly ("total_bill").
quantitative = ["total_bill", "tip", "size"]
qualitative = ["sex", "day", "time"]

 
# === STEP 1: CENTRAL TENDENCY MEASURES ===
# Mean, median and mode of every quantitative variable.
# mode() can return several values; the first one is reported.
for col in quantitative:
    print(f"{col} - Moyenne : {df[col].mean():.2f}, Médiane : {df[col].median():.2f}, Mode : {df[col].mode()[0]:.2f}")

# Mean of each quantitative variable per sex. The result is printed
# explicitly: a bare expression is discarded when run as a script (or when
# it is not the last statement of a notebook cell).
# observed=False keeps the historical grouping behaviour and avoids the
# pandas FutureWarning raised when the grouping key is categorical.
print(df.groupby("sex", observed=False)[quantitative].mean())
 
# === STEP 2: DISPERSION MEASURES ===
# For every quantitative variable, report:
#   - the range (max - min)
#   - the variance and the standard deviation
#   - the interquartile range (Q3 - Q1)
for col in quantitative:
    series = df[col]

    lowest = series.min()
    highest = series.max()
    spread = highest - lowest

    variance = series.var()
    std_dev = series.std()

    q3 = series.quantile(0.75)
    q1 = series.quantile(0.25)
    iqr = q3 - q1

    print(f"{col} - Min : {lowest}, Max : {highest}, Ecart : {spread}")
    print(f"{col} - Varience : {variance:.2f}, Ecart-type : {std_dev:.2f}")
    print(f"{col} - IQR : {iqr:.2f}")
    
 
# === STEP 3: VISUALIZATION ===
# Histogram of total_bill
# Boxplot of total_bill by sex
# Bar chart of transactions per day
# Scatter plot: size vs total_bill

# Distribution of the total bill, with a kernel density estimate overlaid.
sns.histplot(df["total_bill"], kde=True)
plt.title("Distribution du montant total")
plt.show()

# Total bill compared across sexes.
sns.boxplot(x="sex", y="total_bill", data=df)
plt.title("Dépenses par sexe")
plt.show()

# Number of transactions (rows) per day; this chart was listed in the plan
# above but was missing from the implementation.
sns.countplot(x="day", data=df)
plt.title("Nombre de transactions par jour")
plt.show()

# Relationship between party size and total bill.
sns.scatterplot(x="size", y="total_bill", data=df)
plt.title("Taille du groupe vs montant total")
plt.show()



    
 
# === STEP 4: CORRELATION ===
# Pearson correlation matrix of total_bill, tip and size
# (this includes the size vs total_bill correlation).
corr_matrix = df[["total_bill", "tip", "size"]].corr(method="pearson")
print(corr_matrix)

# heatmap() is a seaborn function; matplotlib.pyplot has no `heatmap`
# attribute, so calling plt.heatmap raises AttributeError.
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
plt.title("Corrélations entre variables")
plt.show()

# === STEP 5: SUMMARY ===

# Average spending per customer type (here: per sex).
# observed=False keeps the historical grouping behaviour and avoids the
# pandas FutureWarning raised when the grouping key is categorical.
print("Dépenses moyennes PAR sexe :")
print(df.groupby("sex", observed=False)["total_bill"].mean())

# Day with the most transactions: idxmax() returns the label (the day)
# holding the highest count.
print("Jour avec le plus de transactions :")
print(df["day"].value_counts().idxmax())

# Strongest correlation between two *different* variables.
# The matrix is flattened into a Series indexed by (row, column) pairs and
# the diagonal (each variable with itself, always 1) is masked out
# explicitly, instead of relying on drop_duplicates() plus a positional
# `[1]` lookup on a label-indexed Series.
print("Corrélation la plus forte :")
corr_pairs = corr_matrix.unstack()
first_var = corr_pairs.index.get_level_values(0)
second_var = corr_pairs.index.get_level_values(1)
off_diagonal = corr_pairs[first_var != second_var]
var_a, var_b = off_diagonal.idxmax()
print(f"{var_a} / {var_b} : {off_diagonal.max():.2f}")
