In [1]:
"""
iris_functional_pipeline.py

Pipeline de traitement fonctionnel du dataset Iris
Respect des principes de la programmation fonctionnelle :
- Fonctions pures
- Immutabilité
- Abstractions fonctionnelles : map, filter, reduce
"""

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from functools import reduce
from typing import Callable, List


# -------------------------------
# 1. Chargement des données (fonction pure)
# -------------------------------
def load_data(path: str) -> pd.DataFrame:
    return pd.read_csv(path)


# -------------------------------
# 2. Nettoyage des données
# -------------------------------
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    return df.dropna()


# -------------------------------
# 3. Mapping des features numériques (immuable)
# -------------------------------
def extract_numeric_features(df: pd.DataFrame) -> pd.DataFrame:
    return df.select_dtypes(include=['float64'])


# -------------------------------
# 4. Agrégation fonctionnelle : moyenne des colonnes
# -------------------------------
def average_column(col: pd.Series) -> float:
    return col.mean()


def average_all_columns(df: pd.DataFrame) -> dict:
    return dict(map(lambda col: (col, average_column(df[col])), df.columns))


# -------------------------------
# 5. Visualisation fonctionnelle
# -------------------------------
def plot_correlation_heatmap(df: pd.DataFrame, save_path="correlation_heatmap.png") -> None:
    corr = df.corr()
    sns.heatmap(corr, annot=True, cmap='coolwarm')
    plt.title("Matrice de corrélation")
    plt.tight_layout()
    plt.savefig(save_path)
    plt.clf()


def plot_scatter(df: pd.DataFrame, save_path="scatter_plot.png") -> None:
    sns.pairplot(df)
    plt.suptitle("Scatter matrix des features", y=1.02)
    plt.tight_layout()
    plt.savefig(save_path)
    plt.clf()


# -------------------------------
# 6. Clustering (k-means)
# -------------------------------
def perform_clustering(df: pd.DataFrame, n_clusters: int = 3) -> List[int]:
    model = KMeans(n_clusters=n_clusters, random_state=42)
    model.fit(df)
    return model.labels_


# -------------------------------
# 7. Pipeline principal
# -------------------------------
def iris_pipeline(csv_path: str) -> None:
    # Étapes fonctionnelles chaînées
    df = load_data(csv_path)
    df = clean_data(df)
    df_numeric = extract_numeric_features(df)

    # Statistiques
    averages = average_all_columns(df_numeric)
    print("Moyennes des colonnes :")
    for col, val in averages.items():
        print(f"{col} : {val:.2f}")

    # Visualisations
    plot_correlation_heatmap(df_numeric)
    plot_scatter(df_numeric)

    # Clustering
    labels = perform_clustering(df_numeric)
    df['Cluster'] = labels

    # Sauvegarde du résultat
    df.to_csv("iris_with_clusters.csv", index=False)
    print("Pipeline terminé. Résultats enregistrés dans 'iris_with_clusters.csv'")


# -------------------------------
# Exécution
# -------------------------------
if __name__ == "__main__":
    iris_pipeline("Iris.csv")


FileNotFoundError: [Errno 2] No such file or directory: 'Iris.csv'

In [2]:
"""
iris_functional_pipeline.py

Pipeline de traitement fonctionnel du dataset Iris
Respect des principes de la programmation fonctionnelle :
- Fonctions pures
- Immutabilité
- Abstractions fonctionnelles : map, filter, reduce
"""

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from functools import reduce
from typing import Callable, List
import requests # Import the requests library

# -------------------------------
# Helper function to download the dataset
# -------------------------------
def download_iris_dataset(url: str, filename: str) -> None:
    """Downloads the Iris dataset from a given URL."""
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
        with open(filename, 'wb') as f:
            f.write(response.content)
        print(f"Successfully downloaded {filename}")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading the dataset: {e}")


# -------------------------------
# 1. Chargement des données (fonction pure)
# -------------------------------
def load_data(path: str) -> pd.DataFrame:
    return pd.read_csv(path)


# -------------------------------
# 2. Nettoyage des données
# -------------------------------
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    return df.dropna()


# -------------------------------
# 3. Mapping des features numériques (immuable)
# -------------------------------
def extract_numeric_features(df: pd.DataFrame) -> pd.DataFrame:
    return df.select_dtypes(include=['float64', 'int64']) # Include int64 for the ID column


# -------------------------------
# 4. Agrégation fonctionnelle : moyenne des colonnes
# -------------------------------
def average_column(col: pd.Series) -> float:
    return col.mean()


def average_all_columns(df: pd.DataFrame) -> dict:
    # Filter out non-numeric columns before calculating averages
    numeric_df = df.select_dtypes(include=['float64', 'int64'])
    return dict(map(lambda col: (col, average_column(numeric_df[col])), numeric_df.columns))


# -------------------------------
# 5. Visualisation fonctionnelle
# -------------------------------
def plot_correlation_heatmap(df: pd.DataFrame, save_path="correlation_heatmap.png") -> None:
    # Ensure only numeric data is used for correlation matrix
    numeric_df = df.select_dtypes(include=['float64', 'int64'])
    corr = numeric_df.corr()
    sns.heatmap(corr, annot=True, cmap='coolwarm')
    plt.title("Matrice de corrélation")
    plt.tight_layout()
    plt.savefig(save_path)
    plt.clf()


def plot_scatter(df: pd.DataFrame, save_path="scatter_plot.png") -> None:
    # Remove non-numeric columns for pairplot
    numeric_df = df.select_dtypes(include=['float64', 'int64'])
    sns.pairplot(numeric_df)
    plt.suptitle("Scatter matrix des features", y=1.02)
    plt.tight_layout()
    plt.savefig(save_path)
    plt.clf()


# -------------------------------
# 6. Clustering (k-means)
# -------------------------------
def perform_clustering(df: pd.DataFrame, n_clusters: int = 3) -> List[int]:
    # Use only numeric features for clustering
    numeric_df = df.select_dtypes(include=['float64', 'int64'])
    model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10) # Added n_init for newer KMeans versions
    model.fit(numeric_df)
    return model.labels_


# -------------------------------
# 7. Pipeline principal
# -------------------------------
def iris_pipeline(csv_path: str) -> None:
    # Étapes fonctionnelles chaînées
    df = load_data(csv_path)
    df = clean_data(df)
    df_numeric = extract_numeric_features(df)

    # Statistiques
    averages = average_all_columns(df) # Use original df to show averages for all numeric columns
    print("Moyennes des colonnes :")
    for col, val in averages.items():
        print(f"{col} : {val:.2f}")

    # Visualisations
    plot_correlation_heatmap(df) # Use original df for correlation heatmap
    plot_scatter(df) # Use original df for scatter plot

    # Clustering
    labels = perform_clustering(df) # Use original df for clustering after cleaning
    df['Cluster'] = labels

    # Sauvegarde du résultat
    df.to_csv("iris_with_clusters.csv", index=False)
    print("Pipeline terminé. Résultats enregistrés dans 'iris_with_clusters.csv'")


# -------------------------------
# Exécution
# -------------------------------
if __name__ == "__main__":
    iris_csv_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
    iris_csv_filename = "Iris.csv"

    # Download the dataset before running the pipeline
    download_iris_dataset(iris_csv_url, iris_csv_filename)

    # Run the pipeline if the download was successful (check if file exists)
    import os
    if os.path.exists(iris_csv_filename):
        iris_pipeline(iris_csv_filename)
    else:
        print(f"Could not run the pipeline because {iris_csv_filename} was not downloaded.")

Successfully downloaded Iris.csv
Moyennes des colonnes :
5.1 : 5.85
3.5 : 3.05
1.4 : 3.77
0.2 : 1.21
Pipeline terminé. Résultats enregistrés dans 'iris_with_clusters.csv'


<Figure size 640x480 with 0 Axes>

<Figure size 1000x1000 with 0 Axes>

In [None]:
from google.colab import drive
drive.mount('/content/drive')