In [23]:
import os

In [24]:
%pwd

'c:\\Users\\Lenovo\\Desktop\\stb_pfe_mlflow'

In [3]:
# Step one directory up from the current working directory, then display the result.
# NOTE(review): a relative chdir is not idempotent; every re-run of this cell moves one
# level further up. The execution count of this cell is also out of sequence with its
# neighbours. Presumably it is meant to run exactly once from a sub-folder of the
# project root; confirm.
os.chdir("../")
%pwd

'c:\\Users\\Lenovo\\Desktop\\stb_pfe_mlflow'

In [25]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd

In [26]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable (frozen) settings for the data transformation stage."""
    # Directory of the stage; it is created by ConfigurationManager and the transformed
    # CSV is written into it.
    root_dir: Path
    # Taken from the `data_transformation.data_path` entry of the config file.
    # Presumably the location of the input dataset; confirm against config.yaml.
    data_path: Path

In [27]:
from stb_pfe_mlflow.constants import *
from stb_pfe_mlflow.utils.common import read_yaml, create_directories

In [28]:
class ConfigurationManager:
    """Reads the project's YAML files and hands out per-stage configuration objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Load the config, params and schema files and create the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        # Files are read in the order config -> params -> schema, each result being
        # stored on the attribute of the same name.
        yaml_sources = (
            ("config", config_filepath),
            ("params", params_filepath),
            ("schema", schema_filepath),
        )
        for attribute, filepath in yaml_sources:
            setattr(self, attribute, read_yaml(filepath))

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Create the stage directory and return the data transformation settings.

        Returns:
            A DataTransformationConfig filled from the `data_transformation` section
            of the loaded configuration.
        """
        section = self.config.data_transformation

        # The stage directory must exist before anything is written into it.
        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
        )

In [29]:
import os
from stb_pfe_mlflow import logger
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [30]:
# Load the cleaned dataset and preview its first rows.
# NOTE(review): the preview contains an "Unnamed: 0" column, presumably a DataFrame index
# that an earlier step wrote to the CSV. Confirm whether it should be read with
# index_col=0 (or not written at all) so it is not treated as data.
df= pd.read_csv("artifacts/data_cleaning/clean_data.csv")
df.head()

Unnamed: 0,tiers_key,ca,TOTMVTC,TOTMVTD,TOTMVTCnet,TOTMVTDnet,ENG,MontImp,encours,Encours_Moyen_Debiteur,...,Code_Classe,Code_Profession,Profession,Code_Activite_Economique,Activite_Economique,Code_secteur_activite,Secteur_Activite,Ville,Date_Ouverture,ancienneté
0,174,139474.073,0.0,-274.96,0.0,-261.17,0,0.0,0.0,1420.500876,...,1.0,P1099,AUTRES COMMERCANTS ET ASSIMILES,HAB99,TRANSPORTS FERROVIAIRES DE FRET,HA,TRANSPORTS ET ENTREPOSAGE,BAB BHAR,1958-04-05,66
1,370,139474.073,0.0,-483.78,0.0,-450.69,0,0.0,0.0,2311.483331,...,1.0,P1099,AUTRES COMMERCANTS ET ASSIMILES,GCG99,AUTRES COMMERCES DE DETAIL DE BIENS NEUFS EN M...,GB,COMMERCE; REPARATION D'AUTOMOBILES ET DE MOTOC...,TUNIS,1992-04-14,32
2,481,139474.073,3485166.04,-3473246.26,2971426.56,-3521324.16,0,0.0,0.0,1832.362192,...,0.0,P1099,AUTRES COMMERCANTS ET ASSIMILES,DAA04,COMMERCE D'ELECTRICITE,DA,"PRODUCTION ET DISTRIBUTION D'ELECTRICITE, DE G...",TUNIS BELVEDERE,2000-01-01,24
3,578,139474.073,0.0,-362.46,0.0,-341.15,0,0.0,0.0,1500.732704,...,0.0,P1099,AUTRES COMMERCANTS ET ASSIMILES,CWA01,FABRICATION DE MEUBLES DE BUREAU ET DE MAGASIN,CW,INDUSTRIE MANUFACTURIERE,TUNIS BELVEDERE,1992-05-07,32
4,633,3171.408,3224.32,-7610.75,3224.32,-7610.75,0,0.0,0.0,1832.362192,...,0.0,P1099,AUTRES COMMERCANTS ET ASSIMILES,GBG05,COMMERCE DE GROS DE QUINCAILLERIE ET FOURNITUR...,GB,COMMERCE; REPARATION D'AUTOMOBILES ET DE MOTOC...,REPUBLIQUE,1992-02-20,32


In [31]:
class DataTransformation:
    """Imputes, encodes and scales the cleaned dataset, then labels rows with K-Means.

    The stage is self-contained: it loads its input from ``config.data_path`` (or uses
    a frame passed explicitly) and never modifies the frame it was given, so it can be
    re-run safely.

    NOTE(review): every float64/int64 column that is not one-hot encoded is scaled and
    fed to K-Means. That includes identifier-like columns if the input has any (for
    example an index column serialized into the CSV); confirm this is intended.
    """

    # Columns that are one-hot encoded before clustering.
    CATEGORICAL_COLS = (
        'ENG', 'Code_Profession', 'Profession', 'Code_Activite_Economique',
        'Activite_Economique', 'Code_secteur_activite', 'Secteur_Activite', 'Ville',
    )
    # Columns removed before any processing.
    DROPPED_COLS = ("Date_Ouverture",)

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration (``root_dir`` and ``data_path`` are used)."""
        self.config = config

    def _clean(self, data):
        """Return a copy of ``data`` without the dropped columns and without NaNs.

        float64/int64 columns are filled with their median, object columns with their
        most frequent value. The input frame is left untouched.
        """
        # drop() returns a new frame; errors="ignore" keeps the step re-runnable on data
        # from which the columns were already removed.
        cleaned = data.drop(columns=list(self.DROPPED_COLS), errors="ignore")

        numeric_cols = cleaned.select_dtypes(include=['float64', 'int64']).columns
        object_cols = cleaned.select_dtypes(include=['object']).columns

        if len(numeric_cols):
            cleaned[numeric_cols] = cleaned[numeric_cols].fillna(cleaned[numeric_cols].median())
        if len(object_cols):
            modes = cleaned[object_cols].mode()
            # mode() is empty when there are no rows; iloc[0] would raise in that case.
            if not modes.empty:
                cleaned[object_cols] = cleaned[object_cols].fillna(modes.iloc[0])
        return cleaned

    def _scaled_columns(self, cleaned):
        """Return the float64/int64 columns that still exist after one-hot encoding."""
        # A column listed in CATEGORICAL_COLS is replaced by its dummy columns, so
        # asking the scaler for it afterwards would raise a KeyError.
        encoded_away = set(self.CATEGORICAL_COLS)
        numeric_cols = cleaned.select_dtypes(include=['float64', 'int64']).columns
        return [col for col in numeric_cols if col not in encoded_away]

    def _build_features(self, cleaned):
        """Return the clustering matrix: one-hot encoded categoricals, scaled numerics."""
        features = pd.get_dummies(cleaned, columns=list(self.CATEGORICAL_COLS))
        scale_cols = self._scaled_columns(cleaned)
        if scale_cols:
            features[scale_cols] = StandardScaler().fit_transform(features[scale_cols])
        return features

    def elbow_inertia(self, features, k_values=range(1, 10), random_state=42):
        """Return the K-Means inertia for each k in ``k_values`` (Elbow method).

        This is an optional diagnostic. It is not called by ``transforming_data``
        because it fits one model per k and its result does not influence the output.
        """
        return [
            KMeans(n_clusters=k, random_state=random_state).fit(features).inertia_
            for k in k_values
        ]

    def transforming_data(self, data=None, n_clusters=3, random_state=42):
        """Cluster the dataset and save it, with a ``Cluster`` column, as CSV.

        Args:
            data: Optional DataFrame to transform. When omitted, the CSV at
                ``self.config.data_path`` is loaded.
                NOTE(review): assumes ``data_path`` points at the cleaned dataset;
                confirm against config.yaml.
            n_clusters: Number of K-Means clusters. Adjust according to the Elbow
                result or the silhouette score.
            random_state: Seed passed to K-Means.

        Returns:
            The imputed DataFrame (dropped columns removed) with the ``Cluster``
            column added. The same frame is written to
            ``<root_dir>/transforming_data.csv``.
        """
        source = pd.read_csv(self.config.data_path) if data is None else data

        cleaned = self._clean(source)
        features = self._build_features(cleaned)

        kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
        cleaned['Cluster'] = kmeans.fit_predict(features)

        # Save the final dataset in the configured directory.
        cleaned.to_csv(os.path.join(self.config.root_dir, "transforming_data.csv"), index=False)

        logger.info("Data transformation complete")
        logger.info(f"Data shape: {cleaned.shape}")
        return cleaned

In [32]:
# Run the data transformation stage end to end.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.transforming_data()
except Exception:
    # Record the failure with its traceback, then re-raise the original exception
    # unchanged (a bare `raise` does not add an extra traceback frame).
    logger.exception("Data transformation stage failed")
    raise

[2024-10-01 01:15:11,935: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-10-01 01:15:11,937: INFO: common: yaml file: params.yaml loaded successfully]
[2024-10-01 01:15:11,941: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-10-01 01:15:11,943: INFO: common: created directory at: artifacts]
[2024-10-01 01:15:11,945: INFO: common: created directory at: artifacts/data_transformation]
[2024-10-01 01:15:54,064: INFO: 3562172996: Data transformation complete]
[2024-10-01 01:15:54,065: INFO: 3562172996: Data shape: (12946, 22)]
