In [1]:
#Document qui vise à explorer le dataset de la base de données de KBO

#Importation des librairies
import os

import joblib
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
# Data folder, given relative to the notebook's working directory
data_dir = os.path.join('..', '..', 'data')

# Full path of every CSV file, built in a single pass.
# The order of the target names matches the order of the file names.
(
    activity_path,
    address_path,
    branch_path,
    code_path,
    contact_path,
    denomination_path,
    enterprise_path,
    establishment_path,
    meta_path,
) = (
    os.path.join(data_dir, csv_name)
    for csv_name in (
        'activity.csv',
        'address.csv',
        'branch.csv',
        'code.csv',
        'contact.csv',
        'denomination.csv',
        'enterprise.csv',
        'establishment.csv',
        'meta.csv',
    )
)

In [None]:
# Read every CSV file into a pandas DataFrame without declaring column types
# (pandas infers them; they are normalised in a later cell).
(
    activity_df,
    address_df,
    branch_df,
    code_df,
    contact_df,
    denomination_df,
    enterprise_df,
    establishment_df,
    meta_df,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        activity_path,
        address_path,
        branch_path,
        code_path,
        contact_path,
        denomination_path,
        enterprise_path,
        establishment_path,
        meta_path,
    )
)

In [None]:
# Show the column index of each loaded table, one table per line
print(
    *(
        table.columns
        for table in (
            activity_df,
            address_df,
            branch_df,
            code_df,
            contact_df,
            denomination_df,
            enterprise_df,
            establishment_df,
            meta_df,
        )
    ),
    sep='\n',
)

In [None]:
# Normalise the columns that may have been read with mixed types.
# The conversions are grouped by target type; each one is applied in place.
# NOTE(review): astype(str) appears to store missing values as the literal
# text 'nan', so later missing-value handling would not see them - confirm
# this is intended.

# Identifier and free-text columns -> str
for frame, text_columns in (
    (activity_df, ['EntityNumber', 'Classification']),
    (address_df, ['EntityNumber', 'HouseNumber']),
    (branch_df, ['Id', 'EnterpriseNumber']),
    (code_df, ['Category', 'Code', 'Language', 'Description']),
    (contact_df, ['EntityNumber', 'EntityContact', 'ContactType', 'Value']),
    (denomination_df, ['EntityNumber', 'Denomination']),
    (enterprise_df, ['EnterpriseNumber']),
    (establishment_df, ['EstablishmentNumber', 'EnterpriseNumber']),
    (meta_df, ['Variable', 'Value']),
):
    for column in text_columns:
        frame[column] = frame[column].astype(str)

# Numeric columns -> number, unparseable entries become NaN
for frame, numeric_columns in (
    (address_df, ['Zipcode']),
    (enterprise_df, ['JuridicalForm', 'JuridicalFormCAC']),
):
    for column in numeric_columns:
        frame[column] = pd.to_numeric(frame[column], errors='coerce')

# Date columns -> datetime, unparseable entries become NaT
for frame, date_columns in (
    (address_df, ['DateStrikingOff']),
    (branch_df, ['StartDate']),
    (enterprise_df, ['StartDate']),
    (establishment_df, ['StartDate']),
):
    for column in date_columns:
        frame[column] = pd.to_datetime(frame[column], errors='coerce')

In [None]:
# Down-sample the data (10% here) without breaking the join keys.
sample_fraction = 0.1

# The base table of the later left joins is sampled at random, as before.
activity_df = activity_df.sample(frac=sample_fraction, random_state=42)

# Sampling each related table independently keeps any given related row only
# with probability `sample_fraction`, so most keys of the activity sample would
# find no partner in the joins and the joined columns would be almost entirely
# NaN. Instead, keep exactly the related rows whose key occurs in the sample.
# NOTE(review): the joined result can be noticeably larger than before because
# every matching row is now retained - lower sample_fraction if memory is tight.
sampled_keys = set(activity_df['EntityNumber'])

# Tables keyed by EntityNumber
address_df = address_df[address_df['EntityNumber'].isin(sampled_keys)]
contact_df = contact_df[contact_df['EntityNumber'].isin(sampled_keys)]
denomination_df = denomination_df[denomination_df['EntityNumber'].isin(sampled_keys)]

# Tables keyed by EnterpriseNumber (joined against activity's EntityNumber)
branch_df = branch_df[branch_df['EnterpriseNumber'].isin(sampled_keys)]
enterprise_df = enterprise_df[enterprise_df['EnterpriseNumber'].isin(sampled_keys)]
establishment_df = establishment_df[establishment_df['EnterpriseNumber'].isin(sampled_keys)]

# code_df is not part of the joins in this notebook; it keeps its plain random sample.
code_df = code_df.sample(frac=sample_fraction, random_state=42)

In [None]:
# Left-join every table onto the activity rows, matching on the activity's
# EntityNumber (compared with EnterpriseNumber where the other table uses that name).
merged_df = (
    activity_df
    .merge(enterprise_df, how='left', left_on='EntityNumber', right_on='EnterpriseNumber')
    .merge(address_df, how='left', on='EntityNumber')
    .merge(branch_df, how='left', left_on='EntityNumber', right_on='EnterpriseNumber')
    .merge(contact_df, how='left', on='EntityNumber')
    .merge(denomination_df, how='left', on='EntityNumber')
    .merge(establishment_df, how='left', left_on='EntityNumber', right_on='EnterpriseNumber')
)




In [None]:
# Print summary statistics of the merged table
print(merged_df.describe())


In [None]:
# Columns removed before modelling (the original note describes them as
# constant columns). A missing name raises KeyError.
columns_to_drop = [
    'Status',
    'EnterpriseNumber_x',
    'EnterpriseNumber_y',
    'Id',
    'EntityContact',
    'ContactType',
    'Language',
    'TypeOfDenomination',
    'Denomination',
    'EstablishmentNumber',
    'EnterpriseNumber',
]
merged_df = merged_df.drop(columns_to_drop, axis=1)

In [None]:
# Replace the missing values of numeric and non-numeric columns
def fill_missing_values(df):
    """Fill the missing values of every column of ``df`` in place.

    Numeric columns are filled with the column mean; every other column is
    filled with its most frequent value (the first mode).

    A non-numeric column that holds no observed value has no mode; it is
    left untouched instead of raising ``IndexError``. Columns without any
    missing value are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient reassignment or chaining.
    """
    for col in df.columns:
        column = df[col]
        if not column.isna().any():
            # Nothing to fill; avoids computing a mean/mode for no reason.
            continue
        if pd.api.types.is_numeric_dtype(column):
            fill_value = column.mean()
        else:
            modes = column.mode()
            if modes.empty:
                # Every entry is missing: there is no value to impute from.
                continue
            fill_value = modes.iloc[0]
        df[col] = column.fillna(fill_value)
    return df

# Run the missing-value replacement on the merged table
merged_df = merged_df.pipe(fill_missing_values)

In [None]:
# Label-encode the remaining categorical columns.
# LabelEncoder comes from sklearn.preprocessing; it has to be imported in the
# notebook's import cell, otherwise this cell stops with a NameError.
categorical_columns = ['TypeOfAddress', 'CountryNL', 'CountryFR', 'MunicipalityNL', 'MunicipalityFR',
                       'StreetNL', 'StreetFR', 'Box', 'ExtraAddressInfo']

# Keep every fitted encoder so the integer codes can be mapped back to the
# original labels (or re-applied to new data) later on.
label_encoders = {}

for col in categorical_columns:
    le = LabelEncoder()
    # Values are cast to str first so that each column has a single type.
    merged_df[col] = le.fit_transform(merged_df[col].astype(str))
    label_encoders[col] = le

In [None]:
# Discard every datetime-typed column before modelling
datetime_cols = merged_df.select_dtypes(include=['datetime']).columns
merged_df = merged_df.drop(labels=datetime_cols, axis='columns')

In [None]:
# Split the table into model inputs and the binary target:
# 1 when Classification equals 'MAIN', 0 for anything else.
features = merged_df.drop('Classification', axis=1)
target = merged_df['Classification'].eq('MAIN').astype('int64')

In [None]:
# Keep a random subset of the rows (10% here); the remainder is discarded
sample_fraction = 0.1
X_sampled, _, y_sampled, _ = train_test_split(
    features,
    target,
    random_state=42,
    test_size=1 - sample_fraction,
)

In [None]:
# Hold out 20% of the sampled rows for testing; the rest is used for training
X_train, X_test, y_train, y_test = train_test_split(
    X_sampled,
    y_sampled,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Train a classification model (CatBoost).
# Some columns are still text at this point (for example EntityNumber,
# HouseNumber and Value are cast to str earlier and never label-encoded).
# CatBoost only accepts non-numeric columns when they are declared as
# categorical, so every remaining non-numeric column is passed as cat_features.
cat_features = X_train.select_dtypes(exclude=['number', 'bool']).columns.tolist()

model = CatBoostClassifier(verbose=0)
model.fit(X_train, y_train, cat_features=cat_features)

In [None]:
# Predictions for the held-out test rows
y_pred = model.predict(X_test)

# Per-class precision / recall / F1 report
print(classification_report(y_test, y_pred))

# Rank the input columns by the importance the model assigns to them,
# most important first
feature_names = features.columns
importances = model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': feature_names, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)
print(feature_importances)

In [None]:
# Persist the trained model to 'model.pkl' in the current working directory
joblib.dump(model, 'model.pkl')  # Save the model