In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

# Locations of the two input files, resolved relative to the current directory.
ot_odr_filename = os.path.join(".", "OT_ODR.csv.bz2")
equipements_filename = os.path.join(".", 'EQUIPEMENTS.csv')

# Both files use ";" as field separator; the OT/ODR file is bz2-compressed.
ot_odr_df = pd.read_csv(ot_odr_filename, sep=";", compression="bz2")
equipements_df = pd.read_csv(equipements_filename, sep=";")

In [2]:
# Names of the ot_odr_df columns to store with pandas' "category" dtype.
var_cat = [
    'ODR_LIBELLE',
    'TYPE_TRAVAIL',
    'SYSTEM_N1',
    'SYSTEM_N2',
    'SYSTEM_N3',
    'SIG_ORGANE',
    'SIG_CONTEXTE',
    'SIG_OBS',
    'LIGNE'
]

# Column dtypes and memory footprint before the conversion.
ot_odr_df.info()

# Cast every listed column in a single call; columns that are not listed
# keep their current dtype.
ot_odr_df = ot_odr_df.astype({column: 'category' for column in var_cat})

# Same report after the conversion, for comparison.
ot_odr_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506558 entries, 0 to 506557
Data columns (total 15 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   OT_ID          506558 non-null  object 
 1   ODR_ID         506558 non-null  object 
 2   ODR_LIBELLE    506558 non-null  object 
 3   TYPE_TRAVAIL   506558 non-null  object 
 4   DUREE_TRAVAIL  506558 non-null  float64
 5   SYSTEM_N1      506558 non-null  object 
 6   SYSTEM_N2      506558 non-null  object 
 7   SYSTEM_N3      506558 non-null  object 
 8   EQU_ID         506558 non-null  object 
 9   DATE_OT        506558 non-null  object 
 10  KILOMETRAGE    506557 non-null  float64
 11  SIG_ORGANE     506558 non-null  object 
 12  SIG_CONTEXTE   506558 non-null  object 
 13  SIG_OBS        506558 non-null  object 
 14  LIGNE          506558 non-null  object 
dtypes: float64(2), object(13)
memory usage: 58.0+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506558 entries, 0 to

In [None]:
# Split the data between a training set and a test set.
TEST_FRACTION = 0.2  # share of the rows held out for testing
SPLIT_SEED = 42      # fixed seed so the split is the same on every run
train_df, test_df = train_test_split(
    ot_odr_df,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [3]:
import pyAgrum as gum
import pyAgrum.lib.notebook as gnb

In [4]:
# Columns of ot_odr_df that become the variables of the Bayesian network.
var_to_model = [
    'SYSTEM_N2',
    'SIG_ORGANE',
    'SIG_OBS'
]

# Build one labelized variable per column.
# The domain size and the labels are both taken from the same category index
# (the full data frame's), so they cannot disagree. Previously the size came
# from train_df and the labels from ot_odr_df, which only matched as long as
# both frames carried the exact same categorical dtype.
var_bn = {}
for var in var_to_model:
    modalites = ot_odr_df[var].cat.categories
    nb_values = len(modalites)
    var_bn[var] = gum.LabelizedVariable(var, var, nb_values)
    # Replace the default labels with the categories found in the data.
    for i, modalite in enumerate(modalites):
        var_bn[var].changeLabel(i, modalite)

# Create the Bayesian network and add the selected variables to it.
bn_etud = gum.BayesNet("modèle simple")

for var in var_bn.values():
    bn_etud.add(var)

# Structure: SYSTEM_N2 is the parent of both SIG_OBS and SIG_ORGANE.
bn_etud.addArc("SYSTEM_N2", "SIG_OBS")
bn_etud.addArc("SYSTEM_N2", "SIG_ORGANE")

# Last expression of the cell: display the network.
bn_etud