In [1]:

!pip install timedelta



In [3]:
# ==============================================================================
# NOTEBOOK 1 FINAL (Version 2) : GÉNÉRATION D'UN DATASET COMPLET
# ==============================================================================
import os
from datetime import date, datetime, timedelta
from urllib.parse import quote

import pandas as pd
import pymysql
from sqlalchemy import create_engine


# --- 1. CONNECTION ---
# Connection settings are read from environment variables so that real credentials do not
# have to be committed with the notebook. The fallback values are the original local
# development settings, so the notebook behaves as before when no variable is set.
db_user = os.environ.get('DB_USER', 'root')
db_password = os.environ.get('DB_PASSWORD', 'root')
db_host = os.environ.get('DB_HOST', 'localhost')
db_port = os.environ.get('DB_PORT', '3306')
db_name = os.environ.get('DB_NAME', 'sicda_easytime')

# User name and password are percent-encoded because characters such as '@', ':' or '/'
# would otherwise be parsed as URL delimiters.
engine = create_engine(
    f"mysql+pymysql://{quote(db_user, safe='')}:{quote(db_password, safe='')}"
    f"@{db_host}:{db_port}/{db_name}"
)
print("Connexion BDD prête.")

# --- 2. LOADING ---
# Each query aliases the database columns to the lower-case names used by the rest of the
# notebook. Absences and leaves are restricted to rows whose STATUT is validated.
print("\n--- Chargement des données brutes ---")
absences_sql = "SELECT USER_FK as employe_id, DATE_DEBUT as date_debut, DATE_REPRISE as date_fin FROM absence WHERE STATUT = 'workflow_status_validated'"
conges_sql = "SELECT USER_FK as employe_id, DATE_DEBUT as date_debut, DATE_REPRISE as date_fin FROM conge WHERE STATUT = 'workflow_status_validated'"
employes_sql = "SELECT ID as employe_id, DATE_EMB as date_embauche, PROFIL_METIER_FK as profil_metier_id, NOEUD_FK as noeud_id FROM utilisateur"
feries_sql = "SELECT DATE_DEBUT as date FROM jr_ferie"

df_absences = pd.read_sql(absences_sql, engine)
df_conges = pd.read_sql(conges_sql, engine)
df_employes = pd.read_sql(employes_sql, engine)
df_feries = pd.read_sql(feries_sql, engine)
print("Données chargées.")

# --- 3. PREPARATION ---
print("\n--- Préparation et nettoyage ---")
# Convert every column whose name contains 'date' to plain `datetime.date` values.
# With errors='coerce', values that cannot be parsed become missing (NaT) instead of raising.
for df in [df_absences, df_conges, df_feries, df_employes]:
    for col in df.columns:
        if 'date' in col:
            df[col] = pd.to_datetime(df[col], errors='coerce').dt.date

# Set of public-holiday dates, for constant-time membership tests.
feries_set = set(df_feries['date'].dropna())

# employe_id -> list of (date_debut, date_fin) tuples, merging absences and leaves.
# Rows with a missing start or end date are dropped here: an interval with a missing bound
# cannot be compared against a calendar day, so it must not reach the lookup table.
all_absences = pd.concat([df_absences, df_conges])
valid_absences = all_absences.dropna(subset=['date_debut', 'date_fin'])
absences_dict = {}
for employe_id, debut, fin in zip(
    valid_absences['employe_id'], valid_absences['date_debut'], valid_absences['date_fin']
):
    absences_dict.setdefault(employe_id, []).append((debut, fin))
print("Données préparées.")

# --- 4. DATASET GENERATION ---
# Produces one row per employee and per calendar day on or after the employee's hire date,
# carrying calendar features, holiday flags and the `est_absent` target.
print("\n--- Génération du dataset jour par jour ---")
start_date = date(2016, 9, 12)
# NOTE: the upper bound is the current day, so the generated dataset differs from run to run.
end_date = date.today()
date_range = pd.date_range(start_date, end_date)
dataset_rows = []
print(f"Génération des données entre {start_date} et {end_date}...")

# Calendar and holiday features depend only on the day, not on the employee, so they are
# computed once here instead of once per (employee, day) pair.
one_day = timedelta(days=1)
calendar_days = []
for single_date_dt in date_range:
    single_date = single_date_dt.date()
    calendar_days.append((
        single_date,
        single_date.weekday(),
        single_date.day,
        single_date.month,
        single_date.isocalendar()[1],
        1 if single_date in feries_set else 0,                # est_ferie
        1 if (single_date + one_day) in feries_set else 0,    # veille_ferie: next day is a holiday
        1 if (single_date - one_day) in feries_set else 0,    # lendemain_ferie: previous day was a holiday
    ))

for _, employe in df_employes.iterrows():
    date_embauche = employe['date_embauche']
    if pd.isnull(date_embauche):
        continue

    employe_id = employe['employe_id']
    # Keep only intervals with both bounds present. A truthiness test (`debut and fin`) is
    # not a reliable filter for pandas missing values (NaN/NaT are not falsy), and ordering
    # comparisons between NaT and `datetime.date` may raise in recent pandas versions.
    intervals = [
        (debut, fin)
        for debut, fin in absences_dict.get(employe_id, [])
        if pd.notna(debut) and pd.notna(fin)
    ]

    for (single_date, jour_semaine, jour_mois, mois, semaine_annee,
         est_ferie, veille_ferie, lendemain_ferie) in calendar_days:
        if single_date < date_embauche:
            continue

        # NOTE(review): date_fin is loaded from the DATE_REPRISE column; if that column holds
        # the return-to-work day, this inclusive upper bound counts it as absent — confirm.
        est_absent = int(any(debut <= single_date <= fin for debut, fin in intervals))

        dataset_rows.append({
            'employe_id': employe_id, 'date': single_date,
            'jour_semaine': jour_semaine, 'jour_mois': jour_mois, 'mois': mois,
            'semaine_annee': semaine_annee, 'profil_metier_id': employe['profil_metier_id'],
            'noeud_id': employe['noeud_id'], 'est_ferie': est_ferie,
            'veille_ferie': veille_ferie,
            'lendemain_ferie': lendemain_ferie,
            'est_absent': est_absent
        })

print(f"Dataset brut généré avec {len(dataset_rows)} lignes.")

# --- 5. FINALISATION ---
print("\n--- Création du DataFrame final ---")
final_dataset = pd.DataFrame(dataset_rows)
# Keep Monday-Friday only (weekday() is 0-4) and replace remaining missing values with 0.
# `fillna` is chained rather than applied in place: an in-place fill on a filtered frame can
# trigger SettingWithCopyWarning, whereas the chained form always yields a fresh frame.
final_dataset = final_dataset[final_dataset['jour_semaine'] < 5].fillna(0)

output_path = 'data/dataset_absences.csv'
# `to_csv` does not create missing parent directories, so make sure the folder exists first.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
final_dataset.to_csv(output_path, index=False)
print(f"Dataset final sauvegardé dans '{output_path}'.")

# Class balance of the target, as proportions.
print("\nAnalyse de la Cible:")
print(final_dataset['est_absent'].value_counts(normalize=True))

Connexion BDD prête.

--- Chargement des données brutes ---
Données chargées.

--- Préparation et nettoyage ---
Données préparées.

--- Génération du dataset jour par jour ---
Génération des données entre 2016-09-12 et 2025-08-07...
Dataset brut généré avec 24691 lignes.

--- Création du DataFrame final ---
Dataset final sauvegardé dans 'data/dataset_absences.csv'.

Analyse de la Cible:
est_absent
0    0.999263
1    0.000737
Name: proportion, dtype: float64
