In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [44]:
# Load the raw dataset. The file is semicolon-delimited, so the separator
# has to be given explicitly.
df = pd.read_csv(
    'data/base_data.csv',
    sep=';',
)

In [45]:
# Original name of the column that will be used as the prediction label.
target = "Reason for absence"

In [46]:
# Inspect the column names available in the loaded frame.
df.columns

Index(['ID', 'Reason for absence', 'Month of absence', 'Day of the week',
       'Seasons', 'Transportation expense', 'Distance from Residence to Work',
       'Service time', 'Age', 'Work load Average/day ', 'Hit target',
       'Disciplinary failure', 'Education', 'Son', 'Social drinker',
       'Social smoker', 'Pet', 'Weight', 'Height', 'Body mass index',
       'Absenteeism time in hours'],
      dtype='object')

In [47]:
# Give the label column a short, fixed name so the following cells can refer
# to it simply as 'target'.
df = df.rename({target: 'target'}, axis='columns')

In [48]:
# Count missing values per column: every count is zero, so the data is clean.
df.isna().sum()

ID                                 0
target                             0
Month of absence                   0
Day of the week                    0
Seasons                            0
Transportation expense             0
Distance from Residence to Work    0
Service time                       0
Age                                0
Work load Average/day              0
Hit target                         0
Disciplinary failure               0
Education                          0
Son                                0
Social drinker                     0
Social smoker                      0
Pet                                0
Weight                             0
Height                             0
Body mass index                    0
Absenteeism time in hours          0
dtype: int64

In [49]:
# Preview the first rows to check that the file was parsed as expected.
df.head()

Unnamed: 0,ID,target,Month of absence,Day of the week,Seasons,Transportation expense,Distance from Residence to Work,Service time,Age,Work load Average/day,...,Disciplinary failure,Education,Son,Social drinker,Social smoker,Pet,Weight,Height,Body mass index,Absenteeism time in hours
0,11,26,7,3,1,289,36,13,33,239.554,...,0,1,2,1,0,1,90,172,30,4
1,36,0,7,3,1,118,13,18,50,239.554,...,1,1,1,1,0,0,98,178,31,0
2,3,23,7,4,1,179,51,18,38,239.554,...,0,1,0,1,0,0,89,170,31,2
3,7,7,7,5,1,279,5,14,39,239.554,...,0,1,2,1,1,0,68,168,24,4
4,11,23,7,5,1,289,36,13,33,239.554,...,0,1,2,1,0,1,90,172,30,2


In [50]:
# Distinct reason codes present in the data, shown in ascending order.
reason_codes = df['target'].unique()
np.sort(reason_codes)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 21, 22, 23, 24, 25, 26, 27, 28])

In [51]:
# ICD mapping: group the raw absence-reason codes (0-28) into broader families.
# NOTE: despite the "_5" suffix (kept so existing references keep working), the
# mapping produces 6 groups plus an "Unknown" bucket for code 0, i.e. 7 classes.
icd_mapping_5 = {
    0: "Unknown",

    # Group 1: Infectious, Neoplastic, and Immune Diseases
    1: "Infectious, Neoplastic, and Immune Diseases",
    2: "Infectious, Neoplastic, and Immune Diseases",
    3: "Infectious, Neoplastic, and Immune Diseases",

    # Group 2: Chronic and Metabolic Conditions
    4: "Chronic and Metabolic Conditions",
    9: "Chronic and Metabolic Conditions",
    10: "Chronic and Metabolic Conditions",
    11: "Chronic and Metabolic Conditions",

    # Group 3: Neurological, Psychiatric, and Sensory Disorders
    5: "Neurological, Psychiatric, and Sensory Disorders",
    6: "Neurological, Psychiatric, and Sensory Disorders",
    7: "Neurological, Psychiatric, and Sensory Disorders",
    8: "Neurological, Psychiatric, and Sensory Disorders",

    # Group 4: Musculoskeletal, Dermatological, and Genitourinary Conditions
    12: "Musculoskeletal, Dermatological, and Genitourinary Conditions",
    13: "Musculoskeletal, Dermatological, and Genitourinary Conditions",
    14: "Musculoskeletal, Dermatological, and Genitourinary Conditions",
    15: "Musculoskeletal, Dermatological, and Genitourinary Conditions",

    # Group 5: Injuries, External Causes, Pregnancy, and Other Conditions
    16: "Injuries, External Causes, Pregnancy, and Other Conditions",
    17: "Injuries, External Causes, Pregnancy, and Other Conditions",
    18: "Injuries, External Causes, Pregnancy, and Other Conditions",
    19: "Injuries, External Causes, Pregnancy, and Other Conditions",
    20: "Injuries, External Causes, Pregnancy, and Other Conditions",
    21: "Injuries, External Causes, Pregnancy, and Other Conditions",

    # Group 6: Non-Disease Absences (Administrative & Follow-up)
    22: "Non-Disease Absences",
    23: "Non-Disease Absences",
    24: "Non-Disease Absences",
    25: "Non-Disease Absences",
    26: "Non-Disease Absences",
    27: "Non-Disease Absences",
    28: "Non-Disease Absences"
}

# Apply the mapping to create the grouped category column.
# The dtype guard makes this cell safe to re-run: once the column holds group
# names, mapping it again would look those names up as keys in the dict and
# turn every value into NaN.
if pd.api.types.is_numeric_dtype(df["target"]):
    grouped_target = df["target"].map(icd_mapping_5)
    # Series.map returns NaN for values that are not keys of the dict; fail
    # loudly here instead of passing NaN labels on to the split and the model.
    unmapped_codes = sorted(df.loc[grouped_target.isna(), "target"].unique())
    if unmapped_codes:
        raise ValueError(
            f"Reason codes without a group in icd_mapping_5: {unmapped_codes}"
        )
    df["target"] = grouped_target

In [52]:
# Separate the features (every column except the label) from the label itself.
X = df.drop('target', axis='columns')
y = df['target']

In [53]:
# Class distribution of the grouped label; it is strongly imbalanced, with
# "Non-Disease Absences" accounting for more than half of the rows.
y.value_counts()

target
Non-Disease Absences                                             435
Musculoskeletal, Dermatological, and Genitourinary Conditions     84
Injuries, External Causes, Pregnancy, and Other Conditions        71
Chronic and Metabolic Conditions                                  57
Unknown                                                           43
Neurological, Psychiatric, and Sensory Disorders                  32
Infectious, Neoplastic, and Immune Diseases                       18
Name: count, dtype: int64

In [54]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions similar in both splits, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=32,
    stratify=y,
)

In [55]:
# Re-attach the label to each split so that every exported table carries the
# features and the target side by side.
df_train_user = pd.concat((X_train, y_train), axis='columns')
df_test_user = pd.concat((X_test, y_test), axis='columns')

In [56]:
# Export both splits. Despite the "X_" prefix in the file names, each file
# holds the features together with the target column, and the row index is
# written as well (to_csv default).
for split_frame, split_path in (
    (df_train_user, 'data/X_train.csv'),
    (df_test_user, 'data/X_test.csv'),
):
    split_frame.to_csv(split_path)

In [57]:
# Baseline: a random forest with default hyper-parameters, trained on every
# feature column as-is.
# NOTE(review): X still includes the 'ID' column; confirm that it is meant to
# be used as a predictor.
# A fixed random_state makes the fit, and therefore the reported accuracy,
# identical on every Restart & Run All; without it the score changes per run.
model = RandomForestClassifier(random_state=32)
model.fit(X_train, y_train)

# Score on the held-out split. For context, always predicting the most
# frequent class ("Non-Disease Absences", 435 of 740 rows) would already reach
# an accuracy of about 0.59.
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Accuracy on test set : ", acc)

Accuracy on test set :  0.6418918918918919
