In [21]:
import pandas as pd
data_set = pd.read_csv('employee.csv', encoding='ISO-8859-1')
# travailler sur une copie pour faire des comparaisons
df = data_set.copy()


In [22]:
import numpy as np
# Calcule le nombre de valeurs manquantes selon chaque feature
# fonction old-school, existe certainement une fonction chez pandas...
def compute_missing_values(df):
    list_missing_val = {}
    col = df.columns
    for i, k in enumerate(col):
        list_missing_val[k] = df[k].isna().sum()
    return list_missing_val

In [23]:
def show_box_plots(df, col):
    fig, axs = plt.subplots(2, 5, figsize=(35, 35))
    axs = axs.flatten()
    for i, col in enumerate(col):
        sns.boxplot(y=df[col], data=df, ax=axs[i])
    
    fig.tight_layout(rect=[0, 0.03, 1, 0.98])
    fig.delaxes(axs[-1])
    fig.delaxes(axs[-2])
    plt.show()

In [24]:
#
# Q2 voici la taille du dataframe:
print(df.shape)
# Q2 voici les valeurs manquantes selon les colonnes:
list = compute_missing_values(data_set)
print("missing values")
for i in list:
    print(f"{i}-->{list[i]:d}")

(22552, 12)
missing values
NAME-->6
DEPARTMENT_NAME-->6
TITLE-->6
REGULAR-->644
RETRO-->22150
OTHER-->8423
OVERTIME-->15706
INJURED-->21096
DETAIL-->20493
QUINN_EDUCATION_INCENTIVE-->21166
TOTAL_GROSS-->6
POSTAL-->6


In [25]:

# compte le nombre de tuples vides (toutes les features à nan)
# df[df.isna().all(axis=1)].shape[0]

# suppression des tuples vides (toutes les colonnes à NaN)
df = df.dropna(how='all')
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22546 entries, 0 to 22545
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   NAME                       22546 non-null  object
 1   DEPARTMENT_NAME            22546 non-null  object
 2   TITLE                      22546 non-null  object
 3   REGULAR                    21908 non-null  object
 4   RETRO                      402 non-null    object
 5   OTHER                      14129 non-null  object
 6   OVERTIME                   6846 non-null   object
 7   INJURED                    1456 non-null   object
 8   DETAIL                     2059 non-null   object
 9   QUINN_EDUCATION_INCENTIVE  1386 non-null   object
 10  TOTAL_GROSS                22546 non-null  object
 11  POSTAL                     22546 non-null  object
dtypes: object(12)
memory usage: 2.2+ MB


In [26]:
col = ['REGULAR', 'RETRO', 'OTHER', 'OVERTIME', 'INJURED', 'DETAIL', 
       'QUINN_EDUCATION_INCENTIVE', 'TOTAL_GROSS', 'POSTAL']
# on remplace les , ' ou tout les trucs qui perturbent la convertion en float python.
df[col] = df[col].replace({',': '', "'": ''}, regex=True)
# on applique la converton en float
df[col] = df[col].apply(pd.to_numeric, errors='coerce')

In [27]:
import seaborn as sns
import matplotlib.pyplot as plt
# on afficher les boxplots des features numériques pour observer à quoi cela ressemble
col = ['REGULAR', 'RETRO', 'OTHER', 'OVERTIME', 'INJURED', 'DETAIL', 'QUINN_EDUCATION_INCENTIVE', 'TOTAL_GROSS']

# show_box_plots(df, col)

In [28]:
# Nous séléctionnons les tuples comprenant les 5 valeurs max sur le total_gross
# nous avons 2 tuples que nous supprimons.
top = df.nlargest(n=2, columns=['TOTAL_GROSS']).index
# Supprimer les lignes correspondant aux index obtenus
df = df.drop(top)
#show_box_plots(df, col)

In [29]:
col = ['REGULAR', 'RETRO', 'OTHER', 'OVERTIME', 'INJURED', 'DETAIL', 'QUINN_EDUCATION_INCENTIVE', 'TOTAL_GROSS']
target = df['DEPARTMENT_NAME']
df = df[col]

In [None]:
target.infos()

In [41]:
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Diviser les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(df, target, test_size=0.1, random_state=42)

# Imputation des valeurs manquantes
imputers = {
    "Zero": SimpleImputer(strategy="constant", fill_value=0),
    "Median": SimpleImputer(strategy="median"),
    "KNN": KNNImputer(),
    "Iterative": IterativeImputer()
}

imputed_data_train = {}
imputed_data_test = {}

for imputer_name, imputer in imputers.items():
    # Imputer sur les données d'entraînement
    X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
    imputed_data_train[imputer_name] = X_train_imputed
    
    # Imputer sur les données de test
    X_test_imputed = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)
    imputed_data_test[imputer_name] = X_test_imputed

classifiers = {
    "KNN         ": KNeighborsClassifier(),
    "DecisionTree": DecisionTreeClassifier(),
    "RandomForest": RandomForestClassifier()
}

for imputer_name in imputers.keys():
    print(f"Imputer: {imputer_name}")
    
    for clf_name, clf in classifiers.items():
        clf.fit(imputed_data_train[imputer_name], y_train)
        
        y_pred = clf.predict(imputed_data_test[imputer_name])
        
        f1_micro = f1_score(y_test, y_pred, average='micro')
        f1_macro = f1_score(y_test, y_pred, average='macro')
        accu = accuracy_score(y_test, y_pred)
        
        print(f"Classifier: {clf_name}, F1 (micro): {f1_micro:.3f}, F1 (macro): {f1_macro:.3f}, accuracy: {accu:.3f}")




dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)
y_pred_dummy = dummy_clf.predict(X_test)

accu_dummy = accuracy_score(y_test, y_pred_dummy)
f1_dummy_mi = f1_score(y_test, y_pred_dummy, average='micro')
f1_dummy_ma = f1_score(y_test, y_pred_dummy, average='macro')
print(f"Dummy time -> f1 micro: {f1_dummy_mi:.3f}, f1 macro: {f1_dummy_ma:.3f}, accuracy: {accu_dummy:.3f}")


Imputer: Zero
Classifier: KNN         , F1 (micro): 0.231, F1 (macro): 0.050, accuracy: 0.231
Classifier: DecisionTree, F1 (micro): 0.258, F1 (macro): 0.062, accuracy: 0.258
Classifier: RandomForest, F1 (micro): 0.278, F1 (macro): 0.073, accuracy: 0.278
Imputer: Median
Classifier: KNN         , F1 (micro): 0.234, F1 (macro): 0.051, accuracy: 0.234
Classifier: DecisionTree, F1 (micro): 0.263, F1 (macro): 0.066, accuracy: 0.263
Classifier: RandomForest, F1 (micro): 0.279, F1 (macro): 0.074, accuracy: 0.279
Imputer: KNN
Classifier: KNN         , F1 (micro): 0.228, F1 (macro): 0.051, accuracy: 0.228
Classifier: DecisionTree, F1 (micro): 0.241, F1 (macro): 0.060, accuracy: 0.241
Classifier: RandomForest, F1 (micro): 0.267, F1 (macro): 0.064, accuracy: 0.267
Imputer: Iterative
Classifier: KNN         , F1 (micro): 0.238, F1 (macro): 0.051, accuracy: 0.238
Classifier: DecisionTree, F1 (micro): 0.255, F1 (macro): 0.072, accuracy: 0.255
Classifier: RandomForest, F1 (micro): 0.270, F1 (macro): 0

Median + RandomForest = meilleure micro-moyenne
accuracy est similaire à la micro moyenne
    voir -> https://stackoverflow.com/questions/62792001/precision-and-recall-are-the-same-within-a-model/62792607#comment126263935_62792607

# Rapport

## Q6
Selon doc scikit learn -> incompatible avec les classificateurs
Selon GTP -> oui 




## Q7:

1. 
2. 
3. 
4. 
5. 