# Cours PCD – Labo 2 : Détection de données aberrantes (outliers)

Dans le cadre de ce laboratoire il nous a été demandé de trouver les données atypiques (ou aberrantes) dans plusieurs jeux de données en appliquant la méthode LOF.  Ces jeux de données sont disponibles en ligne et comportent une annotation des données atypiques.  Cette annotation va nous permettre d’évaluer la méthode LOF en calculant son score F1, avec divers paramètres, sur chaque jeu.

In [308]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from itertools import product
from os import path
from scipy.io import arff
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import LocalOutlierFactor

## Chemins des données

In [309]:
data_folder = "data/literature/"

In [310]:
data_files = {
    "aloi": "ALOI/ALOI_withoutdupl_norm.arff",
    "glass": "Glass/Glass_withoutdupl_norm.arff",
    "ionosphere": "Ionosphere/Ionosphere_withoutdupl_norm.arff"
}

## Fonctions

In [311]:
def arff_to_dataframe(data_path):
    return pd.DataFrame(arff.loadarff(data_path)[0])

In [312]:
def outlier_report(df):
    df_outlier_counts = df["outlier"].value_counts()
    n_inliers  = df_outlier_counts[b"no"]
    n_outliers = df_outlier_counts[b"yes"]
    
    print(f"Nombre de valeurs : {len(df)}")
    print(f"Nombre d'inliers  : {n_inliers}")
    print(f"Nombre d'outliers : {n_outliers}")
    
    return n_outliers / len(df)

In [313]:
def lof(df, n_neighbors, contamination):
    clf = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=contamination)
    return clf.fit_predict(df)

In [314]:
def lof_metrics(df, col_gt="outlier", cols_ignore=["id"], n_neighbors=20, contamination=0.1, verbose=False):
    ground_truth = df[col_gt].map(lambda x: 1 if x == b"yes" else -1)
    
    cols_ignore.append(col_gt)
    pred = lof(df.drop(columns=cols_ignore), n_neighbors=n_neighbors, contamination=contamination)
    
    if verbose:
        print(f"Real outliers (Ground truth):\n{ground_truth}")
        print(f"Found outliers (Prediction):\n{pred}")
        
    return {
        "recall": recall_score(ground_truth, pred), 
        "precision": precision_score(ground_truth, pred),
        "f1": f1_score(ground_truth, pred)
    }

In [315]:
def lof_report(dataset, neighbors_counts, contaminations, verbosity=0):
    results = pd.DataFrame(columns=["name", "n_neighbors", "contamination", "recall", "precision", "f1"])
    for parameters in product(neighbors_counts, contamination_factors + [dataset["outlier_proportion"]]):
        n_neighbors = parameters[0]
        contamination = parameters[1]

        if verbosity > 0:
            print(f"""{dataset["name"]} : ({n_neighbors}, {contamination}) """)
            
        metrics = lof_metrics(dataset["dataset"], n_neighbors=n_neighbors, contamination=contamination, verbose=(verbosity>1))
        result = pd.DataFrame({
            "name": [dataset["name"],],
            "n_neighbors": [n_neighbors,],
            "contamination": [contamination,],
            "recall": [metrics["recall"],],
            "precision": [metrics["precision"],],
            "f1" : [metrics["f1"],]
        })
        results = pd.concat([results, result])
    return results.reset_index(drop=True)

## 1. Détermination du plus petit jeu de données

À première vue on constate que jeu de données "Glass" est le plus petit en termes de taille de fichier :

```shell
$ du -h data/literature/
20K	data/literature/Glass
19M	data/literature/ALOI
88K	data/literature/Ionosphere
19M	data/literature/
```

Cela est confirmé par le nombre de lignes dans chaque DataFrame chargé :

In [316]:
for (data_name, data_file) in data_files.items():
    print(f"{data_name:12}: {len(arff_to_dataframe(path.join(data_folder, data_file)))}")

aloi        : 49534
glass       : 214
ionosphere  : 351


## 2. Nature des données

### ALOI
ALOI est un ensemble de données qui représente une collection d'image. Ces données sont représentés par 50'000 instances dont 466 doublons (qui ont été enlevés ici) et comprennent 27 attributs numériques (histogrames HSB). 

In [317]:
aloi = arff_to_dataframe(path.join(data_folder, data_files["aloi"]))
aloi_outlier_proportion = outlier_report(aloi)
aloi

Nombre de valeurs : 49534
Nombre d'inliers  : 48026
Nombre d'outliers : 1508


Unnamed: 0,att1,att2,att3,att4,att5,att6,att7,att8,att9,att10,...,att20,att21,att22,att23,att24,att25,att26,att27,outlier,id
0,0.784999,0.000018,0.000000,0.000093,0.000000,0.000000,0.0,0.000000,0.000000,0.139811,...,0.001189,0.0,0.115728,0.023449,0.000200,0.000220,0.034952,0.046914,b'yes',1.0
1,0.958088,0.000000,0.000000,0.001671,0.000000,0.000000,0.0,0.000000,0.000000,0.019556,...,0.000000,0.0,0.022035,0.007516,0.000000,0.000000,0.001033,0.000000,b'yes',2.0
2,0.938768,0.000000,0.000000,0.005146,0.000000,0.000000,0.0,0.000000,0.000000,0.018451,...,0.000000,0.0,0.035542,0.011982,0.000000,0.001595,0.019520,0.000000,b'yes',3.0
3,0.954775,0.000000,0.000000,0.001427,0.000000,0.000000,0.0,0.000000,0.000000,0.024944,...,0.000000,0.0,0.019941,0.000805,0.000000,0.000000,0.000035,0.000000,b'yes',4.0
4,0.933601,0.000000,0.000000,0.001682,0.000000,0.000000,0.0,0.000000,0.000000,0.037002,...,0.000000,0.0,0.046759,0.002663,0.000000,0.000339,0.001359,0.000000,b'yes',5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49529,0.945860,0.001296,0.000000,0.001334,0.001894,0.000021,0.0,0.000000,0.000000,0.002072,...,0.000000,0.0,0.000195,0.000805,0.000040,0.000174,0.024223,0.013462,b'no',49996.0
49530,0.956965,0.001921,0.000000,0.001856,0.002696,0.000064,0.0,0.000000,0.000000,0.002280,...,0.000000,0.0,0.000174,0.000387,0.000000,0.000193,0.024315,0.005990,b'no',49997.0
49531,0.966346,0.002729,0.000000,0.002170,0.002857,0.000107,0.0,0.000083,0.000000,0.001382,...,0.000000,0.0,0.000246,0.000709,0.000000,0.000211,0.018509,0.006266,b'no',49998.0
49532,0.968409,0.003372,0.000026,0.000731,0.002616,0.000384,0.0,0.000083,0.000078,0.000986,...,0.000000,0.0,0.000482,0.000752,0.000521,0.000266,0.017941,0.007500,b'no',49999.0


Cela correspond aux valeurs du site web: 48026 inliers et 1508 outliers

### Glass
Glass est un ensemble de données médico-légales décrivant les types de verre. Les auteurs utilisent la classe 6 (minoritaire) comme outlier et toutes les autres classes comme inliers. Malheureusement, ils ne détaillent pas les attributs qui ont été utilisés. L'ensemble de données d'origine se compose de 9 attributs, ici seuls 7 attributs sont présents. L'ensemble de données contient 214 instances, 9 valeurs aberrantes (4.21 %) et 205 inliers (95.79 %). Cet ensemble de données ne contient qu'un seul doublon, donc une version sans doublon n'a pas été jugée nécéssaire.

In [318]:
glass = arff_to_dataframe(path.join(data_folder, data_files["glass"]))
glass_outlier_proportion = outlier_report(glass)
glass

Nombre de valeurs : 214
Nombre d'inliers  : 205
Nombre d'outliers : 9


Unnamed: 0,var_0000,var_0001,var_0002,var_0003,var_0004,var_0005,var_0006,id,outlier
0,0.467651,0.321584,0.768880,0.246630,0.838799,0.099737,0.298340,1.0,b'no'
1,0.496412,0.220491,0.776032,0.316598,0.919973,0.089145,0.279479,2.0,b'no'
2,0.519133,0.404464,0.768012,0.334978,0.801622,0.092369,0.271238,3.0,b'no'
3,0.199650,0.547373,0.374284,0.362223,0.817017,0.000000,0.177913,4.0,b'yes'
4,0.847261,0.286361,0.000000,0.217792,0.000000,0.019135,1.000000,5.0,b'no'
...,...,...,...,...,...,...,...,...,...
209,0.495555,0.438735,0.749682,0.277775,0.791251,0.092286,0.260617,210.0,b'no'
210,0.515273,0.346053,0.777602,0.548135,0.863624,0.108131,0.230353,211.0,b'no'
211,0.459787,0.517479,0.773646,0.285545,0.761141,0.057688,0.223982,212.0,b'no'
212,0.442959,0.297568,0.631711,0.421653,0.884864,0.113589,0.276465,213.0,b'no'


Cela correspond aux valeurs du site web: 205 inliers et 9 outliers

### Ionosphere

Cet ensemble de données différencie les bons radars qui montrent des preuves d'une sorte de structure dans l'ionosphère, et les mauvais radars pour lesquels les signaux passent outre le radar. Dans cette version (HiCS, [1]), les auteurs utilisent la classe b (minoritaire) comme outliers et la classe g comme inliers. Ils ont supprimé les attributs 1 et 2 de leur jeu de données. Par conséquent, après le prétraitement, cet ensemble de données a 32 attributs numériques et 351 instances, 126 outliers (35.9 %) et 225 inliers (64.1 %). Cet ensemble de données ne contient qu'un seul doublon, donc une version sans doublon n'a pas été jugée nécéssaire.

In [319]:
ionosphere = arff_to_dataframe(path.join(data_folder, data_files["ionosphere"]))
ionosphere_outlier_proportion = outlier_report(ionosphere)
ionosphere

Nombre de valeurs : 351
Nombre d'inliers  : 225
Nombre d'outliers : 126


Unnamed: 0,var_0000,var_0001,var_0002,var_0003,var_0004,var_0005,var_0006,var_0007,var_0008,var_0009,...,var_0024,var_0025,var_0026,var_0027,var_0028,var_0029,var_0030,var_0031,id,outlier
0,0.997695,0.470555,0.926215,0.511530,0.916990,0.311460,1.000000,0.518800,0.926215,0.411225,...,0.705390,0.269160,0.606330,0.329550,0.711335,0.227565,0.593205,0.273500,1.0,b'no'
1,1.000000,0.405855,0.965175,0.319220,0.445660,0.032015,1.000000,0.477255,0.754370,0.161285,...,0.397660,0.407995,0.404800,0.442035,0.416870,0.468560,0.431310,0.487765,2.0,b'yes'
2,1.000000,0.483175,1.000000,0.502425,1.000000,0.439690,0.944825,0.505990,0.865410,0.526730,...,0.794920,0.389275,0.715500,0.413175,0.802180,0.379100,0.780225,0.308810,3.0,b'no'
3,1.000000,0.274195,1.000000,1.000000,0.856080,0.000000,0.500000,0.500000,0.500000,0.500000,...,0.758065,1.000000,1.000000,0.399505,0.628410,1.000000,0.338090,1.000000,4.0,b'yes'
4,1.000000,0.487995,0.970700,0.532655,0.960530,0.383725,0.885760,0.418005,0.763990,0.398625,...,0.566450,0.233970,0.512155,0.189015,0.471465,0.202135,0.476960,0.171515,5.0,b'no'
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
346,0.917540,0.541490,0.868695,0.426470,0.921745,0.472165,0.952205,0.476890,0.946955,0.565650,...,0.917395,0.500615,1.000000,0.564075,0.933300,0.446430,0.952730,0.478465,347.0,b'no'
347,0.975565,0.502095,0.975915,0.486385,0.967190,0.490400,0.972950,0.508030,0.982550,0.516405,...,0.967610,0.524625,0.965795,0.540840,0.970330,0.499825,0.957415,0.523560,348.0,b'no'
348,0.973505,0.499830,0.966035,0.483865,0.975885,0.482845,0.977920,0.512230,0.970620,0.508830,...,0.962445,0.512710,0.960600,0.511210,0.962295,0.502210,0.963485,0.497115,349.0,b'no'
349,0.953040,0.491715,0.990610,0.490055,0.978455,0.481770,0.928730,0.500550,0.948620,0.483425,...,0.945735,0.461200,0.914915,0.413810,0.980110,0.481215,0.937015,0.418785,350.0,b'no'


Cela correspond aux valeurs du site web: 225 inliers et 126 outliers

In [320]:
datasets = [
    {"dataset": aloi, "name": "aloi", "outlier_proportion": aloi_outlier_proportion},
    {"dataset": glass, "name": "glass", "outlier_proportion": glass_outlier_proportion}, 
    {"dataset": ionosphere, "name": "ionosphere", "outlier_proportion": ionosphere_outlier_proportion}, 
]

## 3. Détection avec Local Outlier Factor (LOF)

In [321]:
neighbors_counts = [5, 10, 20, 40, 80]                # Valeurs de voisins à tester
contaminations = [0.00001, 0.01, 0.05, 0.1, 0.2, 0.4] # Valeurs de contamination à tester (on teste également la valeur réelle en dernier)

In [328]:
for dataset in datasets:
    results = lof_report(dataset, neighbors_counts, contaminations)
    print(f"""==== Meilleur F1-score ====\n{results.loc[results["f1"].idxmax()]}""")

==== Meilleur F1-score ====
name                 aloi
n_neighbors             5
contamination     0.00001
recall           0.999337
precision        0.030424
f1               0.059051
Name: 0, dtype: object
==== Meilleur F1-score ====
name                glass
n_neighbors            20
contamination        0.01
recall                1.0
precision        0.042654
f1               0.081818
Name: 15, dtype: object
==== Meilleur F1-score ====
name             ionosphere
n_neighbors               5
contamination       0.00001
recall             0.992063
precision          0.357143
f1                  0.52521
Name: 0, dtype: object
