In [16]:
# datasets
import pandas as pd
import numpy as np
import sklearn.metrics as metrics
from random import randint
from sklearn.calibration import LabelEncoder
from common import get_full_data

def encode_categorical_columns(df):
    """Return a copy of ``df`` whose object-dtype columns are label-encoded.

    Every column whose dtype compares equal to ``'object'`` is replaced by the
    integer codes produced by ``LabelEncoder.fit_transform``; all other
    columns are passed through unchanged.

    The input frame is not modified: the encoding is applied to a copy, so a
    frame that is still needed in its raw form (or that is a slice of another
    frame) is not silently overwritten.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and index as ``df``.
    """
    encoded = df.copy()
    # Loop over all columns in the DataFrame
    for col in encoded.columns:
        # Only object (string-like) columns need the numeric transformation
        if encoded[col].dtype == 'object':
            # A fresh encoder per column: each column has its own vocabulary,
            # so no fitted state is meant to be shared between columns.
            encoded[col] = LabelEncoder().fit_transform(encoded[col])
    return encoded

def heart_failure():
    """Load the heart-failure clinical records dataset.

    The last CSV column is taken as the target and every other column as a
    feature. Class names are collected from the raw target before the data is
    handed to ``get_full_data`` (project-local helper from ``common``).

    Returns
    -------
    tuple
        ``(X, y, class_names)`` where ``X, y`` are whatever ``get_full_data``
        returns and ``class_names`` is a list of the unique target values as
        strings, in order of first appearance.
    """
    table = pd.read_csv("dataset/heart_failure_clinical_records_dataset.csv")
    features, target = table.iloc[:, :-1], table.iloc[:, -1]
    # class names come from the raw target, in order of first appearance
    class_names = list(map(str, target.unique()))
    features, target = get_full_data(features, target)
    return features, target, class_names

def disease():
    """Load the disease/symptom dataset as a multi-hot symptom matrix.

    The first CSV column holds the disease name (the target); every other
    column holds one symptom string for the row, or is empty. Symptom strings
    are stripped of surrounding whitespace before use.

    Returns
    -------
    symptoms_embedding : pandas.DataFrame
        One row per CSV row and one float column per distinct symptom
        (columns sorted alphabetically so the layout is the same on every
        run); a cell is 1 when the row lists that symptom, else 0.
    y : pandas.Series
        Integer class index per row (index into ``classes``).
    classes : array-like
        Unique disease names in order of first appearance.
    """
    df = pd.read_csv("dataset/disease.csv")
    # get dependent and independent features
    symptoms = df.iloc[:,1:]

    # Vocabulary of all (stripped) symptom strings; non-string cells are
    # missing values and are skipped.
    all_symptoms = set()
    for col in symptoms.columns:
        all_symptoms.update(
            v.strip() for v in symptoms[col].unique() if isinstance(v,str)
        )
    # Sorted rather than list(set): set iteration order of strings changes
    # between interpreter runs, which would shuffle the feature columns.
    all_symptoms = sorted(all_symptoms)
    column_of = {name: pos for pos, name in enumerate(all_symptoms)}

    # Fill a plain numpy matrix by position. Writing through
    # `frame.iloc[row][cols] = 1` is chained assignment and is not guaranteed
    # to reach the frame (it is a no-op under pandas Copy-on-Write).
    indicator = np.zeros((len(df),len(all_symptoms)))
    for row_pos, row in enumerate(symptoms.itertuples(index=False, name=None)):
        for value in row:
            if isinstance(value,str):
                indicator[row_pos, column_of[value.strip()]] = 1
    symptoms_embedding = pd.DataFrame(indicator, columns=all_symptoms)

    labels = df.iloc[:,0]
    classes = labels.unique()
    # Map names to their position in `classes` without assigning integers
    # into a slice of `df`.
    class_index = {cls: index for index,cls in enumerate(classes)}
    y = labels.map(class_index)

    return symptoms_embedding, y.astype(int), classes

def AIDS():
    """Load the AIDS classification dataset.

    The last CSV column is taken as the target and every other column as a
    feature. Class names are collected from the raw target before the data is
    handed to ``get_full_data`` (project-local helper from ``common``).

    Returns
    -------
    tuple
        ``(X, y, class_names)`` where ``X, y`` are whatever ``get_full_data``
        returns and ``class_names`` is a list of the unique target values as
        strings, in order of first appearance.
    """
    table = pd.read_csv("dataset/AIDS_Classification.csv")
    features, target = table.iloc[:, :-1], table.iloc[:, -1]
    # class names come from the raw target, in order of first appearance
    class_names = list(map(str, target.unique()))
    features, target = get_full_data(features, target)
    return features, target, class_names

def seeds():
    """Load the seeds dataset.

    The last CSV column is the class label; it is shifted down by one before
    being returned. All remaining columns are the features.

    Returns
    -------
    tuple
        ``(X, y, classes)``: the feature frame, the shifted label series and
        the unique shifted labels rendered as strings (order of first
        appearance).
    """
    table = pd.read_csv("dataset/seeds.csv")
    features = table.iloc[:, :-1]
    # shift the stored labels down by one
    target = table.iloc[:, -1] - 1
    class_names = target.map(str).unique()
    return features, target, class_names

def housing():
    """Load the housing dataset as a ``SaleCondition`` classification task.

    Rows whose ``SaleCondition`` is one of ``bad_classes_names`` are dropped.
    ``SaleCondition`` and ``SalePrice`` are removed from the features, the
    remaining object columns are label-encoded with
    ``encode_categorical_columns`` and missing numeric values are replaced
    by -1.

    Returns
    -------
    X : pandas.DataFrame
        Encoded features of the kept rows (original column names).
    y_encoded : numpy.ndarray
        int32 class index per kept row (index into ``classes``).
    classes : list of str
        Kept class names, sorted alphabetically. A sorted list is used
        instead of a set so that the label ids and the order of the returned
        names are identical on every run and agree with each other.
    """
    df = pd.read_csv("dataset/housing.csv")
    y=df['SaleCondition']
    bad_classes_names=['Alloca','AdjLand','Normal','Partial']
    # Set iteration order of strings is not stable between interpreter runs,
    # so fix the order before it is used to number the classes.
    classes = sorted(set(y.apply(lambda x: str(x)).unique())-set(bad_classes_names))

    X=df.drop(columns=['SaleCondition','SalePrice'])
    cols = X.columns
    # True for rows that stay in the dataset
    keep = ~np.asarray(y.isin(bad_classes_names))
    X=encode_categorical_columns(X)

    X=np.array(X)[keep]
    y=np.array(y)[keep]

    y_encoded = np.zeros_like(y,dtype=np.int32)
    for i,cls in enumerate(classes):
        y_encoded[y==cls]=i
    # remaining NaNs are missing numeric values; mark them with -1
    X[np.isnan(X)]=-1
    return pd.DataFrame(X,columns=cols),y_encoded,classes

In [17]:
from sklearn.discriminant_analysis import StandardScaler
from sklearn.preprocessing import RobustScaler
from xgboost import XGBClassifier

# Choose which dataset to load. Each loader defined in the first cell returns
# (features, labels, class names); swap `seeds` for another loader to switch.
X,y, y_classes = seeds()

# for high-dimensional data use `gpu` for device if you have one
# NOTE(review): nothing in this cell sets a device; this hint presumably refers
# to a model fitted elsewhere - confirm or drop.
# Fit a RobustScaler on the raw features. `X.to_numpy()` assumes the loader
# returned a pandas object for X - TODO confirm for loaders that go through
# `get_full_data`.
scaler = RobustScaler()
X_norm = scaler.fit_transform(X.to_numpy())

In [18]:
from sklearn.model_selection import train_test_split
from common import fit_KNN_model, fit_XGB_model

# Hyper-parameter searches (project-local helpers from `common`); keep only the
# best estimator of each search.
# NOTE(review): both searches receive the raw `X`, not `X_norm` - verify whether
# the helpers scale internally.
knn = fit_KNN_model(X,y,task="classification").best_estimator_
xgb = fit_XGB_model(X,y,task="classification").best_estimator_

# this model is used to search for outliers in the data
special_model = xgb
# this model is used to validate performance on a cleaned version of the dataset
validate_model = knn

Searching KNN params...



The total space of parameters 72 is smaller than n_iter=150. Running 72 iterations. For exhaustive searches, use GridSearchCV.



Searching XGB params...


In [19]:
# render original data and model performance on it
from common import cross_val_classification_report, cross_val_score_mean_std, generate_colors_for_classification
from kernel_pca_search import KernelPCASearchCV, kernel_pca_scorer
from render import *
from sklearn.model_selection import RepeatedKFold, RepeatedStratifiedKFold, cross_val_score

# Rendering mode: "2D" or "3D". Selects the number of embedding components
# and the matching plot function.
setup="2D"

# `render_shuffle` indexes the columns of the array passed to the plot
# function: the embedding coordinates followed by the colour columns
# (presumably RGB, three channels).
if setup=="3D":
    # 3d setup
    render_shuffle = [0,1,2,3,4,5]
    dot_size=5
    n_components=3
    axis_names = ['d1','d2','d3']
    plot_method = plot_3d_rgb
elif setup=="2D":
    # 2d setup    
    render_shuffle = [0,1,2,3,4]
    dot_size=8
    n_components=2
    axis_names = ['d1','d2']
    plot_method = plot_2d_rgb
else:
    # Fail loudly instead of continuing with undefined (or stale, from an
    # earlier execution of this cell) rendering settings.
    raise ValueError(f"unknown setup {setup!r}; expected '2D' or '3D'")

# maximum number of points sent to the plot
max_render = 10000
# maximum number of rows used to fit the kernel PCA projection
max_kpca_fit_values = 3000

# cross-validation scheme shared by every render_results call
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=50)
def render_results(X,y,model,scaler,outliers_mask,title="clean data"):
    """Report cross-validated performance on (optionally filtered) data and plot it.

    Parameters
    ----------
    X, y : array-like
        Full feature matrix and labels.
    model : estimator
        Model evaluated with ``cross_val_classification_report``.
    scaler : fitted transformer
        Applied to the kept rows before projecting them with the notebook-level
        ``pca`` object.
    outliers_mask : boolean array or None
        True marks rows to drop. With ``None`` the whole dataset is used.
    title : str
        Plot title.

    Notes
    -----
    Reads the notebook globals ``pca``, ``max_render``, ``cv``, ``y_classes``,
    ``plot_method``, ``render_shuffle``, ``axis_names`` and ``dot_size``.
    When a mask is given the removed fraction is printed, and nothing further
    is reported or plotted if that fraction is exactly zero.
    """
    filtering = outliers_mask is not None

    X_clean, y_clean = np.array(X), np.array(y)
    if filtering:
        keep = ~outliers_mask
        X_clean = X_clean[keep]
        y_clean = y_clean[keep]

    # low-dimensional coordinates and colours for the first `max_render` rows
    embedded = pca.transform(scaler.transform(X_clean[:max_render]))
    colors = generate_colors_for_classification(y_clean[:max_render],seed=100)
    to_render = np.concatenate([embedded,colors],axis=1)

    if filtering:
        removed_size = 1-len(y_clean)/len(y)
        print("removed ",removed_size)
        if removed_size==0:
            # the mask removed nothing
            return

    print(
        cross_val_classification_report(
            model=model,
            X=X_clean,
            y=y_clean,
            cv=cv,
            target_names=y_classes,
        )
    )
    plot_method(
        to_render[:max_render,render_shuffle],
        title,
        axis_names,
        template='plotly_dark',
        dot_size=dot_size,
    )

# Random row order used to pick the kernel-PCA fit subset. A seeded generator
# is used so the subset (and therefore the fitted projection input) is the
# same on every Restart & Run All.
indices = np.random.default_rng(50).permutation(len(X_norm))
indices_small=indices[:max_kpca_fit_values]
# Search kernel-PCA settings on the subset, then keep only the fitted
# projection exposed as `.kpca` (render_results reads the global `pca`).
pca = KernelPCASearchCV(n_components=n_components,n_iter=100,kernel='rbf')
pca = pca.fit(X_norm[indices_small]).kpca

render_results(X,y,validate_model,scaler,None,"original data")
print("Dim reduction quality",kernel_pca_scorer(pca,X_norm[indices][:max_render]))

              precision    recall  f1-score   support

           0       0.88      0.82      0.85       198
           1       0.94      0.95      0.94       204
           2       0.89      0.94      0.92       195

    accuracy                           0.90       597
   macro avg       0.90      0.90      0.90       597
weighted avg       0.90      0.90      0.90       597



Dim reduction quality 0.8924151623691854


In [20]:
# iterative filtering outliers search
from common import find_outliers

# Plain numpy versions of the features/labels for the outlier search.
X_numpy = np.array(X)
y_numpy = np.array(y)

# Requested fraction of samples to flag, passed to find_outliers below.
outliers_to_remove=0.2

# `find_outliers` is a project-local helper from `common`; it is driven by
# `special_model` and returns the outlier mask plus two further values.
# NOTE(review): the meaning of iterations / gamma / class_weight_scale_power
# and of the returned pred_loss / score is not visible in this notebook.
outliers_mask, pred_loss, score = find_outliers(
    X_numpy,
    y_numpy,
    special_model,
    outliers_to_remove=outliers_to_remove,
    iterations=5,
    gamma=0.9,
    evaluate_loss=metrics.mean_absolute_error,
    cv=5,
    repeats=3,
    class_weight_scale_power=0.5,
    plot=False
)

# Replace the requested fraction with the fraction actually flagged; the
# IsolationForest cell reuses this value as its `contamination`.
outliers_to_remove=sum(outliers_mask)/len(X)

render_results(X,y,validate_model,scaler,outliers_mask,"iterative filtering")

removed  0.18592964824120606
              precision    recall  f1-score   support

           0       0.97      0.98      0.97       138
           1       1.00      0.98      0.99       174
           2       0.98      0.99      0.99       174

    accuracy                           0.99       486
   macro avg       0.98      0.99      0.98       486
weighted avg       0.99      0.99      0.99       486



In [21]:
from sklearn.ensemble import IsolationForest

# Baseline: IsolationForest asked to flag the same fraction of rows that the
# iterative filtering cell flagged (`outliers_to_remove` was overwritten there).
# NOTE(review): depends on that cell having run first; presumably fails if it
# flagged nothing (contamination of 0) - verify.
clf = IsolationForest(random_state=10,contamination=outliers_to_remove)
# fit_predict labels outliers as -1
outliers_mask=clf.fit_predict(X)==-1
render_results(X,y,validate_model,scaler,outliers_mask,"isolation forest filtering")

removed  0.18592964824120606
              precision    recall  f1-score   support

           0       0.91      0.88      0.90       177
           1       0.94      0.97      0.96       138
           2       0.92      0.94      0.93       171

    accuracy                           0.93       486
   macro avg       0.93      0.93      0.93       486
weighted avg       0.93      0.93      0.93       486



In [22]:
from sklearn.cluster import DBSCAN

def get_outliers_dbscan(X, eps=0.5, min_samples=5):
    """Flag the samples that DBSCAN leaves unclustered.

    Parameters
    ----------
    X : array-like
        Samples to cluster.
    eps, min_samples :
        Forwarded unchanged to ``DBSCAN``.

    Returns
    -------
    Boolean array, True where the label returned by ``fit_predict`` is -1.
    """
    noise_label = -1
    cluster_ids = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X)
    return cluster_ids == noise_label
# NOTE(review): DBSCAN is run on the raw `X` with the default eps/min_samples;
# the recorded run below flagged about 72% of the rows, so the parameters (or
# the use of unscaled features) probably need tuning.
dbscan_outliers = get_outliers_dbscan(X)
# Only evaluate when at least one sample was kept; otherwise there is nothing
# to cross-validate or plot.
if sum(~dbscan_outliers)!=0:
    render_results(X,y,validate_model,scaler,dbscan_outliers,"dbscan filtering")
else:
    print("dbscan failed, try different parameters")

removed  0.7185929648241206
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        45
           1       1.00      1.00      1.00        39
           2       1.00      1.00      1.00        84

    accuracy                           1.00       168
   macro avg       1.00      1.00      1.00       168
weighted avg       1.00      1.00      1.00       168



In [23]:
from sklearn.svm import OneClassSVM

def outliers_svm(X, nu=0.3, kernel="rbf", gamma=0.01):
    """Flag outliers with a one-class SVM.

    Parameters
    ----------
    X : array-like
        Samples to fit and score.
    nu, kernel, gamma :
        Forwarded unchanged to ``OneClassSVM``. The defaults are the values
        this notebook previously hard-coded, so ``outliers_svm(X)`` behaves
        as before; exposing them mirrors ``get_outliers_dbscan`` and lets a
        caller tune the share of flagged samples per dataset.

    Returns
    -------
    Boolean array, True where ``fit_predict`` returned -1.
    """
    svm = OneClassSVM(nu=nu, kernel=kernel, gamma=gamma)
    y_pred = svm.fit_predict(X)
    return y_pred == -1

# Baseline: one-class SVM fitted on the raw `X`, then the same evaluation and
# plot as for the other filtering methods.
svm_outliers = outliers_svm(X)
render_results(X,y,validate_model,scaler,svm_outliers,"one class svm filtering")

removed  0.3015075376884422
              precision    recall  f1-score   support

           0       0.92      0.86      0.89       183
           1       0.92      0.91      0.91       120
           2       0.88      0.98      0.93       114

    accuracy                           0.91       417
   macro avg       0.91      0.92      0.91       417
weighted avg       0.91      0.91      0.91       417

