In [15]:
import pandas as pd
import numpy as np
import sklearn.metrics as metrics
from random import randint
from sklearn.calibration import LabelEncoder
from common import get_full_data

def heart_failure():
    df = pd.read_csv("dataset/heart_failure_clinical_records_dataset.csv")
    # get dependent and independent features
    X=df.iloc[:,:-1]
    y=df.iloc[:,-1]
    classes = y.unique()
    X,y=get_full_data(X,y)
    return X,y,[str(c) for c in classes]

def encode_categorical_columns(df):
    le = LabelEncoder()
    # Loop over all columns in the DataFrame
    for col in df.columns:
        # Check if the column is of type object (string)
        if df[col].dtype == 'object':
            # Use LabelEncoder to do the numeric transformation
            df[col] = le.fit_transform(df[col])
    return df

def disease():
    df = pd.read_csv("dataset/disease.csv")
    # get dependent and independent features
    symptoms = df.iloc[:,1:]
    all_symptoms = set()
    for col in symptoms.columns:
        unq = symptoms[col].unique()
        unq=[str.strip(v) for v in unq if isinstance(v,str)]
        all_symptoms.update(unq)
    
    all_symptoms=list(all_symptoms)
    symptoms_embedding = pd.DataFrame(
        np.zeros((len(df),len(all_symptoms))),
        columns=all_symptoms
    )
    
    uniques = symptoms.apply(lambda x: np.unique(np.array(x.dropna(),dtype=str)),axis=1)
    for id,u in enumerate(uniques):
        u=[str.strip(v) for v in u if isinstance(v,str)]
        symptoms_embedding.iloc[id][u]=1

    y = df.iloc[:,0]
    classes = y.unique()
    
    for index,cls in enumerate(classes):
        y[y==cls]=index
    
    return symptoms_embedding, y.astype(int), classes

def AIDS():
    df = pd.read_csv("dataset/AIDS_Classification.csv")
    X=df.iloc[:,:-1]
    y=df.iloc[:,-1]
    classes = y.unique()
    X,y=get_full_data(X,y)
    return X,y,[str(c) for c in classes]

def seeds():
    df = pd.read_csv("dataset/seeds.csv")
    y = df.iloc[:,-1]-1
    classes = y.apply(lambda x: str(x)).unique()
    X = df.iloc[:,:-1]
    return X,y, classes

def housing():
    df = pd.read_csv("dataset/housing.csv")
    y=df['SaleCondition']
    bad_classes_names=['Alloca','AdjLand','Normal','Partial']
    classes = set(y.apply(lambda x: str(x)).unique())-set(bad_classes_names)

    X=df.drop(columns=['SaleCondition','SalePrice'])
    bad_classes = np.any([y == cl for cl in bad_classes_names],axis=0)
    X=encode_categorical_columns(X)

    X=np.array(X)
    y=np.array(y)

    y=y[~bad_classes]
    X=X[~bad_classes]


    y_encoded = np.zeros_like(y,dtype=np.int32)
    for i,cls in enumerate(classes):
        y_encoded[y==cls]=i

    return X,y_encoded,classes

In [16]:
from xgboost import XGBClassifier
X,y, y_classes = seeds()

# for high-dimensional data use `gpu` for device if you have one
special_model = XGBClassifier(device='cpu')

In [17]:
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from common import XGB_search_params

params = XGB_search_params()
state = randint(0,1000)
search = RandomizedSearchCV(
    special_model,
    params,
    n_iter=200,
    cv=5,
    random_state=state,
    n_jobs=-1,
)

search.fit(X,y)
special_model=search.best_estimator_

In [18]:
# do repeated stratified k-fold cross-validation with classification report
from sklearn.model_selection import RepeatedStratifiedKFold
from common import cross_val_classification_report

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=50)
report = cross_val_classification_report(
    model=special_model,
    X=np.array(X),
    y=y,
    cv=cv,
    target_names=y_classes
)
print(report)

              precision    recall  f1-score   support

           0       0.93      0.91      0.92       198
           1       0.98      0.98      0.98       204
           2       0.93      0.95      0.94       195

    accuracy                           0.95       597
   macro avg       0.95      0.95      0.95       597
weighted avg       0.95      0.95      0.95       597



In [79]:
# New method
from common import find_outliers

X_numpy = np.array(X)
y_numpy = np.array(y)

outliers_to_remove=0.3

outliers_mask, score = find_outliers(
    X_numpy,
    y_numpy,
    special_model,
    outliers_to_remove=outliers_to_remove,
    iterations=5,
    gamma=0.9,
    evaluate_loss=metrics.mean_absolute_error,
    cv=5,
    repeats=5,
    class_weight_scale_power=0.5,
    plot=False
)

X_clean = X_numpy[~outliers_mask]
y_clean = y_numpy[~outliers_mask]
removed_size = np.count_nonzero(outliers_mask)/len(outliers_mask)

print("removed ",removed_size)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=50)
report = cross_val_classification_report(
    model=special_model,
    X=X_clean,
    y=y_clean,
    cv=cv,
    target_names=y_classes
)
print(report)


removed  0.135678391959799
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       147
           1       1.00      0.98      0.99       195
           2       0.96      0.98      0.97       174

    accuracy                           0.97       516
   macro avg       0.97      0.97      0.97       516
weighted avg       0.97      0.97      0.97       516



In [84]:
from common import generate_colors_for_classification
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.decomposition import KernelPCA as KPCA
from kernel_pca_search import kernel_pca_scorer
from render import *

scaler = RobustScaler()
dot_size = 8
X_numpy[np.isnan(X_numpy)]=-1
X_ = scaler.fit_transform(X_numpy)

transform = KPCA(n_components=2,kernel='rbf',fit_inverse_transform=True)
transform.fit(X_)

def render(model,X_clean,y_clean,title="data plot"):
    colors = generate_colors_for_classification(y_clean,seed=100)
    data = np.concatenate([X_clean,colors],axis=1)
    removed_size = 1-len(y_clean)/len(X)

    print("removed ",removed_size)

    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=50)
    report = cross_val_classification_report(
        model=model,
        X=X_clean,
        y=y_clean,
        cv=cv,
        target_names=y_classes
    )
    print(report)
    plot_2d_rgb(data,title,["d1","d2","d3"],dot_size,None)

render(special_model,transform.transform(X_),y,"original data")

render(special_model,transform.transform(X_)[~outliers_mask],y.to_numpy()[~outliers_mask],"cleaned data")
# print(kernel_pca_scorer(transform,X_))

removed  0.0
              precision    recall  f1-score   support

           0       0.82      0.88      0.85       198
           1       0.95      0.93      0.94       204
           2       0.92      0.88      0.90       195

    accuracy                           0.90       597
   macro avg       0.90      0.90      0.90       597
weighted avg       0.90      0.90      0.90       597



removed  0.13567839195979903
              precision    recall  f1-score   support

           0       0.98      0.95      0.96       147
           1       0.97      0.98      0.98       195
           2       0.99      1.00      0.99       174

    accuracy                           0.98       516
   macro avg       0.98      0.98      0.98       516
weighted avg       0.98      0.98      0.98       516



In [35]:
data = np.concatenate([X_,y.to_numpy()[:,np.newaxis]],axis=1)

In [80]:
from sklearn.svm import OneClassSVM

def remove_outliers_svm(X):
    svm = OneClassSVM(nu=removed_size, kernel="rbf", gamma=0.2)
    y_pred = svm.fit_predict(X)
    return X[y_pred == 1]  # Retain only inliers

svm_data_clean = remove_outliers_svm(data)
X_clean=svm_data_clean[:,:-1]
y_clean=svm_data_clean[:,-1]

render(special_model,transform.transform(X_clean),y_clean,"svm one class")

removed  0.14572864321608037
              precision    recall  f1-score   support

           0       0.85      0.90      0.87       174
           1       0.95      0.92      0.94       177
           2       0.94      0.91      0.93       159

    accuracy                           0.91       510
   macro avg       0.91      0.91      0.91       510
weighted avg       0.91      0.91      0.91       510



In [83]:
from sklearn.ensemble import IsolationForest

clf = IsolationForest(random_state=50,contamination=removed_size)
outliers_pred=clf.fit_predict(data)

data_clean = data[outliers_pred==1]

X_clean=data_clean[:,:-1]
y_clean=data_clean[:,-1]

render(special_model,transform.transform(X_clean),y_clean,"isolation forest")

removed  0.13567839195979903
              precision    recall  f1-score   support

           0       0.86      0.89      0.87       174
           1       0.96      0.94      0.95       165
           2       0.93      0.92      0.92       177

    accuracy                           0.91       516
   macro avg       0.91      0.91      0.91       516
weighted avg       0.91      0.91      0.91       516

