In [None]:
import pandas as pd
import numpy as np
import sklearn.metrics as metrics
from random import randint

In [None]:
from sklearn.calibration import LabelEncoder
from common import get_full_data
def heart_failure():
    """Load the heart-failure clinical records dataset.

    Reads ``dataset/heart_failure_clinical_records_dataset.csv``, takes the
    last column as the target and every other column as a feature, and passes
    both through ``get_full_data`` (imported from ``common``).

    Returns:
        tuple: ``(X, y, classes)`` where ``X`` and ``y`` are whatever
        ``get_full_data`` returns, and ``classes`` is a list holding the
        string form of each distinct target value in ascending order.
    """
    df = pd.read_csv("dataset/heart_failure_clinical_records_dataset.csv")
    features = df.iloc[:, :-1]
    target = df.iloc[:, -1]
    # Sort the distinct labels before naming them. ``unique()`` yields values
    # in order of first appearance, whereas scikit-learn's reports match
    # ``target_names`` against the labels in sorted order, so unsorted names
    # could end up attached to the wrong class.
    classes = [str(label) for label in sorted(target.unique())]
    X, y = get_full_data(features, target)
    return X, y, classes

def encode_categorical_columns(df):
    """Label-encode every object-dtype column of a DataFrame.

    Args:
        df: DataFrame to encode. It is not modified; encoding is done on a copy.

    Returns:
        A new DataFrame in which each column whose dtype is ``object`` has been
        replaced by the output of ``LabelEncoder.fit_transform`` for that
        column. All other columns are carried over unchanged.
    """
    # Work on a copy so the caller's frame is not silently rewritten.
    encoded = df.copy()
    for col in encoded.columns:
        # Only object (string) columns are transformed.
        if encoded[col].dtype == 'object':
            # A fresh encoder per column keeps each column's mapping independent.
            encoded[col] = LabelEncoder().fit_transform(encoded[col])
    return encoded

def disease():
    """Load the disease dataset and label-encode its categorical columns.

    Reads ``dataset/disease.csv``. The first column is the target and the
    remaining columns are the features; object-dtype columns are encoded by
    ``encode_categorical_columns``.

    Returns:
        tuple: ``(X, y, classes)`` where ``X`` is every column but the first
        (after encoding), ``y`` is the first column (after encoding), and
        ``classes`` is an array of the distinct original target values in
        ascending order.
    """
    df = pd.read_csv("dataset/disease.csv")
    # Capture the original label values before they are replaced by codes.
    # They are sorted because LabelEncoder numbers classes in sorted order, so
    # ``classes[i]`` is the label that was encoded as ``i``; order of first
    # appearance would pair names with the wrong codes.
    classes = df.iloc[:, 0].drop_duplicates().sort_values().to_numpy()
    df = encode_categorical_columns(df)

    X = df.iloc[:, 1:]
    y = df.iloc[:, 0]
    return X, y, classes

def seeds():
    """Load the seeds dataset.

    Reads ``dataset/seeds.csv``. The last column, minus one, is the target
    (presumably turning 1-based labels into 0-based ones); every other column
    is a feature.

    Returns:
        tuple: ``(X, y, classes)`` where ``classes`` is an array holding the
        string form of each distinct shifted label, ordered by ascending
        label value.
    """
    df = pd.read_csv("dataset/seeds.csv")
    y = df.iloc[:, -1] - 1
    # Sort numerically *before* converting to strings so the names line up
    # with the sorted label order regardless of how the rows are ordered in
    # the file (and so that e.g. 10 does not sort before 2).
    classes = y.drop_duplicates().sort_values().apply(str).to_numpy()
    X = df.iloc[:, :-1]
    return X, y, classes


In [None]:
from xgboost import XGBRegressor,  XGBClassifier
# Dataset under study: features, target, and the class names used to label the
# reports. `disease()` and `seeds()` above return the same kind of triple.
X,y, y_classes = heart_failure()

# Base classifier; its hyperparameters are tuned by the randomized search below.
# For high-dimensional data, set `device='gpu'` if a GPU is available.
special_model = XGBClassifier(device='cpu')

In [None]:
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from common import XGB_search_params

# Hyperparameter search space (provided by `common`).
params = XGB_search_params()
# Fixed seed so that Restart & Run All samples the same candidates every time.
# Drawing a new seed with `randint` on each run made the tuned model, and every
# result derived from it, impossible to reproduce. 50 matches the random_state
# used by the cross-validation splitters in this notebook.
state = 50
search = RandomizedSearchCV(
    special_model,
    params,
    n_iter=200,
    cv=5,
    random_state=state,
    n_jobs=-1,
)

search.fit(X,y)
# Continue with the best candidate, which RandomizedSearchCV refits on all of
# X, y by default.
special_model=search.best_estimator_

In [None]:
# do repeated stratified k-fold cross-validation with classification report
from sklearn.model_selection import RepeatedStratifiedKFold
from common import cross_val_classification_report

# Baseline: repeated stratified 5-fold CV (3 repeats, fixed seed) on the full dataset.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=50)
report = cross_val_classification_report(
    model=special_model,
    X=X.to_numpy(),
    # Pass a plain array for y as well, exactly as the cleaned-data evaluation
    # does, so X and y are indexed the same (positional) way and the result
    # cannot depend on the Series index.
    y=y.to_numpy(),
    cv=cv,
    target_names=y_classes
)
print(report)

In [None]:
# New method
from common import find_outliers

# Plain arrays for the outlier search and for mask-based row selection below.
X_numpy = X.to_numpy()
y_numpy = y.to_numpy()

# `find_outliers` is a project helper from `common`; it returns a mask over the
# rows plus a score (the score is not used further in this cell).
# NOTE(review): the loss is mean squared error computed for a classifier —
# confirm this is the intended criterion, especially for multi-class targets.
outliers_mask, score = find_outliers(
    X_numpy,
    y_numpy,
    special_model,
    outliers_to_remove=0.1,
    iterations=5,
    gamma=0.5,
    evaluate_loss=metrics.mean_squared_error,
    repeats=5,
    cv=10,
    plot=True
)

# Keep only the rows that were not flagged.
X_clean = X_numpy[~outliers_mask]
y_clean = y_numpy[~outliers_mask]

# Evaluate with the same splitter configuration as the baseline report
# (5 splits, 3 repeats, seed 50). Using 10 splits here trained each fold on a
# larger share of the data than the baseline, so the two reports differed in
# more than just the removed rows.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=50)
report = cross_val_classification_report(
    model=special_model,
    X=X_clean,
    y=y_clean,
    cv=cv,
    target_names=y_classes
)
print(report)


In [None]:
from common import generate_colors_for_classification
from sklearn.preprocessing import StandardScaler, RobustScaler
from kernel_pca_search import KernelPCASearchCV
from render import *

# Reduce the full dataset to 3 components with the project's KernelPCASearchCV
# helper (robust scaling, cv=5) so it can be drawn in 3D.
# NOTE(review): n_iter=-1 presumably requests an exhaustive search — confirm in
# `kernel_pca_search`.
kpca_cv = KernelPCASearchCV(n_components=3,scaler=RobustScaler(),cv=5,n_iter=-1)
x_transform = kpca_cv.fit_transform(X)

# Append the per-sample colour columns to the 3 coordinates and plot.
# `plot_3d_rgb` is presumably supplied by `from render import *` — verify.
colors = generate_colors_for_classification(y,seed=100)
data = np.concatenate([x_transform,colors],axis=1)
plot_3d_rgb(data,"Original data plot",["d1","d2","d3"])

# Reuse the projection fitted on the full data (transform only, no refit) and
# the same colour seed, so the cleaned plot is drawn in the same space.
# NOTE(review): the projection was fitted on a DataFrame but X_clean is a NumPy
# array — confirm the helper does not depend on column names.
x_clean_transform = kpca_cv.transform(X_clean)
y_clean_colors = generate_colors_for_classification(y_clean,seed=100)
data_clean = np.concatenate([x_clean_transform,y_clean_colors],axis=1)
plot_3d_rgb(data_clean,"Cleaned data plot",["d1","d2","d3"])

# Show what the search stored on the fitted object.
print(kpca_cv.score)
print(kpca_cv.kpca)

Compare performance against a control model: an `SVC` with an `rbf` kernel and a standard scaler.

Find the optimal hyperparameters using `RandomizedSearchCV`.