In [152]:
import pandas as pd
import numpy as np
import sklearn.metrics as metrics
from random import randint
from sklearn.calibration import LabelEncoder
from common import get_full_data

def heart_failure():
    """Load the heart-failure clinical records dataset.

    Reads ``dataset/heart_failure_clinical_records_dataset.csv`` and treats the
    last column as the target and all preceding columns as features.

    Returns:
        tuple: ``(X, y, classes)`` where ``X`` and ``y`` are whatever
        ``get_full_data`` returns for the feature/target pair, and ``classes``
        is a list with the string form of each distinct target value.
    """
    records = pd.read_csv("dataset/heart_failure_clinical_records_dataset.csv")
    features, target = records.iloc[:, :-1], records.iloc[:, -1]
    # Distinct labels are captured before the data is handed to get_full_data.
    distinct_labels = target.unique()
    # NOTE(review): get_full_data comes from the project's `common` module;
    # its exact effect on X/y is not visible here - confirm before relying on it.
    features, target = get_full_data(features, target)
    return features, target, list(map(str, distinct_labels))

def encode_categorical_columns(df):
    """Label-encode every object-dtype column of ``df``.

    The DataFrame is modified in place and the same object is returned.
    Columns whose dtype is not ``object`` are left untouched.
    """
    encoder = LabelEncoder()
    for name in df.columns:
        # Only string-like (object dtype) columns need encoding.
        if df[name].dtype != 'object':
            continue
        # fit_transform refits the shared encoder, so every column gets
        # its own independent label mapping.
        df[name] = encoder.fit_transform(df[name])
    return df

def disease():
    """Load the disease/symptom dataset as a multi-hot symptom matrix.

    Reads ``dataset/disease.csv``. The first column holds the disease label;
    every remaining column holds a symptom name or is empty.

    Returns:
        tuple: ``(symptoms_embedding, y, classes)`` where

        * ``symptoms_embedding`` is a float DataFrame with one row per CSV row
          and one column per distinct, whitespace-stripped symptom name
          (columns sorted alphabetically so the feature order is identical on
          every run). A cell is 1 when the row lists that symptom, else 0.
        * ``y`` is an integer Series holding, for each row, the position of its
          label inside ``classes``.
        * ``classes`` is the result of ``unique()`` on the label column, i.e.
          the labels in order of first appearance (not stripped).
    """
    df = pd.read_csv("dataset/disease.csv")
    symptoms = df.iloc[:, 1:]

    def _clean(values):
        # Keep only real strings (drops NaN from empty cells) and trim padding.
        return [value.strip() for value in values if isinstance(value, str)]

    all_symptoms = set()
    for col in symptoms.columns:
        all_symptoms.update(_clean(symptoms[col].unique()))

    # A set of strings has no stable iteration order across interpreter runs;
    # sorting makes the column layout deterministic.
    all_symptoms = sorted(all_symptoms)
    column_of = {name: position for position, name in enumerate(all_symptoms)}

    # Fill a plain NumPy array and wrap it once. Writing through
    # `frame.iloc[i][cols] = 1` is chained assignment and may update a
    # temporary copy instead of the frame itself.
    embedding = np.zeros((len(df), len(all_symptoms)))
    for row_position, row in enumerate(symptoms.itertuples(index=False, name=None)):
        for name in _clean(row):
            embedding[row_position, column_of[name]] = 1
    symptoms_embedding = pd.DataFrame(embedding, columns=all_symptoms)

    labels = df.iloc[:, 0]
    classes = labels.unique()

    # Map each label to its index in `classes` without mutating a slice of df.
    code_of = {label: code for code, label in enumerate(classes)}
    y = labels.map(code_of)

    return symptoms_embedding, y.astype(int), classes

def AIDS():
    """Load the AIDS classification dataset.

    Reads ``dataset/AIDS_Classification.csv`` and treats the last column as the
    target and all preceding columns as features.

    Returns:
        tuple: ``(X, y, classes)`` where ``X`` and ``y`` are whatever
        ``get_full_data`` returns for the feature/target pair, and ``classes``
        is a list with the string form of each distinct target value.
    """
    table = pd.read_csv("dataset/AIDS_Classification.csv")
    features, target = table.iloc[:, :-1], table.iloc[:, -1]
    # Distinct labels are captured before the data is handed to get_full_data.
    distinct_labels = target.unique()
    # NOTE(review): get_full_data is a project helper from `common`; what it
    # does to X/y is not visible in this notebook - confirm.
    features, target = get_full_data(features, target)
    return features, target, list(map(str, distinct_labels))

def seeds():
    """Load the seeds dataset.

    Reads ``dataset/seeds.csv``; the last column is the target and the
    remaining columns are the features.

    Returns:
        tuple: ``(X, y, classes)`` where ``X`` is the feature DataFrame, ``y``
        is the target column with 1 subtracted from every value, and
        ``classes`` holds the distinct values of ``y`` rendered as strings.
    """
    table = pd.read_csv("dataset/seeds.csv")
    features = table.iloc[:, :-1]
    # Shift the stored labels down by one.
    target = table.iloc[:, -1] - 1
    class_names = target.apply(str).unique()
    return features, target, class_names


In [153]:
from xgboost import XGBClassifier
# Select the dataset used by the rest of the notebook: features, integer
# labels, and the class names (disease() returns them in that order).
X,y, y_classes = disease()

# Base estimator shared by later cells. It runs on the CPU here; for
# high-dimensional data, set `device` to `gpu` if one is available.
special_model = XGBClassifier(device='cpu')

In [155]:
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from common import XGB_search_params

# Candidate hyperparameter space supplied by the project's `common` module.
params = XGB_search_params()
# Use a fixed seed rather than a fresh random one so the sampled candidates,
# and therefore the selected model, are the same on every Restart & Run All.
state = 50
# 200 sampled candidates x 5 folds = 1000 fits, spread over all cores.
search = RandomizedSearchCV(
    special_model,
    params,
    n_iter=200,
    cv=5,
    random_state=state,
    n_jobs=-1,
)

search.fit(X,y)
# From here on `special_model` is the best estimator found by the search.
special_model=search.best_estimator_

7120.64s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
7120.85s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
7121.05s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
7121.24s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
7121.45s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=of

KeyboardInterrupt: 

In [156]:
# do repeated stratified k-fold cross-validation with classification report
from sklearn.model_selection import RepeatedStratifiedKFold
from common import cross_val_classification_report

# Baseline evaluation: a default-configured XGBClassifier is scored here,
# not `special_model`.
# 5 stratified folds repeated 3 times; the fixed random_state keeps the
# splits identical between runs.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=50)
report = cross_val_classification_report(
    model=XGBClassifier(),
    X=X.to_numpy(),
    y=y,
    cv=cv,
    # Class names returned by disease(); presumably used as the row labels
    # of the report - confirm against cross_val_classification_report.
    target_names=y_classes
)
print(report)

                                         precision    recall  f1-score   support

                       Fungal infection       1.00      1.00      1.00       360
                                Allergy       1.00      1.00      1.00       360
                                   GERD       1.00      1.00      1.00       360
                    Chronic cholestasis       1.00      1.00      1.00       360
                          Drug Reaction       1.00      1.00      1.00       360
                    Peptic ulcer diseae       1.00      1.00      1.00       360
                                   AIDS       1.00      1.00      1.00       360
                              Diabetes        1.00      1.00      1.00       360
                        Gastroenteritis       1.00      1.00      1.00       360
                       Bronchial Asthma       1.00      1.00      1.00       360
                          Hypertension        1.00      1.00      1.00       360
                           

In [40]:
# New method
from common import find_outliers

# Work on plain NumPy arrays from here on.
X_numpy = X.to_numpy()
y_numpy = y.to_numpy()

# find_outliers is a project helper from `common`; the meaning of its tuning
# arguments is not visible in this notebook.
# NOTE(review): `special_model` is only the tuned estimator if the
# RandomizedSearchCV cell ran to completion; otherwise it is still the
# plain XGBClassifier(device='cpu') created earlier.
# NOTE(review): evaluate_loss is mean_squared_error although `y` holds class
# indices - confirm this is the intended loss for a classification target.
outliers_mask, score = find_outliers(
    X_numpy,
    y_numpy,
    special_model,
    outliers_to_remove=0.1,
    iterations=5,
    gamma=0.5,
    evaluate_loss=metrics.mean_squared_error,
    cv=5,
    repeats=3,
    plot=False
)

# `outliers_mask` is used as a boolean row mask: keep the rows NOT flagged.
# (`score` is not used anywhere in this cell.)
X_clean = X_numpy[~outliers_mask]
y_clean = y_numpy[~outliers_mask]

# Re-evaluate on the cleaned data, this time with 10 folds x 3 repeats.
# Note that this rebinds the notebook-level names `cv` and `report`.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=50)
report = cross_val_classification_report(
    model=special_model,
    X=X_clean,
    y=y_clean,
    cv=cv,
    target_names=y_classes
)
print(report)


              precision    recall  f1-score   support

           0       0.98      1.00      0.99      4674
           1       0.98      0.94      0.96      1110

    accuracy                           0.98      5784
   macro avg       0.98      0.97      0.98      5784
weighted avg       0.98      0.98      0.98      5784



In [36]:
from common import generate_colors_for_classification
from sklearn.preprocessing import StandardScaler, RobustScaler
from kernel_pca_search import KernelPCASearchCV, kernel_pca_scorer
from render import *

# Scale with RobustScaler before the kernel-PCA projection.
# Fit on a plain array rather than the DataFrame: `X_clean` below is a NumPy
# array, and a scaler fitted on a DataFrame warns "X does not have valid
# feature names" when it later transforms an array. The scaled values are
# identical either way.
scaler = RobustScaler()
X_ = scaler.fit_transform(X.to_numpy())

# Project to 3 components with the project's KernelPCASearchCV helper.
# NOTE(review): the meaning of n_iter=-1 is defined in kernel_pca_search - confirm.
transform = KernelPCASearchCV(n_components=3,n_iter=-1)
x_transform = transform.fit_transform(X_)

# Original data: projected coordinates followed by per-sample color columns.
colors = generate_colors_for_classification(y,seed=100)
data = np.concatenate([x_transform,colors],axis=1)
plot_3d_rgb(data,"Original data plot",["d1","d2","d3"])

# Cleaned data: reuse the scaler and projection fitted on the full data so
# both plots share the same coordinate system.
X_clean_ = scaler.transform(X_clean)
x_transform = transform.transform(X_clean_)
colors = generate_colors_for_classification(y_clean,seed=100)
data = np.concatenate([x_transform,colors],axis=1)
plot_3d_rgb(data,"Cleaned data plot",["d1","d2","d3"])

# Score of the fitted projection on the scaled full data set.
print(kernel_pca_scorer(transform,X_))




13 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
13 fits failed with the following error:
Traceback (most recent call last):
  File "/home/vlad/Programs/Git/python-outliers-detector/venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, **fit_params)
  File "/home/vlad/Programs/Git/python-outliers-detector/venv/lib/python3.11/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/vlad/Programs/Git/python-outliers-detector/venv/lib/python3.11/site-packages/sklearn/decomposition/_kernel_pca.py", line 446, in fit
    s


X does not have valid feature names, but RobustScaler was fitted with feature names



0.3623449511034926


Compare performance against a control model: an `SVC` with an `rbf` kernel and a standard scaler

Find the optimal hyperparameters using `RandomizedSearchCV`