In [59]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.style.use("ggplot")

In [60]:
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, KFold

In [61]:
data = load_iris()
X, y = data.data, data.target

In [62]:
# train test split
X_train, X_test,  y_train, y_test = train_test_split(X,y,train_size=100, shuffle=True, random_state=42)

In [63]:
# classifier
clf = SVC()
clf.fit(X_train, y_train)

In [64]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
print("train_score  =", clf.score(X_train, y_train))
print("test score = ",clf.score(X_test, y_test))

train_score  = 0.96
test score =  1.0


In [65]:

# compuute precision and recall
y_pred = clf.predict(X_test)

In [76]:
n = y_test.shape[0]
def precision(label):
    return len([i for i in range(n) if y_pred[i] ==label and y_test[i] == label])/ len([ i for i in range(n) if y_pred[i] == label ])
precisions = {
    "O" : precision(0),
    "1" : precision(1),
    "2" : precision(2)
}
precisions

{'O': 1.0, '1': 1.0, '2': 1.0}

In [74]:
def recall(label):
    return len([i for i in range(n) if y_pred[i] ==label and y_test[i] == label])/ len([ i for i in range(n) if y_test[i] == label ])
recalls = {
    "O" : recall(0),
    "1" : recall(1),
    "2" : recall(2)
}
recalls

{'O': 1.0, '1': 1.0, '2': 1.0}

In [81]:
from sklearn.metrics import classification_report
from pprint import PrettyPrinter
classification_report = classification_report(y_test, y_pred)
PrettyPrinter(sort_dicts=True).pprint(classification_report)

('              precision    recall  f1-score   support\n'
 '\n'
 '           0       1.00      1.00      1.00        19\n'
 '           1       1.00      1.00      1.00        15\n'
 '           2       1.00      1.00      1.00        16\n'
 '\n'
 '    accuracy                           1.00        50\n'
 '   macro avg       1.00      1.00      1.00        50\n'
 'weighted avg       1.00      1.00      1.00        50\n')


In [91]:
from sklearn.model_selection import KFold, cross_validate, GridSearchCV
cross_val_scores = cross_validate(
    SVC(),
    X_train,
    y_train,
    cv=10,
    scoring=["precision_macro", "recall_macro"],
    return_estimator=True
)
PrettyPrinter(indent=2, sort_dicts=True).pprint(cross_val_scores)

{ 'estimator': [ SVC(),
                 SVC(),
                 SVC(),
                 SVC(),
                 SVC(),
                 SVC(),
                 SVC(),
                 SVC(),
                 SVC(),
                 SVC()],
  'fit_time': array([0.00232005, 0.00086784, 0.00154591, 0.00104094, 0.00048614,
       0.00040507, 0.00039124, 0.00038719, 0.00037789, 0.00034714]),
  'score_time': array([0.00405002, 0.00341296, 0.00333095, 0.00162601, 0.00118089,
       0.00115204, 0.00113893, 0.00112677, 0.00111508, 0.00101709]),
  'test_precision_macro': array([1.        , 1.        , 1.        , 0.83333333, 0.91666667,
       1.        , 1.        , 1.        , 1.        , 0.91666667]),
  'test_recall_macro': array([1.        , 1.        , 1.        , 0.75      , 0.91666667,
       1.        , 1.        , 1.        , 1.        , 0.88888889])}


In [111]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Define the grid of hyperparameters
grid_params = {
    "kernel" : ["linear", "poly", "rbf", "sigmoid"],
    "degree" : [2, 3, 4],
    "gamma" : ["scale", "auto"],
    "coef0" : [0.0, 0.1, 0.3, 0.5],
    "C" : [1, 3, 0.5]
}

# Initialize GridSearchCV with accuracy as the scoring metric
grid = GridSearchCV(
    SVC(),
    param_grid=grid_params,
    scoring="accuracy",  # Use accuracy for scoring
    verbose=1,
    error_score=0,
    refit=True  # By default, refits the best model based on accuracy
)

# Fit the model to the training data
grid.fit(X_train, y_train)

# Get the best parameters based on accuracy
print("Best parameters based on accuracy:", grid.best_params_)


Fitting 5 folds for each of 288 candidates, totalling 1440 fits
Best parameters based on accuracy: {'C': 1, 'coef0': 0.0, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear'}


In [114]:
import pandas as pd
cv_results = pd.DataFrame(grid.cv_results_).to_csv("/Users/mac/Desktop/intro_ML/tp_svm/cv_results.csv")

# Chronic Kidney desease classification using SVM

In [115]:
import pandas as pd

In [397]:
data_csv = pd.read_csv("data/ChronicKidneyDisease.csv", index_col= "id")
print(data_csv.info())
data_csv.columns.to_list()

<class 'pandas.core.frame.DataFrame'>
Index: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             391 non-null    float64
 1   bp              388 non-null    float64
 2   sg              353 non-null    float64
 3   al              354 non-null    float64
 4   su              351 non-null    float64
 5   rbc             248 non-null    object 
 6   pc              335 non-null    object 
 7   pcc             396 non-null    object 
 8   ba              396 non-null    object 
 9   bgr             356 non-null    float64
 10  bu              381 non-null    float64
 11  sc              383 non-null    float64
 12  sod             313 non-null    float64
 13  pot             312 non-null    float64
 14  hemo            348 non-null    float64
 15  pcv             330 non-null    object 
 16  wc              295 non-null    object 
 17  rc              270 non-null    object 


['age',
 'bp',
 'sg',
 'al',
 'su',
 'rbc',
 'pc',
 'pcc',
 'ba',
 'bgr',
 'bu',
 'sc',
 'sod',
 'pot',
 'hemo',
 'pcv',
 'wc',
 'rc',
 'htn',
 'dm',
 'cad',
 'appet',
 'pe',
 'ane',
 'classification']

In [126]:
# arff format load
with open("data/chronic_kidney_disease/chronic_kidney_disease_full.arff", 'r') as f:
    file = f.read()
print(len(file))


46709


In [127]:
from scipy.io.arff import loadarff

In [137]:
data, meta_data = loadarff("data/chronic_kidney_disease/chronic_kidney_disease_full.arff")
print(meta_data)


Dataset: Chronic_Kidney_Disease
	age's type is numeric
	bp's type is numeric
	sg's type is nominal, range is ('1.005', '1.010', '1.015', '1.020', '1.025')
	al's type is nominal, range is ('0', '1', '2', '3', '4', '5')
	su's type is nominal, range is ('0', '1', '2', '3', '4', '5')
	rbc's type is nominal, range is ('normal', 'abnormal')
	pc's type is nominal, range is ('normal', 'abnormal')
	pcc's type is nominal, range is ('present', 'notpresent')
	ba's type is nominal, range is ('present', 'notpresent')
	bgr's type is numeric
	bu's type is numeric
	sc's type is numeric
	sod's type is numeric
	pot's type is numeric
	hemo's type is numeric
	pcv's type is numeric
	wbcc's type is numeric
	rbcc's type is numeric
	htn's type is nominal, range is ('yes', 'no')
	dm's type is nominal, range is ('yes', 'no')
	cad's type is nominal, range is ('yes', 'no')
	appet's type is nominal, range is ('good', 'poor')
	pe's type is nominal, range is ('yes', 'no')
	ane's type is nominal, range is ('yes', 'no'

In [143]:
pd_data = pd.DataFrame(
    data = data, 
    columns= [
        'age',
        'bp',
        'sg',
        'al',
        'su',
        'rbc',
        'pc',
        'pcc',
        'ba',
        'bgr',
        'bu',
        'sc',
        'sod',
        'pot',
        'hemo',
        'pcv',
        'wc',
        'rc',
        'htn',
        'dm',
        'cad',
        'appet',
        'pe',
        'ane',
        'classification'])
print(pd_data.info())
pd_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             391 non-null    float64
 1   bp              388 non-null    float64
 2   sg              400 non-null    object 
 3   al              400 non-null    object 
 4   su              400 non-null    object 
 5   rbc             400 non-null    object 
 6   pc              400 non-null    object 
 7   pcc             400 non-null    object 
 8   ba              400 non-null    object 
 9   bgr             356 non-null    float64
 10  bu              381 non-null    float64
 11  sc              383 non-null    float64
 12  sod             313 non-null    float64
 13  pot             312 non-null    float64
 14  hemo            348 non-null    float64
 15  pcv             329 non-null    float64
 16  wc              0 non-null      object 
 17  rc              0 non-null      obj

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,48.0,80.0,b'1.020',b'1',b'0',b'?',b'normal',b'notpresent',b'notpresent',121.0,...,44.0,,,b'yes',b'yes',b'no',b'good',b'no',b'no',
1,7.0,50.0,b'1.020',b'4',b'0',b'?',b'normal',b'notpresent',b'notpresent',,...,38.0,,,b'no',b'no',b'no',b'good',b'no',b'no',
2,62.0,80.0,b'1.010',b'2',b'3',b'normal',b'normal',b'notpresent',b'notpresent',423.0,...,31.0,,,b'no',b'yes',b'no',b'poor',b'no',b'yes',
3,48.0,70.0,b'1.005',b'4',b'0',b'normal',b'abnormal',b'present',b'notpresent',117.0,...,32.0,,,b'yes',b'no',b'no',b'poor',b'yes',b'yes',
4,51.0,80.0,b'1.010',b'2',b'0',b'normal',b'normal',b'notpresent',b'notpresent',106.0,...,35.0,,,b'no',b'no',b'no',b'good',b'no',b'no',


# ARFF Data has TWO columns filled with None Values, we decided to use csv

In [424]:
# we will work with the csv file


def wrangle(data : pd.DataFrame) :
    """cleans data before preprocessing

    Args:
        data (pd.DataFrame): input data frame

    Returns:
        pd.DataFrame : data cleaned and ready to feed to the preprocessor
    """
    #decode String
    clean_data = data.select_dtypes('object').apply(lambda col : col.astype(str).str.replace(r'\t', '', regex=True))
    clean_data = data.apply(pd.to_numeric, errors = "ignore")
    #fill nan with np.nan (even they're handled the same way)
    clean_data = clean_data.fillna(np.nan)
    #fill nan values with the most found value
    clean_data = clean_data.apply(lambda col : col.fillna(col.mode().iloc[0]))
    
    return clean_data

data_clean = wrangle(data_csv)

# X.select_dtypes("object").apply(lambda col: col.nunique()) # some columns could not be trasnformed because some data is badly wirtten

data_clean[['pcv', 'rc', 'wc']] = data_clean[['pcv', 'rc', 'wc']].apply(pd.to_numeric, errors="coerce")
data_clean.dropna(inplace=True)
X = data_clean.drop(columns=['classification'])
X[['dm', 'cad']] = X[['dm', 'cad']].apply(lambda col : col.astype(str).str.replace(r'\t', '', regex=True).str.strip())
y = data_clean['classification']

y = y.replace({"ckd\t" : "ckd"}) #y has the same class written with two different ways
# y.unique()


In [425]:
#scaling and one hot encoding
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

categorical_columns = X.select_dtypes("object").columns.to_list()
numerical_columns = X.select_dtypes('number').drop(columns=['sg', 'al', 'su']).columns.to_list() # we drop them because even they're numerical they still stand for categories

preprocessor = ColumnTransformer(
    transformers = [
        ("onehotencoder", OneHotEncoder(),  categorical_columns),
        ("standardscaler", StandardScaler(), numerical_columns)
    ]
)

In [436]:
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, recall_score, precision_score

pipeline = make_pipeline(
        preprocessor,
        SVC()
)
param_grid = {
    'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # Different kernels to test
    'svc__C': [0.1, 1, 10],  # Regularization parameter
    'svc__gamma': ['scale', 'auto', 0.1, 1]  # Gamma values (relevant for 'rbf' and 'poly')
}
scoring_metrics = {
    'accuracy': 'accuracy',
    'recall': make_scorer(recall_score, average='macro'),
    'precision': make_scorer(precision_score, average='macro', zero_division=1)
}

#setting our grid search Cv 
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=StratifiedKFold(n_splits=5), 
    verbose=1,
    scoring=scoring_metrics,
    refit="accuracy"
)

In [437]:
# split train test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [438]:
grid_search.fit(X_train, y_train)

# Extract grid search results
results = grid_search.cv_results_

# Prepare a DataFrame with the results
df_results = pd.DataFrame(results)

df_results

Fitting 5 folds for each of 48 candidates, totalling 240 fits


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_svc__C,param_svc__gamma,param_svc__kernel,params,split0_test_accuracy,split1_test_accuracy,...,std_test_recall,rank_test_recall,split0_test_precision,split1_test_precision,split2_test_precision,split3_test_precision,split4_test_precision,mean_test_precision,std_test_precision,rank_test_precision
0,0.008643,0.005584,0.014294,0.014747,0.1,scale,linear,"{'svc__C': 0.1, 'svc__gamma': 'scale', 'svc__k...",0.96875,0.984375,...,0.009632,12,0.961538,0.98,0.98,0.944444,0.961538,0.965504,0.013381,14
1,0.003948,0.000209,0.002981,0.000254,0.1,scale,poly,"{'svc__C': 0.1, 'svc__gamma': 'scale', 'svc__k...",0.984375,0.953125,...,0.022951,20,0.98,0.953871,0.98,0.953261,1.0,0.973426,0.017786,9
2,0.004042,0.000311,0.003315,5.8e-05,0.1,scale,rbf,"{'svc__C': 0.1, 'svc__gamma': 'scale', 'svc__k...",0.953125,0.984375,...,0.020468,21,0.944444,0.98,0.98,0.913793,0.98,0.959648,0.026745,22
3,0.004206,0.00036,0.003241,0.000252,0.1,scale,sigmoid,"{'svc__C': 0.1, 'svc__gamma': 'scale', 'svc__k...",0.90625,0.921875,...,0.025815,37,0.9,0.913793,0.887097,0.863636,0.944444,0.901794,0.026975,39
4,0.003513,0.000337,0.002709,7.2e-05,0.1,auto,linear,"{'svc__C': 0.1, 'svc__gamma': 'auto', 'svc__ke...",0.96875,0.984375,...,0.009632,12,0.961538,0.98,0.98,0.944444,0.961538,0.965504,0.013381,14
5,0.004022,8.4e-05,0.002852,5.2e-05,0.1,auto,poly,"{'svc__C': 0.1, 'svc__gamma': 'auto', 'svc__ke...",0.625,0.625,...,0.0,46,0.8125,0.8125,0.809524,0.809524,0.809524,0.810714,0.001458,44
6,0.00379,0.000163,0.003481,0.000159,0.1,auto,rbf,"{'svc__C': 0.1, 'svc__gamma': 'auto', 'svc__ke...",0.9375,0.96875,...,0.022868,32,0.928571,0.961538,0.961538,0.9,0.98,0.94633,0.028489,32
7,0.004228,0.000112,0.002874,3.6e-05,0.1,auto,sigmoid,"{'svc__C': 0.1, 'svc__gamma': 'auto', 'svc__ke...",0.90625,0.9375,...,0.026474,36,0.9,0.928571,0.913793,0.863636,0.944444,0.910089,0.027551,38
8,0.003199,9.4e-05,0.002705,0.000136,0.1,0.1,linear,"{'svc__C': 0.1, 'svc__gamma': 0.1, 'svc__kerne...",0.96875,0.984375,...,0.009632,12,0.961538,0.98,0.98,0.944444,0.961538,0.965504,0.013381,14
9,0.003403,0.000157,0.002822,0.000203,0.1,0.1,poly,"{'svc__C': 0.1, 'svc__gamma': 0.1, 'svc__kerne...",0.96875,0.984375,...,0.007907,1,0.961538,0.98,0.98,0.98,1.0,0.980308,0.012168,1
