# IN PROGRESS — will be finished today :)

In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from matplotlib import pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

# Classification of a Heart with Hypertrophic Cardiomyopathy (Cardiomegaly)

### Specify data file path

In [6]:
# Location of the input CSV; a relative path, so it is resolved against the
# notebook's current working directory.
csv_path = "task_data.csv"

## Load data

Make sure to specify the correct decimal separator (this file uses a comma).

In [15]:
# Read the dataset. decimal="," tells pandas to treat a comma as the decimal
# mark when parsing numeric columns.
df = pd.read_csv(csv_path, decimal=",")

print("Loaded:", df.shape)
print("Columns:", df.columns[:14].tolist())

# Columns are identified by position: the first one is the sample ID and the
# second one is the target label.
id_col, label_col = df.columns[0], df.columns[1]

# Target vector, and the feature matrix made of every remaining column.
y = df[label_col]
X = df.drop([id_col, label_col], axis=1)


Loaded: (37, 14)
Columns: ['ID', 'Cardiomegaly', 'Heart width', 'Lung width', 'CTR - Cardiothoracic Ratio', 'xx', 'yy', 'xy', 'normalized_diff', 'Inscribed circle radius', 'Polygon Area Ratio', 'Heart perimeter', 'Heart area ', 'Lung area']


## Split the data into training and test sets

In [16]:
# Hold out 20% of the samples as a test set. stratify=y keeps the class
# proportions similar in both parts, and the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=42,
)

print("\nTrain/test shapes:", X_train.shape, X_test.shape)

# Report the class balance of each part of the split.
for header, labels in (
    ("Train label distribution:\n", y_train),
    ("Test  label distribution:\n", y_test),
):
    print(header, labels.value_counts())


Train/test shapes: (29, 12) (8, 12)
Train label distribution:
 Cardiomegaly
1    22
0     7
Name: count, dtype: int64
Test  label distribution:
 Cardiomegaly
1    6
0    2
Name: count, dtype: int64


## KNN model

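As you can see below, the accuracy isn't great. The dataset is also very small — only 29 samples in the training set — and it is imbalanced: only 7 of those samples represent healthy hearts. For reference, always predicting the majority class would already give 22/29 ≈ 0.76 accuracy on the training set.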

In [31]:
# Baseline pipeline: standardise the features, then classify with a
# 3-nearest-neighbours model using Manhattan distance and distance-weighted votes.
knn_steps = [
    ("scaler", StandardScaler()),
    (
        "model",
        KNeighborsClassifier(n_neighbors=3, weights="distance", metric="manhattan"),
    ),
]
pipe_knn = Pipeline(steps=knn_steps)

# Fit on the whole training split.
pipe_knn.fit(X_train, y_train)

# 5-fold cross-validation restricted to the training split; the test set is
# not touched here.
cv_scores = cross_val_score(estimator=pipe_knn, X=X_train, y=y_train, cv=5)

print("Accuracy per fold:", cv_scores.round(3))
print(f"Mean CV accuracy: {cv_scores.mean():.3f}")
print(f"Standard deviation: {cv_scores.std():.3f}")

Accuracy per fold: [0.833 0.833 0.833 0.667 0.6  ]
Mean CV accuracy: 0.753
Standard deviation: 0.100


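First, let's improve the validation method. Repeated stratified k-fold is better suited to small datasets because it repeats the k-fold split several times with different shuffles and averages the results, which makes the estimate less dependent on one particular split. As you can see, the estimated accuracy is slightly higher this time.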

In [37]:
# Stratified 5-fold cross-validation repeated 10 times (50 fits in total),
# seeded so that the folds are reproducible.
cv = RepeatedStratifiedKFold(
    n_repeats=10,
    n_splits=5,
    random_state=42,
)

# Score the current pipeline on the training split with the repeated scheme.
scores = cross_val_score(estimator=pipe_knn, X=X_train, y=y_train, cv=cv)

print(f"Mean CV accuracy: {scores.mean():.3f}")
print(f"Standard deviation: {scores.std():.3f}")

Mean CV accuracy: 0.773
Standard deviation: 0.144


Now let's try to optimize the model's hyperparameters for better accuracy.

In [42]:
# Search space for the KNN step; the "model__" prefix routes each parameter to
# the pipeline step named "model".
# 'minkowski' is deliberately not listed: with KNeighborsClassifier's default
# p=2 it is the same distance as 'euclidean', so it only added duplicate
# candidates (14 of 56) and extra fits without being able to change the result.
param_grid = {
    'model__n_neighbors': [1, 3, 5, 7, 9, 11, 15],
    'model__weights': ['uniform', 'distance'],
    'model__metric': ['euclidean', 'manhattan', 'chebyshev'],
}

# Stratified 5-fold CV repeated 10 times, seeded for reproducible folds.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=42)

# Fresh, unconfigured pipeline: the grid search sets the KNN hyperparameters.
# Scaling lives inside the pipeline, so it is re-fitted on the training part of
# every fold.
pipe_knn = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", KNeighborsClassifier()),
])

# Exhaustive search over param_grid, scored by accuracy, using all CPU cores.
grid = GridSearchCV(
    estimator=pipe_knn,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
)

# Only the training split is used for model selection.
grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)
print(f"Best cross-val accuracy: {grid.best_score_:.3f}")

Best parameters: {'model__metric': 'euclidean', 'model__n_neighbors': 1, 'model__weights': 'uniform'}
Best cross-val accuracy: 0.831


I found that the best parameters for KNN are: `{'model__metric': 'euclidean', 'model__n_neighbors': 1, 'model__weights': 'uniform'}`

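This is a noticeably better cross-validated accuracy — an improvement of about 6 percentage points (0.773 → 0.831). Note that this score is optimistically biased, because the same folds were used to choose the hyperparameters; the held-out test set will give a fairer estimate.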

Let's save this model for later evaluation.

In [43]:
# Keep the pipeline configured with the best-scoring hyperparameters found by
# the grid search, for later evaluation. GridSearchCV refits this estimator on
# the data passed to fit() when refit=True, which is its default and is not
# overridden where `grid` is created.
best_knn = grid.best_estimator_