In [1]:
import os
import pandas as pd
import numpy as np
from extract_vanilla import extract
from utils.flatten import flatten_columns

from sklearn.calibration import LabelEncoder
from sklearn.discriminant_analysis import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# extract features from each images
def extract_from_dataset(directory):
    data = []
    for label in os.listdir(directory): # open each sub folder in the dataset
        label_dir = os.path.join(directory, label)
        if os.path.isdir(label_dir):
            for image_file in os.listdir(label_dir):
                image_path = os.path.join(label_dir, image_file)
                features = extract(image_path)
                features['label'] = label # image labeling
                data.append(features)
    return data

In [4]:
dataset_path = 'dataset'
data = extract_from_dataset(dataset_path)

In [5]:
# convert list to dataframe
df = pd.DataFrame(data)

In [6]:
df.head()

Unnamed: 0,contrast,homogeneity,energy,correlation,label
0,"[5647.430821078479, 5753.671049596006, 5651.23...","[0.03876188957183313, 0.034914943251424524, 0....","[0.009302847447273882, 0.008963130954050453, 0...","[16.26881970055569, 15.720533725715503, 16.194...",cataract
1,"[5656.720036764689, 5718.2521184157085, 5647.9...","[0.04012935806284118, 0.03536837638986996, 0.0...","[0.009444620217210445, 0.009112843597308545, 0...","[17.243682753822856, 16.962286931427307, 17.25...",cataract
2,"[5528.260968137255, 5665.312556708693, 5441.17...","[0.041259554208304294, 0.03646657766911014, 0....","[0.010293502148089214, 0.009943581618168612, 0...","[20.60318426550338, 19.85738994541593, 21.0578...",cataract
3,"[5524.710845588194, 5663.947589388437, 5445.09...","[0.04214806407783401, 0.03712050978267664, 0.0...","[0.010382956194053265, 0.010025326377059721, 0...","[21.046386931769938, 20.245786824035758, 21.42...",cataract
4,"[6484.165609681379, 6736.247043444595, 6373.86...","[0.354220109901474, 0.3451964776358063, 0.3564...","[0.31832354908489713, 0.31221713297725806, 0.3...","[52.46914599196971, 50.744961081612175, 53.211...",cataract


In [7]:
def flatten_columns(df: pd.DataFrame, column_name):
    flattened = pd.DataFrame(df[column_name].tolist(), index=df.index)
    flattened.columns = [f"{column_name}_{i*45}" for i in range(flattened.shape[1])]
    df = df.drop(columns=[column_name])
    df = pd.concat([df, flattened], axis=1)
    return df

feature_columns = ['contrast', 'homogeneity', 'energy', 'correlation']
for col in feature_columns:
    df = flatten_columns(df, col)

In [8]:
df.head()

Unnamed: 0,label,contrast_0,contrast_45,contrast_90,contrast_135,homogeneity_0,homogeneity_45,homogeneity_90,homogeneity_135,energy_0,energy_45,energy_90,energy_135,correlation_0,correlation_45,correlation_90,correlation_135
0,cataract,5647.430821,5753.67105,5651.233609,5737.295871,0.038762,0.034915,0.039699,0.034492,0.009303,0.008963,0.009241,0.008938,16.26882,15.720534,16.194603,15.810678
1,cataract,5656.720037,5718.252118,5647.954488,5828.163245,0.040129,0.035368,0.040288,0.035583,0.009445,0.009113,0.009429,0.009082,17.243683,16.962287,17.259965,16.350353
2,cataract,5528.260968,5665.312557,5441.179887,5627.239523,0.04126,0.036467,0.041945,0.036792,0.010294,0.009944,0.010287,0.009969,20.603184,19.85739,21.057853,20.07663
3,cataract,5524.710846,5663.947589,5445.092969,5673.753987,0.042148,0.037121,0.041665,0.037361,0.010383,0.010025,0.010336,0.010017,21.046387,20.245787,21.426571,20.188657
4,cataract,6484.16561,6736.247043,6373.861091,6688.711419,0.35422,0.345196,0.356447,0.345236,0.318324,0.312217,0.32104,0.312106,52.469146,50.744961,53.211084,51.051152


In [28]:
csv_file_path = 'dataset_1.csv'
df.to_csv(csv_file_path, index=False)

In [24]:
# labeling
X = df.drop(columns=['label']) # feature
y = df['label'] # label

# data partition
# 20% testing, 80% training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

In [25]:
# encode label to numerical value
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [26]:
# normalize 
scaler = StandardScaler()
'''
x scaled = x - mean / sd
'''
X_train_scaled = scaler.fit_transform(X_train) # computes the mean and standard deviation for each feature
X_test_scaled = scaler.transform(X_test) # use scaled model

In [27]:
# train
knn = KNeighborsClassifier(n_neighbors=5, weights='distance', algorithm='auto')
knn.fit(X_train_scaled, y_train_encoded)

In [30]:
# test prediction
y_pred = knn.predict(X_test_scaled)
y_pred_labels = label_encoder.inverse_transform(y_pred)

accuracy = accuracy_score(y_test, y_pred_labels)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred_labels))

Accuracy: 0.45
              precision    recall  f1-score   support

    cataract       0.60      0.46      0.52        26
      normal       0.30      0.43      0.35        14

    accuracy                           0.45        40
   macro avg       0.45      0.45      0.44        40
weighted avg       0.49      0.45      0.46        40



In [52]:
import joblib

# export model
joblib.dump(knn, 'models_vanilla/knn_model.pkl')

# export label
joblib.dump(label_encoder, 'models_vanilla/label_encoder.pkl')

# export normalizer
joblib.dump(scaler, 'models_vanilla/scaler.pkl')

['models/scaler.pkl']