In [67]:
import pandas as pd
from sklearn.calibration import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.discriminant_analysis import StandardScaler
from extract import extract_glcm_features
import os

In [69]:
# Untuk extract fitur dari tiap citra pada folder dataset
# Diubah ke dalam bentuk array yang berisi dictionary
# Tiap dictionary merupakan fitur-fitur dari salah 1 citra
def extract_from_dataset(directory):
    feature_list = []
    for label in os.listdir(directory):
        label_dir = os.path.join(directory, label) # Buka tiap sub-folder dataset (normal & cataract)
        if os.path.isdir(label_dir):
            for image_file in os.listdir(label_dir):
                image_path = os.path.join(label_dir, image_file)
                features = extract_glcm_features(image_path) # Mengekstrak fitur GLCM dari tiap citra
                features['label'] = label # Melabeli citra dengan tipe asal untuk klasifikasi
                feature_list.append(features)
    return feature_list

In [71]:
# Ubah list ke dalam bentuk dataframe
features = extract_from_dataset('dataset')
df = pd.DataFrame(features)

In [75]:
df.head()

Unnamed: 0,contrast,homogeneity,energy,correlation,label
0,"[2.992809198864928, 5.004254851265639, 2.98098...","[0.7439072479198744, 0.7070063731821915, 0.743...","[0.3919405281814336, 0.3799704539638363, 0.391...","[0.9996011297734803, 0.999333093897312, 0.9996...",cataract
1,"[2.7214992334576014, 4.65141015277814, 2.83400...","[0.7372471305472055, 0.6996334801047361, 0.736...","[0.39948639096861405, 0.38771750807280025, 0.3...","[0.9996126257521095, 0.9993379632409186, 0.999...",cataract
2,"[2.345023326829275, 3.699565799121519, 2.30721...","[0.7365179590975013, 0.70393662385973, 0.73966...","[0.4134433682422608, 0.4023591831844324, 0.412...","[0.9993788974097597, 0.9990202009102542, 0.999...",cataract
3,"[1.796594023828878, 2.7783644645834955, 1.8535...","[0.7377793989330507, 0.7036490790807476, 0.741...","[0.41432713542868355, 0.40315748057918316, 0.4...","[0.9992994700377927, 0.9989167687510119, 0.999...",cataract
4,"[2.408935528732475, 3.993428604694216, 2.45887...","[0.7706724801333277, 0.7323765623844868, 0.768...","[0.47914118720129445, 0.47556541209374287, 0.4...","[0.9988540956037645, 0.9981006638296246, 0.998...",cataract


In [76]:
df.tail()

Unnamed: 0,contrast,homogeneity,energy,correlation,label
395,"[5.269424741068839, 9.625348847805839, 5.26324...","[0.699183397494847, 0.6589593126183183, 0.6978...","[0.37868718305104987, 0.36665180433403755, 0.3...","[0.9992465051317319, 0.9986236940479755, 0.999...",normal
396,"[3.6770883089327335, 5.700607619376608, 3.8238...","[0.6995413106342486, 0.6682785564063354, 0.698...","[0.4186097421989255, 0.40940259234870396, 0.41...","[0.9981386846261553, 0.9971146261158514, 0.998...",normal
397,"[3.192721064367939, 5.521911064915017, 3.12463...","[0.7510963195443294, 0.7135930802481628, 0.748...","[0.3970132646829875, 0.3853428960731312, 0.396...","[0.999646748483285, 0.9993890726121228, 0.9996...",normal
398,"[2.9384123908256767, 5.231607919891849, 3.0685...","[0.7462400816707314, 0.7073959813550902, 0.743...","[0.388832083725205, 0.37663715532334313, 0.388...","[0.9996543270277192, 0.9993845892105775, 0.999...",normal
399,"[3.1518644935458724, 5.48047794501336, 3.06634...","[0.7327228973876301, 0.6894882088266594, 0.727...","[0.38418037822302775, 0.3718587450050784, 0.38...","[0.9996053432897303, 0.9993138163514689, 0.999...",normal


In [77]:
# Agar fitur pada setiap data fitur (contrast, energy, dll) pada dataframe tidak berada dalam bentuk array / list
# Di 'flatten', menormalisasi kolom dataframe
def flatten_columns(df: pd.DataFrame, column_name):
    flattened = pd.DataFrame(df[column_name].tolist(), index=df.index)
    flattened.columns = [f"{column_name}_{i}" for i in range(flattened.shape[1])]
    df = df.drop(columns=[column_name])
    df = pd.concat([df, flattened], axis=1)
    return df

feature_columns = ['contrast', 'homogeneity', 'energy', 'correlation']
for col in feature_columns:
    df = flatten_columns(df, col)

df.head()

Unnamed: 0,label,contrast_0,contrast_1,contrast_2,contrast_3,homogeneity_0,homogeneity_1,homogeneity_2,homogeneity_3,energy_0,energy_1,energy_2,energy_3,correlation_0,correlation_1,correlation_2,correlation_3
0,cataract,2.992809,5.004255,2.980989,5.241061,0.743907,0.707006,0.743095,0.705578,0.391941,0.37997,0.391437,0.380007,0.999601,0.999333,0.999603,0.999302
1,cataract,2.721499,4.65141,2.834009,4.713724,0.737247,0.699633,0.736444,0.699506,0.399486,0.387718,0.398941,0.387792,0.999613,0.999338,0.999597,0.999329
2,cataract,2.345023,3.699566,2.307216,3.784402,0.736518,0.703937,0.739667,0.704127,0.413443,0.402359,0.412845,0.402364,0.999379,0.99902,0.999389,0.998998
3,cataract,1.796594,2.778364,1.853599,2.877122,0.737779,0.703649,0.741069,0.703123,0.414327,0.403157,0.413716,0.403113,0.999299,0.998917,0.999277,0.998878
4,cataract,2.408936,3.993429,2.458874,3.696425,0.770672,0.732377,0.768114,0.732097,0.479141,0.475565,0.478909,0.475576,0.998854,0.998101,0.99883,0.998242


In [79]:
# pelabelan
X = df.drop(columns=['label'])
y = df['label']

# partisi data
# 20% untuk testing dan 80% untuk training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [80]:
# ubah label ke bentuk numerik | fine tuning 1
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [81]:
# normalisasi fitur | fine tuning 2
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [87]:
# training model knn
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train_scaled, y_train_encoded)

In [91]:
# tes akurasi model
y_pred = knn.predict(X_test_scaled)
y_pred_labels = label_encoder.inverse_transform(y_pred)

accuracy = accuracy_score(y_test, y_pred_labels)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred_labels))

Accuracy: 0.825
              precision    recall  f1-score   support

    cataract       0.88      0.54      0.67        26
      normal       0.81      0.96      0.88        54

    accuracy                           0.82        80
   macro avg       0.84      0.75      0.77        80
weighted avg       0.83      0.82      0.81        80



In [54]:
import joblib

# export model
joblib.dump(knn, 'knn_model.pkl')

# export label
joblib.dump(label_encoder, 'label_encoder.pkl')

# export normalizer
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']