In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.neighbors import KNeighborsClassifier

def clean_ruspini_dataset(input_file: str, output_file: str = "/content/ruspini.xlsx"):
    """
    Clean a raw Ruspini Excel sheet and save the result to a new Excel file.

    The sheet is read without a header. Every row that has a cell containing
    the text "CLASS" is treated as a header row; the rows between one header
    row and the next (or the end of the sheet) form one data block. Only the
    "#", "X", "Y" and "CLASS" columns of each block are kept. Blocks whose
    header row does not expose all four columns are skipped.

    Parameters:
        input_file (str): Path of the raw input .xlsx file.
        output_file (str): Path of the cleaned .xlsx file to write.

    Returns:
        pd.DataFrame: Cleaned dataset with columns "#", "X", "Y", "CLASS",
        all stored as nullable integers ("Int64").

    Raises:
        ValueError: If no block with all four header columns is found.
        TypeError: If a numeric cell is not integer-valued (raised by the
            final cast to "Int64").
    """
    wanted = ["#", "X", "Y", "CLASS"]
    raw = pd.read_excel(input_file, header=None)

    # A row is a header row as soon as any of its cells contains "CLASS".
    contains_class = raw.astype(str).apply(lambda col: col.str.contains("CLASS"))
    header_positions = [pos for pos, hit in enumerate(contains_class.any(axis=1)) if hit]

    blocks = []
    block_ends = header_positions[1:] + [len(raw)]
    for header_pos, end_pos in zip(header_positions, block_ends):
        # First occurrence of one of the wanted tokens inside each header cell
        # (substring match, so e.g. "X coord" still counts as "X").
        first_token = raw.iloc[header_pos].astype(str).str.extract(
            r"(#|X|Y|CLASS)", expand=False
        )

        # Map every wanted name to the first column whose header matched it.
        column_of = {}
        for column_label, token in first_token.items():
            if token in wanted:
                column_of.setdefault(token, column_label)

        if len(column_of) != len(wanted):
            # Header row is incomplete; this block cannot be interpreted.
            continue

        # Data rows start right after the header and stop before the next one.
        block = raw.iloc[header_pos + 1:end_pos]
        block = block.loc[:, [column_of[name] for name in wanted]].copy()
        block.columns = wanted

        # Drop rows with a missing value in any of the four columns
        # (e.g. blank spacer rows between blocks).
        blocks.append(block.dropna(subset=wanted))

    if not blocks:
        raise ValueError(
            f"No data block with the columns {wanted} was found in {input_file!r}"
        )

    clean_data = pd.concat(blocks, ignore_index=True)

    # Coerce non-numeric cells (stray titles, notes, ...) to NaN instead of
    # letting the integer cast fail on them, then discard those rows.
    clean_data = clean_data.apply(pd.to_numeric, errors="coerce")
    clean_data = clean_data.dropna(subset=wanted).reset_index(drop=True)
    clean_data = clean_data.astype("Int64")

    # Persist the cleaned dataset.
    clean_data.to_excel(output_file, index=False)

    return clean_data

def knn_cross_validation(data: pd.DataFrame, k_neighbors: int = 5, n_splits: int = 5):
    """
    Evaluate a K-NN classifier on the cleaned dataset with K-fold cross validation.

    Parameters:
        data (pd.DataFrame): Cleaned dataset with the columns X, Y, CLASS.
        k_neighbors (int): Number of neighbours used by the K-NN classifier.
        n_splits (int): Number of folds for the cross validation.

    Returns:
        dict: Evaluation result with the keys "scores" (per-fold scores),
        "mean_accuracy" and "std_dev".
    """
    # Build plain NumPy inputs explicitly. The columns may use pandas'
    # nullable "Int64" dtype, for which `.values` does not yield an ordinary
    # numeric ndarray; converting here keeps the estimator input well defined.
    X = data[["X", "Y"]].to_numpy(dtype=float)

    labels = data["CLASS"]
    if pd.api.types.is_integer_dtype(labels.dtype):
        y = labels.to_numpy(dtype="int64")
    else:
        # Non-integer labels (e.g. strings) are passed through unchanged.
        y = labels.to_numpy()

    # K-NN classifier under evaluation.
    knn = KNeighborsClassifier(n_neighbors=k_neighbors)

    # Shuffled K-fold split with a fixed seed so repeated runs give the same folds.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = cross_val_score(knn, X, y, cv=kf)

    return {
        "scores": scores,
        "mean_accuracy": np.mean(scores),
        "std_dev": np.std(scores)
    }

if __name__ == "__main__":
    # Step 1: clean the raw sheet and write the cleaned copy next to the script.
    clean_data = clean_ruspini_dataset(
        input_file="/content/ruspini.xlsx",
        output_file="ruspini_clean.xlsx",
    )
    print("Dataset bersih disimpan ke 'ruspini_clean.xlsx'")
    print(clean_data.head())

    # Step 2: run K-NN with 5-fold cross validation and report the metrics.
    results = knn_cross_validation(clean_data, k_neighbors=5, n_splits=5)
    print("\nHasil Cross Validation:")
    for caption, metric in (
        ("Akurasi tiap fold:", "scores"),
        ("Rata-rata akurasi:", "mean_accuracy"),
        ("Standar deviasi  :", "std_dev"),
    ):
        print(caption, results[metric])

Dataset bersih disimpan ke 'ruspini_clean.xlsx'
   #   X   Y  CLASS
0  1   4  53      1
1  2   5  63      1
2  3  10  59      1
3  4   9  77      1
4  5  13  49      1

Hasil Cross Validation:
Akurasi tiap fold: [1. 1. 1. 1. 1.]
Rata-rata akurasi: 1.0
Standar deviasi  : 0.0
