In [15]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd

# Step 1: Parse the data into a structured format
data=pd.read_csv("ct5129_dyslexia\\data\\Dyt-desktop.csv")
# Splitting the single column into multiple columns based on the delimiter';'
parsed_data = data.iloc[:, 0].str.split(';', expand=True)

# Step 2: Assign meaningful column names
columns = [
    "Gender", "NativeLang", "OtherLang", "Age",
    *(f"Clicks{i};Hits{i};Misses{i};Score{i};Accuracy{i};Missrate{i}".split(";") for i in range(1, 33)),
    "Dyslexia"
]
parsed_data.columns = [col for sublist in columns for col in (sublist if isinstance(sublist, list) else [sublist])]

# Step 3: Convert relevant columns to numerical and encode categorical features
# Example: Encoding Gender and NativeLang
parsed_data['Gender'] = LabelEncoder().fit_transform(parsed_data['Gender'])
parsed_data['NativeLang'] = LabelEncoder().fit_transform(parsed_data['NativeLang'])
parsed_data['OtherLang'] = LabelEncoder().fit_transform(parsed_data['OtherLang'])

# Convert numeric columns
for col in parsed_data.columns[3:]:  # Skipping first 3 non-numeric columns
    parsed_data[col] = pd.to_numeric(parsed_data[col], errors='coerce')

# Step 4: Handle missing values (if any)
parsed_data.fillna(parsed_data.mean(), inplace=True)

# Step 5: Select features and target
features = parsed_data.drop(columns=["Dyslexia"])
target = parsed_data["Dyslexia"]
target = LabelEncoder().fit_transform(target)  # Encode the target

# Step 6: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Step 7: Train a kNN classifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Step 8: Make predictions and evaluate
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

accuracy, classification_rep


(1.0,
 '              precision    recall  f1-score   support\n\n           0       1.00      1.00      1.00       729\n\n    accuracy                           1.00       729\n   macro avg       1.00      1.00      1.00       729\nweighted avg       1.00      1.00      1.00       729\n')