# In [None]:
import kagglehub
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

# Fetch the most recent copy of the dataset from Kaggle; `path` is whatever
# kagglehub.dataset_download returns for it.
path = kagglehub.dataset_download("spscientist/students-performance-in-exams")
print("Dataset downloaded from Kaggle to:", path)

# Build the CSV location under that path and load it into a DataFrame.
# (No existence check is performed here; a missing file surfaces from read_csv.)
csv_file = os.path.join(path, "StudentsPerformance.csv")
df = pd.read_csv(csv_file)
print("\nFirst 3 rows of the dataset:\n", df.iloc[:3])

# Integer codes for every categorical column, keyed by column name.
# Values missing from a column's table become NaN after Series.map.
categorical_encodings = {
    'gender': {'male': 0, 'female': 1},
    'race/ethnicity': {
        'group A': 0, 'group B': 1, 'group C': 2, 'group D': 3, 'group E': 4,
    },
    'parental level of education': {
        'some high school': 0,
        'high school': 1,
        'some college': 2,
        "associate's degree": 3,
        "bachelor's degree": 4,
        "master's degree": 5,
    },
    'lunch': {'standard': 0, 'free/reduced': 1},
    'test preparation course': {'none': 0, 'completed': 1},
}

# Replace each categorical column in place with its integer encoding.
for column_name, code_table in categorical_encodings.items():
    df[column_name] = df[column_name].map(code_table)

# Columns that make up the feature matrix, in the order the model sees them.
features = [
    "gender",
    "race/ethnicity",
    "parental level of education",
    "lunch",
    "test preparation course",
    "math score",
    "reading score",
    "writing score",
]
X = df[features].to_numpy()

# Total score is the sum of the three subject scores; the class label is
# derived from it further down.
df['total_score'] = (
    df['math score'].add(df['reading score']).add(df['writing score'])
)

def categorize_score(total_score):
    """Map a total score to a class index.

    Returns 0 (Fail) below 150, 1 (Average) below 200, 2 (Good) below 250,
    and 3 (Excellent) otherwise. A value that compares false against every
    threshold (e.g. NaN) therefore also yields 3.
    """
    # Walk the exclusive upper bounds in ascending order; the position of the
    # first bound the score falls under is the class index.
    for class_index, upper_bound in enumerate((150, 200, 250)):
        if total_score < upper_bound:
            return class_index
    return 3  # Excellent

# Derive the class label for every row from its total score.
df['label'] = df['total_score'].map(categorize_score)
y = df['label'].to_numpy()

# Show how many rows fall into each class.
print("\nClass distribution in the dataset:")
print(df['label'].value_counts())

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("\nTraining and testing data prepared.")

# KNN Implementation
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two points.

    Args:
        x1: First point, as a NumPy array or any array-like of numbers.
        x2: Second point, with a shape compatible with ``x1``.

    Returns:
        The distance as a NumPy floating-point scalar.
    """
    # Coerce to float first: plain lists cannot be subtracted, and unsigned
    # or narrow integer arrays would wrap around on subtraction/squaring.
    diff = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
    return np.sqrt(np.sum(diff ** 2))

def knn_predict(X_train, y_train, X_test, k=5):
    """Classify each row of ``X_test`` by majority vote of its k nearest neighbours.

    Distances are Euclidean. Training points at equal distance keep their
    original training order, and a tied vote goes to the smallest label.
    If ``k`` exceeds the number of training points, all of them vote.

    Args:
        X_train: 2-D array-like of training features, one row per sample.
        y_train: 1-D array-like of labels, one per training row. Any label
            type that NumPy can sort is accepted (not only non-negative ints).
        X_test: 2-D array-like of query points, one row per sample.
        k: Number of neighbours that vote; must be at least 1.

    Returns:
        A NumPy array with one predicted label per row of ``X_test``.

    Raises:
        ValueError: If ``k`` is less than 1, the training set is empty, or
            ``X_train`` and ``y_train`` differ in length.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    X_train = np.asarray(X_train, dtype=float)
    y_train = np.asarray(y_train)
    X_test = np.asarray(X_test, dtype=float)

    if X_train.shape[0] == 0:
        raise ValueError("X_train must contain at least one sample")
    if X_train.shape[0] != y_train.shape[0]:
        raise ValueError(
            "X_train and y_train must have the same number of samples "
            f"({X_train.shape[0]} != {y_train.shape[0]})"
        )

    predictions = []
    for test_point in X_test:
        # Distance from this query to every training row in one vectorized step.
        distances = np.sqrt(np.sum((X_train - test_point) ** 2, axis=1))
        # A stable sort keeps equally distant points in training order.
        nearest = np.argsort(distances, kind="stable")[:k]
        # np.unique returns labels sorted ascending, so argmax (first maximum)
        # resolves a tied vote in favour of the smallest label.
        labels, votes = np.unique(y_train[nearest], return_counts=True)
        predictions.append(labels[np.argmax(votes)])
    return np.array(predictions)

# Classify the held-out test rows with k = 5 neighbours.
print("\nRunning KNN predictions (k=5)...")
y_pred = knn_predict(X_train, y_train, X_test, k=5)

# KNN has no fitted parameters, so the "model" that gets persisted is simply
# the training data together with the chosen k.
model_data = {"X_train": X_train, "y_train": y_train, "k": 5}
with open("student_knn_multiclass_model.pkl", mode="wb") as file:
    pickle.dump(model_data, file)
print("\nModel saved as 'student_knn_multiclass_model.pkl'.")

# Predict for a new data point
def load_model_and_predict(new_data_point, model_path="student_knn_multiclass_model.pkl"):
    """Load a pickled KNN model and predict the class of one data point.

    Args:
        new_data_point: Sequence of feature values for a single sample.
        model_path: Path of the pickle file to load. It must contain a dict
            with the keys ``"X_train"``, ``"y_train"`` and ``"k"``. Defaults
            to the file name this script saves the model under.

    Returns:
        The predicted label for ``new_data_point``.
    """
    # NOTE(security): pickle.load can execute arbitrary code while
    # unpickling; only point model_path at files from a trusted source.
    with open(model_path, "rb") as file:
        model_data = pickle.load(file)

    X_train = model_data["X_train"]
    y_train = model_data["y_train"]
    k = model_data["k"]

    # knn_predict works on a batch of rows, so wrap the single point in a
    # one-row array and unwrap the single result.
    prediction = knn_predict(X_train, y_train, np.array([new_data_point]), k=k)
    return prediction[0]

# Human-readable names for the class indices 0..3.
class_names = ["Fail", "Average", "Good", "Excellent"]

# Classify one hand-written sample; values follow the order of `features`.
new_data_point = [1, 2, 4, 0, 1, 85, 90, 95]
new_prediction = load_model_and_predict(new_data_point)
print(f"\nPrediction for new data point {new_data_point}: Class {new_prediction} ({class_names[new_prediction]})")

# Side-by-side table of true and predicted labels for the test set.
print("\nKNN Predictions vs. Actual Labels:")
actual_class_names = [class_names[idx] for idx in y_test]
predicted_class_names = [class_names[idx] for idx in y_pred]
comparison_df = pd.DataFrame({
    "Actual Label": y_test,
    "Predicted Label": y_pred,
    "Actual Class": actual_class_names,
    "Predicted Class": predicted_class_names,
})
print(comparison_df.head(10))  # only the first 10 rows

# Accuracy is the share of test rows whose prediction equals the true label.
accuracy = (y_test == y_pred).mean() * 100
print(f"\nKNN Accuracy: {accuracy:.2f}%")

# Visualization: Correct vs Incorrect Predictions
comparison_df["Correct"] = comparison_df["Actual Label"] == comparison_df["Predicted Label"]

# value_counts() orders by frequency and drops outcomes that never occur, so
# pin the order to [incorrect, correct] and fill a missing outcome with 0.
# Otherwise a run with only one outcome draws a single bar that the fixed
# tick labels and palette below would mislabel.
correct_counts = (
    comparison_df["Correct"].value_counts().reindex([False, True], fill_value=0)
)
outcome_labels = ["InCorrect", "Correct"]

plt.figure(figsize=(6, 4))
# Explicit string categories keep bar i, colour i and label i aligned.
sns.barplot(x=outcome_labels, y=correct_counts.values, palette=["red", "green"])
plt.title("KNN Prediction Results: Correct vs Incorrect")
plt.xticks(ticks=[0, 1], labels=outcome_labels)
plt.ylabel("Number of Predictions")
plt.xlabel("Correct vs Incorrect")
plt.show()
