In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import pickle


In [17]:
# Read the student-performance CSV (path is relative to the working directory).
DATA_PATH = "StudentsPerformance.csv"
df = pd.read_csv(DATA_PATH)

# Preview the first rows to check the column layout.
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [3]:
# Dataset overview: column names, non-null counts and dtypes
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [4]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric score columns
df.describe()


Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [5]:
# Remove exact duplicate rows, keeping the first occurrence of each.
df = df.drop_duplicates(keep='first')

# Per-column count of missing values.
df.isna().sum()


gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64

In [6]:
# Columns that feed the average, and the cut-off that defines the target.
SCORE_COLS = ['math score', 'reading score', 'writing score']
RISK_THRESHOLD = 60  # a student whose average is strictly below this is labelled at risk

# Per-student mean of the three exam scores.
df['average_score'] = df[SCORE_COLS].mean(axis=1)

# Binary classification target: 1 = at risk (average < threshold), 0 = otherwise.
# A vectorized comparison replaces the row-wise apply(lambda ...); the result is
# cast to int64 so the column holds the same 0/1 integers as before.
df['risk'] = (df['average_score'] < RISK_THRESHOLD).astype('int64')

# Preview the two derived columns.
df[['average_score', 'risk']].head()


Unnamed: 0,average_score,risk
0,72.666667,0
1,82.333333,0
2,92.666667,0
3,49.333333,1
4,76.333333,0


In [7]:
# One fitted encoder per categorical column, keyed by column name. Keeping each
# encoder (instead of refitting a single shared one) preserves every column's
# string -> integer mapping so it can be re-applied with .transform() or
# reversed with .inverse_transform() on new data.
# Note: re-running this cell after the columns are already encoded finds no
# object columns and leaves this mapping empty; re-run from the load cell instead.
label_encoders = {}

# String-valued (object dtype) columns are the ones that need encoding.
categorical_cols = df.select_dtypes(include='object').columns

for col in categorical_cols:
    le = LabelEncoder()
    # Replace the string categories in place with integer codes.
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le


In [8]:
# Feature matrix: every column except the target and the average it is derived from.
# NOTE(review): 'risk' is computed from 'average_score', which is the mean of the
# three score columns that remain in X. The label is therefore a deterministic
# function of the features (target leakage), so the reported accuracy mostly
# measures how well the models recover that threshold rule. Confirm whether the
# score columns are meant to be available at prediction time.
X = df.drop(columns=['risk', 'average_score'])

# Target vector: 1 = at risk, 0 = not at risk.
y = df['risk']


In [9]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)


In [11]:
# k-nearest-neighbours classifier using the 5 closest training samples.
# fit() returns the estimator itself, so construction and training can be chained.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predicted labels for the held-out split.
y_pred_knn = knn.predict(X_test)


In [12]:
# Gaussian Naive Bayes classifier trained on the same training split.
# fit() returns the estimator itself, so construction and training can be chained.
nb = GaussianNB().fit(X_train, y_train)

# Predicted labels for the held-out split.
y_pred_nb = nb.predict(X_test)


In [13]:
# Compute the held-out metrics for KNN first, then report them.
knn_accuracy = accuracy_score(y_test, y_pred_knn)
knn_confusion = confusion_matrix(y_test, y_pred_knn)  # rows: true labels, columns: predicted labels
knn_report = classification_report(y_test, y_pred_knn)

print("=== Evaluasi KNN ===")
print("Accuracy:", knn_accuracy)
print(knn_confusion)
print(knn_report)


=== Evaluasi KNN ===
Accuracy: 0.935
[[135   3]
 [ 10  52]]
              precision    recall  f1-score   support

           0       0.93      0.98      0.95       138
           1       0.95      0.84      0.89        62

    accuracy                           0.94       200
   macro avg       0.94      0.91      0.92       200
weighted avg       0.94      0.94      0.93       200



In [14]:
# Compute the held-out metrics for Naive Bayes first, then report them.
nb_accuracy = accuracy_score(y_test, y_pred_nb)
nb_confusion = confusion_matrix(y_test, y_pred_nb)  # rows: true labels, columns: predicted labels
nb_report = classification_report(y_test, y_pred_nb)

print("=== Evaluasi Naive Bayes ===")
print("Accuracy:", nb_accuracy)
print(nb_confusion)
print(nb_report)


=== Evaluasi Naive Bayes ===
Accuracy: 0.95
[[129   9]
 [  1  61]]
              precision    recall  f1-score   support

           0       0.99      0.93      0.96       138
           1       0.87      0.98      0.92        62

    accuracy                           0.95       200
   macro avg       0.93      0.96      0.94       200
weighted avg       0.95      0.95      0.95       200



In [19]:
# Persist the trained estimators together with the fitted scaler. Both models
# were trained on standardized features, so the scaler must be shipped with
# them: new inputs have to pass through scaler.transform() before predict().
# Only unpickle these files from a trusted location; loading a pickle can
# execute arbitrary code.
artifacts = {
    'model_knn.pkl': knn,    # KNN model
    'model_nb.pkl': nb,      # Naive Bayes model
    'scaler.pkl': scaler,    # StandardScaler used to prepare the features
}

for filename, obj in artifacts.items():
    with open(filename, 'wb') as f:
        pickle.dump(obj, f)

print("Model berhasil disimpan:")
for filename in artifacts:
    print("-", filename)


Model berhasil disimpan:
- model_knn.pkl
- model_nb.pkl
