# This is a sample Jupyter Notebook

Below is an example of a code cell. 
Put your cursor into the cell and press Shift+Enter to execute it and select the next one, or click the 'Run Cell' button.

Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.

To learn more about Jupyter Notebooks in PyCharm, see [help](https://www.jetbrains.com/help/pycharm/ipython-notebook-support.html).
For an overview of PyCharm, go to Help -> Learn IDE features or refer to [our documentation](https://www.jetbrains.com/help/pycharm/getting-started.html).

In [1]:
from ucimlrepo import fetch_ucirepo

# Download the UCI dataset registered under repository id 320.
student_performance = fetch_ucirepo(id=320)

# Feature table and target table (pandas dataframes), bound in one step.
X, y = student_performance.data.features, student_performance.data.targets

# Print the dataset metadata first, then the per-variable information,
# one after the other on separate lines.
print(student_performance.metadata, student_performance.variables, sep="\n")


{'uci_id': 320, 'name': 'Student Performance', 'repository_url': 'https://archive.ics.uci.edu/dataset/320/student+performance', 'data_url': 'https://archive.ics.uci.edu/static/public/320/data.csv', 'abstract': 'Predict student performance in secondary education (high school). ', 'area': 'Social Science', 'tasks': ['Classification', 'Regression'], 'characteristics': ['Multivariate'], 'num_instances': 649, 'num_features': 30, 'feature_types': ['Integer'], 'demographics': ['Sex', 'Age', 'Other', 'Education Level', 'Occupation'], 'target_col': ['G1', 'G2', 'G3'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2008, 'last_updated': 'Fri Jan 05 2024', 'dataset_doi': '10.24432/C5TG7T', 'creators': ['Paulo Cortez'], 'intro_paper': {'ID': 360, 'type': 'NATIVE', 'title': 'Using data mining to predict secondary school student performance', 'authors': 'P. Cortez, A. M. G. Silva', 'venue': 'Proceedings of 5th Annual Future Business Technolo

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler

# Tunable preprocessing constants.
ABSENCES_CAP = 40            # upper clip applied to the "absences" column
RARE_CATEGORY_MIN_COUNT = 10  # categories seen fewer times are merged into "other"

# check missing values
print("Missing values per column: ")
print(X.isna().sum())

# drop duplicated feature rows and keep the targets aligned on the same index
X = X.drop_duplicates()
y = y.loc[X.index]

# Defensive guard: remove G1/G2 if they ever show up among the features.
# In the frame printed by this cell they are not feature columns (the dataset
# metadata lists G1, G2 and G3 as targets), so this is a no-op here.
# Note that y keeps every target column, so the regressor below is fitted as
# a multi-output model.
if {"G1","G2"}.issubset(X.columns):
    X = X.drop(columns=["G1","G2"])

# drop rows with missing values
X = X.dropna()
y = y.loc[X.index]

# convert yes/no columns to 0/1
binary_cols = ["schoolsup","famsup","paid","activities","nursery",
               "higher","internet","romantic"]
for col in binary_cols:
    if col in X.columns:
        X[col] = X[col].map({"yes": 1, "no": 0})

# convert simple two-level categories to 0/1
if "famsize" in X.columns:
    X["famsize"] = X["famsize"].map({"LE3": 0, "GT3": 1})
if "address" in X.columns:
    X["address"] = X["address"].map({"U": 1, "R": 0})

# .map() turns every value missing from its dictionary into NaN, and it runs
# after dropna(). Fail here with a clear message instead of letting the NaNs
# surface later inside the kNN fit. This also catches the cell being executed
# twice on an X that was already encoded.
unmapped_cols = X.columns[X.isna().any()].tolist()
if unmapped_cols:
    raise ValueError(
        "Encoding produced missing values in columns "
        f"{unmapped_cols}; re-run the data loading cell so X holds the raw "
        "features, or extend the value mappings."
    )

# clip outliers in absences
if "absences" in X.columns:
    X["absences"] = X["absences"].clip(upper=ABSENCES_CAP)

# group rare categories of the job, "reason" and "guardian" columns
job_cols = ["Mjob","Fjob"]
rare_group_cols = ["reason","guardian"]
for col in job_cols + rare_group_cols:
    if col in X.columns:
        counts = X[col].value_counts()
        rare = counts[counts < RARE_CATEGORY_MIN_COUNT].index
        X[col] = X[col].replace(rare, "other")

# one-hot encode remaining categoricals
X = pd.get_dummies(X, drop_first=True)

# train/test 80/20 split -- done BEFORE scaling so that the scaler never sees
# the held-out rows (fitting it on the full matrix leaks test statistics into
# the training features).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# scale numeric values: learn mean/std on the training rows only, then apply
# the same transformation to the test rows
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full matrix scaled with the training-fit scaler; kept so that any later
# cell referring to X_scaled keeps working.
X_scaled = scaler.transform(X)

# run kNN with a variety of K values
k_values = [1, 3, 5, 7, 9, 11, 15, 21]

for k in k_values:
    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(X_train, y_train)
    preds = knn.predict(X_test)
    print(f"k={k} predictions (first 10): {preds[:10]}")


Missing values per column: 
school        0
sex           0
age           0
address       0
famsize       0
Pstatus       0
Medu          0
Fedu          0
Mjob          0
Fjob          0
reason        0
guardian      0
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          0
activities    0
nursery       0
higher        0
internet      0
romantic      0
famrel        0
freetime      0
goout         0
Dalc          0
Walc          0
health        0
absences      0
dtype: int64
k=1 predictions (first 10): [[16. 14. 14.]
 [15. 16. 17.]
 [15. 14. 15.]
 [15. 16. 17.]
 [13. 14. 13.]
 [13. 14. 14.]
 [14. 14. 14.]
 [ 4.  8.  8.]
 [11. 12. 12.]
 [12. 12. 12.]]
k=3 predictions (first 10): [[15.         14.         14.66666667]
 [13.33333333 14.         15.66666667]
 [13.66666667 13.33333333 14.33333333]
 [12.         12.33333333 13.66666667]
 [12.33333333 12.66666667 12.33333333]
 [14.33333333 14.66666667 15.33333333]
 [16.         16.33333333 16.33333333]