# Installing Dependencies

In [46]:
#install libraries for colab compatability
!pip install pandas numpy scikit-learn ucimlrepo



# Downloading Dataset/Describing Variables

In [47]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
student_performance = fetch_ucirepo(id=320)

# data (as pandas dataframes)
X = student_performance.data.features
y = student_performance.data.targets

# metadata
print(student_performance.metadata)

# variable information
print(student_performance.variables)

{'uci_id': 320, 'name': 'Student Performance', 'repository_url': 'https://archive.ics.uci.edu/dataset/320/student+performance', 'data_url': 'https://archive.ics.uci.edu/static/public/320/data.csv', 'abstract': 'Predict student performance in secondary education (high school). ', 'area': 'Social Science', 'tasks': ['Classification', 'Regression'], 'characteristics': ['Multivariate'], 'num_instances': 649, 'num_features': 30, 'feature_types': ['Integer'], 'demographics': ['Sex', 'Age', 'Other', 'Education Level', 'Occupation'], 'target_col': ['G1', 'G2', 'G3'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2008, 'last_updated': 'Fri Jan 05 2024', 'dataset_doi': '10.24432/C5TG7T', 'creators': ['Paulo Cortez'], 'intro_paper': {'ID': 360, 'type': 'NATIVE', 'title': 'Using data mining to predict secondary school student performance', 'authors': 'P. Cortez, A. M. G. Silva', 'venue': 'Proceedings of 5th Annual Future Business Technolo

# Cleaning Data

In [48]:
import pandas as pd

# check missing values
print("Missing values per column: ")
print(X.isna().sum())

# drop duplicates
X = X.drop_duplicates()
y = y.loc[X.index]

# drop G1 and G2 to avoid target leakage, as these are just the semester grades that make up final grade
if {"G1","G2"}.issubset(X.columns):
    X = X.drop(columns=["G1","G2"])

# drop rows with missing values
X = X.dropna()
y = y.loc[X.index]

# convert yes/no columns to 0/1
binary_cols = ["schoolsup","famsup","paid","activities","nursery",
               "higher","internet","romantic"]
for col in binary_cols:
    if col in X.columns:
        X[col] = X[col].map({"yes": 1, "no": 0})

# convert simple ordinal categories (LE3 is confusing for example)
if "famsize" in X.columns:
    X["famsize"] = X["famsize"].map({"LE3": 0, "GT3": 1})
if "address" in X.columns:
    X["address"] = X["address"].map({"U": 1, "R": 0})

# clip outliers in absences (some are abnormally high at > 100)
if "absences" in X.columns:
    X["absences"] = X["absences"].clip(upper=40)

# group rare job categories to limit number of categorical values
job_cols = ["Mjob","Fjob"]
for col in job_cols:
    if col in X.columns:
        counts = X[col].value_counts()
        rare = counts[counts < 10].index
        X[col] = X[col].replace(rare, "other")

# group rare "reason" and "guardian" categories to limit number of categorical values
rare_group_cols = ["reason","guardian"]
for col in rare_group_cols:
    if col in X.columns:
        counts = X[col].value_counts()
        rare = counts[counts < 10].index
        X[col] = X[col].replace(rare, "other")

# one-hot encode remaining categoricals
X = pd.get_dummies(X, drop_first=True)

# check zero-variance columns
zero_var = X.columns[X.nunique() <= 1]
print("Zero variance columns:", list(zero_var))

# realign y after all X cleaning
y = y.loc[X.index]

# scale numeric values
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# train/test 80/20 split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)


Missing values per column: 
school        0
sex           0
age           0
address       0
famsize       0
Pstatus       0
Medu          0
Fedu          0
Mjob          0
Fjob          0
reason        0
guardian      0
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          0
activities    0
nursery       0
higher        0
internet      0
romantic      0
famrel        0
freetime      0
goout         0
Dalc          0
Walc          0
health        0
absences      0
dtype: int64
Zero variance columns: []


# Training KNN Model

In [49]:
# run kNN with a variety of K values
from sklearn.neighbors import KNeighborsRegressor

k_values = [1, 3, 5, 7, 9, 11, 15, 21,35,42,50]

for k in k_values:
    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(X_train, y_train)
    preds = knn.predict(X_test)
    print(f"k={k} predictions (first 10): {preds[:10]}")

k=1 predictions (first 10): [[16. 14. 14.]
 [15. 16. 17.]
 [15. 14. 15.]
 [15. 16. 17.]
 [13. 14. 13.]
 [13. 14. 14.]
 [14. 14. 14.]
 [ 4.  8.  8.]
 [11. 12. 12.]
 [12. 12. 12.]]
k=3 predictions (first 10): [[15.         14.         14.66666667]
 [13.33333333 14.         15.66666667]
 [13.66666667 13.33333333 14.33333333]
 [12.         12.33333333 13.66666667]
 [12.33333333 12.66666667 12.33333333]
 [14.33333333 14.66666667 15.33333333]
 [16.         16.33333333 16.33333333]
 [ 8.         11.         11.33333333]
 [12.33333333 11.66666667 12.        ]
 [10.66666667 11.         11.33333333]]
k=5 predictions (first 10): [[13.6 13.2 13.8]
 [13.4 14.6 15.8]
 [14.2 14.2 14.8]
 [13.2 13.  14.2]
 [12.  12.2 12.2]
 [13.8 13.8 14.4]
 [13.4 14.2 14.2]
 [ 8.6 10.2 10.4]
 [13.6 13.4 13.8]
 [11.4 11.8 12. ]]
k=7 predictions (first 10): [[13.71428571 13.71428571 14.28571429]
 [12.71428571 13.28571429 14.28571429]
 [14.28571429 14.         14.42857143]
 [12.57142857 12.28571429 13.28571429]
 [11.5714

KNN Results

In [50]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Evaluate kNN on the final grade G3 to match advanced models
target_col = "G3" if "G3" in y_train.columns else y_train.columns[-1]
target_idx = list(y_train.columns).index(target_col)

knn_results = []

for k in k_values:
    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(X_train, y_train)
    preds = knn.predict(X_test)

    # use only the G3 (final grade) column for evaluation
    y_true = y_test[target_col].values
    preds_target = preds[:, target_idx]

    mse = mean_squared_error(y_true, preds_target)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, preds_target)

    knn_results.append((k, rmse, r2))
    print(f"k={k}: RMSE={rmse:.3f}, R-squared={r2:.3f}")

# Results in DataFrame
knn_results_df = pd.DataFrame(knn_results, columns=["k", "RMSE", "R2"])
display(knn_results_df)

# Select k with the lowest RMSE
best_idx = knn_results_df["RMSE"].idxmin()
best_row = knn_results_df.loc[best_idx]

best_k = best_row["k"]
knn_rmse = best_row["RMSE"]
knn_r2 = best_row["R2"]

print(f"\nBest kNN model: k={best_k:.0f}, RMSE={knn_rmse:.3f}, R-squared={knn_r2:.3f}")

k=1: RMSE=3.729, R-squared=-0.426
k=3: RMSE=3.195, R-squared=-0.047
k=5: RMSE=3.024, R-squared=0.063
k=7: RMSE=2.897, R-squared=0.139
k=9: RMSE=2.880, R-squared=0.150
k=11: RMSE=2.912, R-squared=0.130
k=15: RMSE=2.892, R-squared=0.142
k=21: RMSE=2.844, R-squared=0.170
k=35: RMSE=2.840, R-squared=0.173
k=42: RMSE=2.858, R-squared=0.162
k=50: RMSE=2.871, R-squared=0.155


Unnamed: 0,k,RMSE,R2
0,1,3.729302,-0.426179
1,3,3.194948,-0.046759
2,5,3.023549,0.06254
3,7,2.896921,0.139418
4,9,2.87962,0.149666
5,11,2.91228,0.130269
6,15,2.892365,0.142122
7,21,2.84414,0.170491
8,35,2.840148,0.172818
9,42,2.857992,0.162392



Best kNN model: k=35, RMSE=2.840, R-squared=0.173


# Complex Model

According to our feedback, we want to add an additional model that is more complex than KKN and analyze its success.

In [51]:
# Target for regression (final grade G3)
if "G3" in y.columns:
    y_target = y["G3"]
else:
    # use last column if names differ
    y_target = y.iloc[:, -1]

Split for advanced models

In [52]:
from sklearn.model_selection import train_test_split

# New split for advanced models (keeps X as a DataFrame)
X_train_adv, X_test_adv, y_train_adv, y_test_adv = train_test_split(
    X, y_target, test_size=0.2, random_state=42
)

Preprocessing for advanced models

In [53]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(exclude=['int64', 'float64']).columns

preprocess = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

Decision Tree Regressor

In [54]:
from sklearn.tree import DecisionTreeRegressor
import numpy as np

tree_model = Pipeline(steps=[
    ('prep', preprocess),
    ('model', DecisionTreeRegressor(random_state=42))
])

tree_model.fit(X_train_adv, y_train_adv)
tree_preds = tree_model.predict(X_test_adv)

tree_mse = mean_squared_error(y_test_adv, tree_preds)
tree_rmse = np.sqrt(tree_mse)
tree_r2 = r2_score(y_test_adv, tree_preds)

print("Decision Tree RMSE:", tree_rmse)
print("Decision Tree R-squared:", tree_r2)

Decision Tree RMSE: 3.79675975212873
Decision Tree R-squared: -0.47824081939758756


Potential advanced models: Decision Tree, Random Forest, Gradient Boosting, Bagging, and Lasso.

*Compare RMSE (lower is better) and R-squared (higher is better)*

In [55]:
comparison = pd.DataFrame({
    "Model": ["kNN (best k)", "Random Forest"],
    "RMSE": [knn_rmse, tree_rmse],
    "R_squared": [knn_r2, tree_r2]
})

comparison

Unnamed: 0,Model,RMSE,R_squared
0,kNN (best k),2.840148,0.172818
1,Random Forest,3.79676,-0.478241


Random Forest Regressor

In [57]:
from sklearn.ensemble import RandomForestRegressor

rf_model = Pipeline(steps=[
    ('prep', preprocess),
    ('model', RandomForestRegressor(
        n_estimators=300,
        random_state=42
    ))
])

rf_model.fit(X_train_adv, y_train_adv)
rf_preds = rf_model.predict(X_test_adv)

rf_mse = mean_squared_error(y_test_adv, rf_preds)
rf_rmse = np.sqrt(rf_mse)
rf_r2 = r2_score(y_test_adv, rf_preds)

print("Random Forest RMSE:", rf_rmse)
print("Random Forest R-squared:", rf_r2)

Random Forest RMSE: 2.821211869083286
Random Forest R-squared: 0.1838117076446364
