In [18]:
%pip install -U scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 23.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [20]:
import numpy as np
import pandas as pd

df = pd.read_csv("data_manual_v2.csv")
df = df.drop(["Tijdstempel"], axis=1)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305 entries, 0 to 304
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   programme          305 non-null    object 
 1   course_ML          305 non-null    int64  
 2   course_retrieval   305 non-null    int64  
 3   course_statistics  305 non-null    int64  
 4   course_databases   305 non-null    int64  
 5   gender             305 non-null    int64  
 6   ChatGPT            305 non-null    int64  
 7   birthday           305 non-null    object 
 8   students_in_room   305 non-null    int64  
 9   prev_answer_stand  305 non-null    int64  
 10  stress_level       305 non-null    int64  
 11  sports_per_week    305 non-null    int64  
 12  random_number      305 non-null    float64
 13  bed_time           305 non-null    object 
 14  good_day_(1)       305 non-null    int64  
 15  good_day_(2)       305 non-null    int64  
dtypes: float64(1), int64(12), 

# Transformations

- Transformations done manually
    - Grouped together labels of last two columns into labels: Work (0), Recreation (1), Health (2), Social (3), Weather (4), Food (5), Other (6)
    - Converted random numbers that were strings into numbers (if applicable)
        - Otherwise, picked a random integer in [0, 100]

### 1. Converting birthday to age

In [21]:
from datetime import date

ages = []
for i, row in df.iterrows():
    parts = row["birthday"].split("-")
    birthday = date(int(parts[0]), int(parts[1]), int(parts[2]))
    now = date.today()
    age = now.year - birthday.year - ((now.month, now.day) < (birthday.month, birthday.day))
    ages.append(age)
df["birthday"] = ages
df.rename(columns={"birthday": "age"}, inplace=True)
df

Unnamed: 0,programme,course_ML,course_retrieval,course_statistics,course_databases,gender,ChatGPT,age,students_in_room,prev_answer_stand,stress_level,sports_per_week,random_number,bed_time,good_day_(1),good_day_(2)
0,AI,1,0,1,0,1,8,23,100,0,50,1,1.0,4:30:00,0,3
1,AI,1,0,1,0,5,1,25,319,0,100,0,69.0,23:30:00,6,6
2,AI,0,0,0,0,1,8,25,600,0,60,2,42.0,23:00:00,5,2
3,AI,0,0,1,0,2,0,25,220,0,20,5,17.0,23:30:00,1,3
4,Data Mining,1,1,1,1,1,1,26,100,0,50,5,12.0,23:00:00,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300,CS,1,1,0,1,1,8,22,200,0,70,10,67.0,1:00:00,5,4
301,Exch,0,0,0,1,1,0,26,400,0,70,0,13.0,1:09:00,0,0
302,CS,1,0,1,0,2,0,26,150,0,10,5,18.0,4:00:00,3,0
303,AI,1,0,1,1,5,0,26,150,1,10,5,69.0,4:00:00,6,6


### 2. Convert bedtime to seconds

In [22]:
bedtimes = []
for i, row in df.iterrows():
    parts = row["bed_time"].split(":")
    bedtime = int(parts[0])*3600 + int(parts[1])*60 + int(parts[2])
    bedtimes.append(bedtime)
df["bed_time"] = bedtimes
df

Unnamed: 0,programme,course_ML,course_retrieval,course_statistics,course_databases,gender,ChatGPT,age,students_in_room,prev_answer_stand,stress_level,sports_per_week,random_number,bed_time,good_day_(1),good_day_(2)
0,AI,1,0,1,0,1,8,23,100,0,50,1,1.0,16200,0,3
1,AI,1,0,1,0,5,1,25,319,0,100,0,69.0,84600,6,6
2,AI,0,0,0,0,1,8,25,600,0,60,2,42.0,82800,5,2
3,AI,0,0,1,0,2,0,25,220,0,20,5,17.0,84600,1,3
4,Data Mining,1,1,1,1,1,1,26,100,0,50,5,12.0,82800,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300,CS,1,1,0,1,1,8,22,200,0,70,10,67.0,3600,5,4
301,Exch,0,0,0,1,1,0,26,400,0,70,0,13.0,4140,0,0
302,CS,1,0,1,0,2,0,26,150,0,10,5,18.0,14400,3,0
303,AI,1,0,1,1,5,0,26,150,1,10,5,69.0,14400,6,6


###  3. Normalize numerical features

In [23]:
def normalize_column(df, names):
    for name in names:
        df[name] = (df[name] - df[name].min()) / (df[name].max() - df[name].min())

def standardize_column(df, names):
    for name in names:
        df[name] = (df[name] - df[name].mean()) / (df[name].std())
        
normalize_column(df, ["age", "students_in_room", "stress_level", "sports_per_week", "random_number", "bed_time"])
df

Unnamed: 0,programme,course_ML,course_retrieval,course_statistics,course_databases,gender,ChatGPT,age,students_in_room,prev_answer_stand,stress_level,sports_per_week,random_number,bed_time,good_day_(1),good_day_(2)
0,AI,1,0,1,0,1,8,0.142857,0.079402,0,0.5,0.018868,2.365812e-203,0.187630,0,3
1,AI,1,0,1,0,5,1,0.238095,0.331415,0,1.0,0.000000,4.122457e-202,0.979847,6,6
2,AI,0,0,0,0,1,8,0.238095,0.654776,0,0.6,0.037736,2.579536e-202,0.958999,5,2
3,AI,0,0,1,0,2,0,0.238095,0.217491,0,0.2,0.094340,1.150905e-202,0.979847,1,3
4,Data Mining,1,1,1,1,1,1,0.285714,0.079402,0,0.5,0.094340,8.651787e-203,0.958999,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300,CS,1,1,0,1,1,8,0.095238,0.194476,0,0.7,0.188679,4.008166e-202,0.041696,5,4
301,Exch,0,0,0,1,1,0,0.285714,0.424626,0,0.7,0.000000,9.223240e-203,0.047950,0,0
302,CS,1,0,1,0,2,0,0.285714,0.136939,0,0.1,0.094340,1.208050e-202,0.166782,3,0
303,AI,1,0,1,1,5,0,0.285714,0.136939,1,0.1,0.094340,4.122457e-202,0.166782,6,6


# Feature Selection

In [81]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split, GridSearchCV

label = df["programme"]
features = df.loc[:, df.columns != "programme"]
x_train, x_test, y_train, y_test = train_test_split(features, label, test_size=0.3)
print(x_train.info())

sel = SelectFromModel(RandomForestClassifier(n_estimators=1000))
sel.fit(x_train, y_train)

selected_features = x_train.columns[(sel.get_support())]
x_train = x_train[selected_features]
x_test = x_test[selected_features]

print(sel.estimator_.feature_importances_.ravel())

def one_hot_encode(df, col_name):
    one_hot = pd.get_dummies(df[col_name], prefix=col_name, prefix_sep="_")
    df = df.drop(col_name, axis=1)
    df = df.join(one_hot)
    return df

x_train = one_hot_encode(x_train, "good_day_(1)")
x_train = one_hot_encode(x_train, "good_day_(2)")
x_test = one_hot_encode(x_test, "good_day_(1)")
x_test = one_hot_encode(x_test, "good_day_(2)")

x_train

<class 'pandas.core.frame.DataFrame'>
Int64Index: 213 entries, 304 to 33
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   course_ML          213 non-null    int64  
 1   course_retrieval   213 non-null    int64  
 2   course_statistics  213 non-null    int64  
 3   course_databases   213 non-null    int64  
 4   gender             213 non-null    int64  
 5   ChatGPT            213 non-null    int64  
 6   age                213 non-null    float64
 7   students_in_room   213 non-null    float64
 8   prev_answer_stand  213 non-null    int64  
 9   stress_level       213 non-null    float64
 10  sports_per_week    213 non-null    float64
 11  random_number      213 non-null    float64
 12  bed_time           213 non-null    float64
 13  good_day_(1)       213 non-null    int64  
 14  good_day_(2)       213 non-null    int64  
dtypes: float64(6), int64(9)
memory usage: 26.6 KB
None
[0.03144201 0.03166581

Unnamed: 0,age,students_in_room,stress_level,sports_per_week,bed_time,good_day_(1)_0,good_day_(1)_1,good_day_(1)_2,good_day_(1)_3,good_day_(1)_4,good_day_(1)_5,good_day_(1)_6,good_day_(2)_0,good_day_(2)_1,good_day_(2)_2,good_day_(2)_3,good_day_(2)_4,good_day_(2)_5,good_day_(2)_6
304,0.285714,0.194476,0.80,0.056604,0.917304,0,0,0,0,0,1,0,0,1,0,0,0,0,0
136,0.095238,0.489068,0.85,0.056604,0.979847,0,0,0,0,1,0,0,0,0,1,0,0,0,0
36,0.238095,0.424626,0.80,0.056604,0.000000,0,0,0,1,0,0,0,0,0,0,1,0,0,0
65,0.095238,0.194476,0.10,0.037736,0.000000,0,0,0,0,1,0,0,0,0,0,1,0,0,0
103,0.238095,0.424626,0.70,0.188679,0.952050,0,0,0,0,1,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,0.047619,0.367089,0.95,0.075472,0.979847,0,1,0,0,0,0,0,1,0,0,0,0,0,0
6,0.142857,0.539701,0.80,0.094340,0.000000,0,0,0,0,0,1,0,0,0,0,0,1,0,0
187,0.142857,0.309551,0.05,0.188679,0.958999,0,1,0,0,0,0,0,0,0,0,0,1,0,0
38,0.190476,0.539701,0.50,0.075472,0.979847,0,0,0,0,1,0,0,0,0,0,0,0,1,0


In [87]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Create KNN model
knn = KNeighborsClassifier()

# Define the hyperparameters to be tuned
param_grid = {
    'n_neighbors': range(1, 20),
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

# Perform a grid search to find the best hyperparameters
grid_search = GridSearchCV(knn, param_grid=param_grid, cv=5)
grid_search.fit(x_train, y_train)

# Print the best hyperparameters and their corresponding accuracy score
print("Best hyperparameters:", grid_search.best_params_)
print("Accuracy score:", grid_search.best_score_)
    
# Evaluate the performance on the test set using the best hyperparameters
y_pred = grid_search.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test set accuracy:", accuracy)



Best hyperparameters: {'n_neighbors': 15, 'p': 2, 'weights': 'uniform'}
Accuracy score: 0.436766334440753
Test set accuracy: 0.41304347826086957
