In [1]:
import random

import altair as alt
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import euclidean_distances
from sklearn import set_config
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_validate,
    train_test_split,
)

# Have scikit-learn transformers return dataframes rather than arrays
set_config(transform_output="pandas")

# Lift Altair's default row limit so large dataframes can be charted
alt.data_transformers.disable_max_rows()

In [2]:
pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.


In [3]:
from ucimlrepo import fetch_ucirepo

# Download the HTRU2 dataset (UCI repository id 372).
htru2 = fetch_ucirepo(id=372)

# Feature and target tables (as pandas dataframes).
htru2_tables = htru2.data
X, y = htru2_tables.features, htru2_tables.targets

# Print the dataset metadata first, then the per-variable information.
for description in (htru2.metadata, htru2.variables):
    print(description)

{'uci_id': 372, 'name': 'HTRU2', 'repository_url': 'https://archive.ics.uci.edu/dataset/372/htru2', 'data_url': 'https://archive.ics.uci.edu/static/public/372/data.csv', 'abstract': 'Pulsar candidates collected during the HTRU survey. Pulsars are a type of star, of considerable scientific interest. Candidates must be classified in to pulsar and non-pulsar classes to aid discovery.', 'area': 'Physics and Chemistry', 'tasks': ['Classification', 'Clustering'], 'characteristics': ['Multivariate'], 'num_instances': 17898, 'num_features': 8, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2015, 'last_updated': 'Wed Apr 03 2024', 'dataset_doi': '10.24432/C5DK6R', 'creators': ['Robert Lyon'], 'intro_paper': {'title': 'Fifty years of pulsar candidate selection: from simple filters to a new principled real-time classification approach', 'authors': 'R. Lyon, B. Stapper

In [4]:
# Attach the target to the features by aligning both tables on their row
# index, producing one table that holds the predictors plus `class`.
star_data = X.merge(y, left_index=True, right_index=True)
star_data

Unnamed: 0,Profile_mean,Profile_stdev,Profile_skewness,Profile_kurtosis,DM_mean,DM_stdev,DM_skewness,DM_kurtosis,class
0,140.562500,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.882430,0.465318,-0.515088,1.677258,14.860146,10.576487,127.393580,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.750000,57.178449,-0.068415,-0.636238,3.642977,20.959280,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.178930,11.468720,14.269573,252.567306,0
...,...,...,...,...,...,...,...,...,...
17893,136.429688,59.847421,-0.187846,-0.738123,1.296823,12.166062,15.450260,285.931022,0
17894,122.554688,49.485605,0.127978,0.323061,16.409699,44.626893,2.945244,8.297092,0
17895,119.335938,59.935939,0.159363,-0.743025,21.430602,58.872000,2.499517,4.595173,0
17896,114.507812,53.902400,0.201161,-0.024789,1.946488,13.381731,10.007967,134.238910,0


In [5]:
# A single seed is used both for the global NumPy RNG (kept for any code that
# still draws from it) and for the split itself.
SPLIT_SEED = 2024
np.random.seed(SPLIT_SEED)

# Keep 80% of the rows for training and hold out the rest for testing.
# Passing `random_state` explicitly makes the split reproducible on its own,
# instead of depending on whatever state the global NumPy RNG happens to be
# in when this cell runs (e.g. after re-running cells out of order).
# NOTE(review): the `class` column looks heavily imbalanced in the previews;
# consider `stratify=star_data["class"]` -- not applied here because it would
# change which rows land in each set.
star_train, star_test = train_test_split(
    star_data,
    train_size=0.80,
    random_state=SPLIT_SEED,
)
star_train

Unnamed: 0,Profile_mean,Profile_stdev,Profile_skewness,Profile_kurtosis,DM_mean,DM_stdev,DM_skewness,DM_kurtosis,class
14117,89.726562,42.207026,0.414897,0.904615,2.403846,13.488690,9.129472,121.734406,0
3787,116.960938,73.153990,0.316964,-1.166485,201.534281,47.262251,-1.704449,2.788871,0
765,136.664062,52.628307,0.067857,-0.492523,4.382107,24.805369,6.734455,49.054326,0
9684,116.046875,43.673837,0.330900,0.308270,3.107023,20.456870,7.857742,69.309930,0
8149,118.843750,48.989865,0.335433,0.134276,0.775084,11.946176,17.115458,312.061142,0
...,...,...,...,...,...,...,...,...,...
16567,128.125000,48.357555,-0.005863,-0.100869,2.019231,15.440807,10.652478,137.918490,0
2494,135.554688,41.715706,-0.047587,0.910534,2.617893,17.409786,8.831467,89.780556,0
14875,124.976562,45.683946,-0.209657,0.109942,4.981605,26.593647,5.563959,31.319089,0
2688,125.304688,49.947873,0.053109,-0.068939,1.628763,12.247147,12.262394,195.921439,0


In [6]:
# The eight predictor columns; all of them are standardized.
feature_columns = [
    "Profile_mean",
    "Profile_stdev",
    "Profile_skewness",
    "Profile_kurtosis",
    "DM_mean",
    "DM_stdev",
    "DM_skewness",
    "DM_kurtosis",
]

# verbose_feature_names_out=False keeps the original column names in the
# transformed output instead of prefixing them with the transformer name.
star_preprocessor = make_column_transformer(
    (StandardScaler(), feature_columns),
    verbose_feature_names_out=False,
)
star_preprocessor

In [7]:
# Preprocessing followed by a K-nearest-neighbors classifier.
star_pipe = make_pipeline(star_preprocessor, KNeighborsClassifier())

# Candidate numbers of neighbors to try: 2 through 14. The key addresses the
# classifier step by the name make_pipeline generates for it.
param_grid = {"kneighborsclassifier__n_neighbors": range(2, 15)}

In [8]:
# Grid search over `param_grid` using 5-fold cross-validation, running on all
# available cores and also recording the scores on the training folds.
knn_star_grid = GridSearchCV(
    star_pipe,
    param_grid,
    return_train_score=True,
    n_jobs=-1,
    cv=5,
)
knn_star_grid

In [9]:
# Split the training set into predictors and target.
# NOTE(review): this rebinds `X` and `y`, which the data-loading cell used for
# the full dataset; re-running earlier cells after this one will therefore
# see the training subset. Consider dedicated names -- confirm no later cell
# relies on `X`/`y` before renaming.
X, y = star_train.drop(columns="class"), star_train["class"]

# Run the cross-validated grid search on the training data.
knn_model_grid = knn_star_grid.fit(X, y)

# Tabulate the per-candidate cross-validation results.
accuracies_grid = pd.DataFrame(knn_model_grid.cv_results_)
accuracies_grid

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kneighborsclassifier__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.014046,0.00094,0.139812,0.0069,2,{'kneighborsclassifier__n_neighbors': 2},0.975908,0.974511,0.978352,0.975899,...,0.975346,0.002058,13,0.984983,0.984721,0.983761,0.983763,0.985072,0.98446,0.000582
1,0.01347,3.8e-05,0.141325,0.004249,3,{'kneighborsclassifier__n_neighbors': 3},0.97905,0.976257,0.979749,0.977995,...,0.977441,0.002022,8,0.983325,0.98446,0.98315,0.983501,0.984286,0.983744,0.000528
2,0.013593,0.000192,0.142344,0.003425,4,{'kneighborsclassifier__n_neighbors': 4},0.978003,0.976257,0.976955,0.977297,...,0.976812,0.000845,12,0.980967,0.980793,0.980094,0.980533,0.982191,0.980916,0.000702
3,0.013663,0.000475,0.144449,0.004101,5,{'kneighborsclassifier__n_neighbors': 5},0.97905,0.976257,0.97905,0.977646,...,0.97772,0.001178,3,0.981404,0.981578,0.981142,0.981144,0.982191,0.981492,0.000387
4,0.013484,0.000113,0.148571,0.004949,6,{'kneighborsclassifier__n_neighbors': 6},0.978701,0.97486,0.977654,0.977297,...,0.977162,0.001261,10,0.980007,0.980444,0.979396,0.979572,0.980358,0.979955,0.000415
5,0.013538,0.000157,0.148259,0.004146,7,{'kneighborsclassifier__n_neighbors': 7},0.980098,0.975559,0.978352,0.977646,...,0.977511,0.001663,7,0.979745,0.980793,0.979483,0.979398,0.980882,0.98006,0.000645
6,0.013471,0.000142,0.14842,0.004103,8,{'kneighborsclassifier__n_neighbors': 8},0.978352,0.97486,0.977654,0.978694,...,0.977022,0.001536,11,0.978959,0.980007,0.978697,0.978961,0.979572,0.979239,0.00048
7,0.013792,0.000407,0.150735,0.00433,9,{'kneighborsclassifier__n_neighbors': 9},0.978003,0.976606,0.97905,0.978344,...,0.97765,0.00106,4,0.979134,0.979832,0.978785,0.97931,0.979398,0.979292,0.000342
8,0.013232,0.000102,0.14757,0.004766,10,{'kneighborsclassifier__n_neighbors': 10},0.978352,0.975908,0.97905,0.977995,...,0.977581,0.001157,5,0.97861,0.979396,0.978086,0.978787,0.978437,0.978663,0.000433
9,0.01327,6.1e-05,0.149085,0.003169,11,{'kneighborsclassifier__n_neighbors': 11},0.978352,0.976257,0.97905,0.978344,...,0.97772,0.001092,2,0.978697,0.979396,0.978086,0.979136,0.978699,0.978803,0.000447


In [10]:
# Horizontal channel: the number of neighbors tried for each candidate.
neighbors_axis = (
    alt.X("param_kneighborsclassifier__n_neighbors")
    .title("Neighbors")
    .scale(zero=False)
)

# Vertical channel: mean cross-validated accuracy. Neither axis is forced to
# start at zero.
accuracy_axis = (
    alt.Y("mean_test_score")
    .title("Accuracy estimate")
    .scale(zero=False)
)

# Line chart with point markers showing the accuracy estimate against K.
accuracy_versus_k_grid = (
    alt.Chart(accuracies_grid)
    .mark_line(point=True)
    .encode(x=neighbors_axis, y=accuracy_axis)
    .properties(width=800)
)

accuracy_versus_k_grid