In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import recall_score, precision_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_wine

In [2]:
# Load the Wine dataset
wine_data = load_wine()

# Convert to DataFrame
wine_df = pd.DataFrame(wine_data.data, columns=wine_data.feature_names)

# Bind the 'class' (wine target) to the DataFrame
wine_df['class'] = wine_data.target

# Display the DataFrame
wine_df

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,class
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2


In [3]:
num_observations = wine_df.shape[0]
print(num_observations)

178


In [4]:
num_variables = wine_df.shape[1]
print(num_variables)

14


In [5]:
response_variable_type = wine_df['class'].dtype
unique_levels = wine_df['class'].unique()
print(response_variable_type)
print(unique_levels)

int32
[0 1 2]


In [6]:
num_predictors = wine_df.shape[1] - 1  # Subtracting the response variable
print(num_predictors)

13


Why is it important to standardize the predictor variables?

Your answer here: Standardization is important because it ensures that all predictor variables are on a similar scale. This is crucial for distance-based algorithms like KNN, which are sensitive to the scale of the data. Without standardization, predictors with larger scales could dominate the distance calculations.

 Why did we elect not to standardize our response variable Class?

Your answer here: The response variable class is categorical, representing different wine types. Standardizing a categorical variable does not make sense because it does not represent a continuous scale. Instead, we want to keep the unique classes intact for classification.


In [7]:
# Set a random seed
np.random.seed(123)

In [8]:
# Select predictors (excluding the last column)
predictors = wine_df.iloc[:, :-1]

# Standardize the predictors
scaler = StandardScaler()
predictors_standardized = pd.DataFrame(scaler.fit_transform(predictors), columns=predictors.columns)

# Create a random vector of True and False values to split the data
split = np.random.choice([True, False], size=len(predictors_standardized), replace=True, p=[0.75, 0.25])

# Create training and test sets for predictors
X_train = predictors_standardized[split]
X_test = predictors_standardized[~split]

# Create training and test sets for response variable
y_train = wine_df['class'][split]
y_test = wine_df['class'][~split]

In [9]:
# Initialize the KNN classifier
knn = KNeighborsClassifier()
# Define the parameter grid for n_neighbors ranging from 1 to 50
param_grid = {'n_neighbors': np.arange(1, 51)}

# Implement a grid search using GridSearchCV with 10-fold cross-validation
grid_search = GridSearchCV(knn, param_grid, cv=10)
grid_search.fit(X_train, y_train)

# Identify and return the best value for n_neighbors
best_n_neighbors = grid_search.best_params_['n_neighbors']
print(best_n_neighbors)


8


In [10]:
# Fit a KNN model on the training data using the best n_neighbors
best_knn = KNeighborsClassifier(n_neighbors=best_n_neighbors)
best_knn.fit(X_train, y_train)

# Predict on the test set
y_pred = best_knn.predict(X_test)

# Evaluate the model performance using accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.9473684210526315
