# EXERCÍCIO AULA
Inspire-se na secção “Selecting Best Models When Preprocessing” da teórica
e, utilizando o dataset red wine:
1. Descubra a melhor forma de pré-processamento, entre o PCA (com o
parâmetro n_components) e o KernelPCA com o kernel rbf (com os
parâmetros gamma e n_components)
2. Descubra qual é o melhor algoritmo (entre logistic regression e knn) e
sua respetiva parametrização
3. Implemente o algoritmo knn em Python
4. Submeta no blackboard o link do notebook Google-Colab com a
solução do seu grupo

## Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.decomposition import PCA,KernelPCA
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

## Load Dataset

In [2]:
# Seed NumPy's global random generator so later steps that draw from it are repeatable
np.random.seed(0)

# Read the semicolon-separated red wine quality CSV from the working directory
red_wine = pd.read_csv('winequality-red.csv',sep=';')

# Last expression of the cell: render the loaded DataFrame
red_wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [3]:
# Predictors: every column except the 'quality' label
features = red_wine.drop(columns=['quality'])
# Label to predict: the 'quality' column
target = red_wine['quality']

In [4]:
# Hold out 20% of the rows for testing. An explicit random_state makes the split
# reproducible even when this cell is re-run on its own, instead of depending on
# how far NumPy's global random state has advanced since np.random.seed(0).
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=0
)

## Problema 1
### PCA - Logistic Regression

In [5]:
# Preprocessing: concatenate the standardized features with a PCA projection
preprocess = FeatureUnion(transformer_list=[
    ("std", StandardScaler()),
    ("pca", PCA()),
])

# Full pipeline: preprocessing followed by a liblinear logistic regression
pipe = Pipeline(steps=[
    ("preprocess", preprocess),
    ("classifier", LogisticRegression(solver="liblinear", max_iter=500)),
])

# Candidate values: number of principal components, penalty type and C
search_space = [{
    "preprocess__pca__n_components": [1, 2, 3],
    "classifier__penalty": ["l1", "l2"],
    "classifier__C": np.logspace(0, 4, 10),
}]

# 5-fold cross-validated grid search over the pipeline, using all available cores
clf = GridSearchCV(estimator=pipe, param_grid=search_space, cv=5, verbose=0, n_jobs=-1)

# Run the search on the training split, then show every parameter of the best pipeline
best_modelPCA = clf.fit(x_train, y_train)
best_modelPCA.best_estimator_.get_params()

{'memory': None,
 'steps': [('preprocess',
   FeatureUnion(transformer_list=[('std', StandardScaler()),
                                  ('pca', PCA(n_components=3))])),
  ('classifier', LogisticRegression(max_iter=500, solver='liblinear'))],
 'verbose': False,
 'preprocess': FeatureUnion(transformer_list=[('std', StandardScaler()),
                                ('pca', PCA(n_components=3))]),
 'classifier': LogisticRegression(max_iter=500, solver='liblinear'),
 'preprocess__n_jobs': None,
 'preprocess__transformer_list': [('std', StandardScaler()),
  ('pca', PCA(n_components=3))],
 'preprocess__transformer_weights': None,
 'preprocess__verbose': False,
 'preprocess__std': StandardScaler(),
 'preprocess__pca': PCA(n_components=3),
 'preprocess__std__copy': True,
 'preprocess__std__with_mean': True,
 'preprocess__std__with_std': True,
 'preprocess__pca__copy': True,
 'preprocess__pca__iterated_power': 'auto',
 'preprocess__pca__n_components': 3,
 'preprocess__pca__random_state': None

In [6]:
# Report the winning hyperparameters of the PCA + logistic regression search
best_params = best_modelPCA.best_estimator_.get_params()
print('Best Number Of Princpal Components:', best_params['preprocess__pca__n_components'])
print('Best Penalty:', best_params['classifier__penalty'])
print('Best C:', best_params['classifier__C'])

Best Number Of Princpal Components: 3
Best Penalty: l2
Best C: 1.0


In [7]:
# Classify the held-out rows with the fitted search object and tabulate
# the true quality labels against the predicted ones
predictionsPCA = best_modelPCA.predict(x_test)
confusion_matrix(y_true=y_test, y_pred=predictionsPCA)

### KernelPCA - Logistic Regression

In [8]:
# Preprocessing: concatenate the standardized features with an RBF KernelPCA projection
preprocess = FeatureUnion(transformer_list=[
    ("std", StandardScaler()),
    ("kernel_pca", KernelPCA(kernel="rbf")),
])

# Full pipeline: preprocessing followed by a liblinear logistic regression
pipe = Pipeline(steps=[
    ("preprocess", preprocess),
    ("classifier", LogisticRegression(solver="liblinear", max_iter=500)),
])

# Candidate values: KernelPCA components and gamma, penalty type and C
search_space = [{
    "preprocess__kernel_pca__n_components": [1, 2, 3],
    "preprocess__kernel_pca__gamma": np.linspace(0.03, 0.05, 10),
    "classifier__penalty": ["l1", "l2"],
    "classifier__C": np.logspace(0, 4, 10),
}]

# 5-fold cross-validated grid search over the pipeline, using all available cores
clf = GridSearchCV(estimator=pipe, param_grid=search_space, cv=5, verbose=0, n_jobs=-1)

# Run the search on the training split, then show every parameter of the best pipeline
best_modelKPCA = clf.fit(x_train, y_train)
best_modelKPCA.best_estimator_.get_params()

{'memory': None,
 'steps': [('preprocess',
   FeatureUnion(transformer_list=[('std', StandardScaler()),
                                  ('kernel_pca',
                                   KernelPCA(gamma=0.05, kernel='rbf',
                                             n_components=1))])),
  ('classifier',
   LogisticRegression(C=59.94842503189409, max_iter=500, penalty='l1',
                      solver='liblinear'))],
 'verbose': False,
 'preprocess': FeatureUnion(transformer_list=[('std', StandardScaler()),
                                ('kernel_pca',
                                 KernelPCA(gamma=0.05, kernel='rbf',
                                           n_components=1))]),
 'classifier': LogisticRegression(C=59.94842503189409, max_iter=500, penalty='l1',
                    solver='liblinear'),
 'preprocess__n_jobs': None,
 'preprocess__transformer_list': [('std', StandardScaler()),
  ('kernel_pca', KernelPCA(gamma=0.05, kernel='rbf', n_components=1))],
 'preprocess__transf

In [9]:
# Report the winning hyperparameters of the KernelPCA + logistic regression search
best_params = best_modelKPCA.best_estimator_.get_params()
print('Best Number Of Princpal Components:', best_params['preprocess__kernel_pca__n_components'])
print('Best Value of gamma:', best_params['preprocess__kernel_pca__gamma'])
print('Best Penalty:', best_params['classifier__penalty'])
print('Best C:', best_params['classifier__C'])

Best Number Of Princpal Components: 1
Best Value of gamma: 0.05
Best Penalty: l1
Best C: 59.94842503189409


In [10]:
# Classify the held-out rows with the fitted search object and tabulate
# the true quality labels against the predicted ones
predictionsKPCA = best_modelKPCA.predict(x_test)
confusion_matrix(y_true=y_test, y_pred=predictionsKPCA)

## Problema 2
### PCA - K Nearest Neighbors

In [11]:

# Preprocessing: concatenate the standardized features with a PCA projection
preprocess = FeatureUnion(transformer_list=[
    ("std", StandardScaler()),
    ("pca", PCA()),
])

# Full pipeline: preprocessing followed by a k-nearest-neighbours classifier
pipe = Pipeline(steps=[
    ("preprocess", preprocess),
    ("classifier", KNeighborsClassifier(algorithm="auto", leaf_size=50)),
])

# Candidate values: number of principal components, k and the neighbour weighting
search_space = [{
    "preprocess__pca__n_components": [1, 2, 3],
    "classifier__n_neighbors": [2, 4, 8],
    "classifier__weights": ['uniform', 'distance'],
}]

# 5-fold cross-validated grid search over the pipeline, using all available cores
clf = GridSearchCV(estimator=pipe, param_grid=search_space, cv=5, verbose=0, n_jobs=-1)

# Run the search on the training split, then show every parameter of the best pipeline.
# NOTE: this rebinds best_modelPCA, replacing the logistic regression search result.
best_modelPCA = clf.fit(x_train, y_train)
best_modelPCA.best_estimator_.get_params()

{'memory': None,
 'steps': [('preprocess',
   FeatureUnion(transformer_list=[('std', StandardScaler()),
                                  ('pca', PCA(n_components=1))])),
  ('classifier',
   KNeighborsClassifier(leaf_size=50, n_neighbors=8, weights='distance'))],
 'verbose': False,
 'preprocess': FeatureUnion(transformer_list=[('std', StandardScaler()),
                                ('pca', PCA(n_components=1))]),
 'classifier': KNeighborsClassifier(leaf_size=50, n_neighbors=8, weights='distance'),
 'preprocess__n_jobs': None,
 'preprocess__transformer_list': [('std', StandardScaler()),
  ('pca', PCA(n_components=1))],
 'preprocess__transformer_weights': None,
 'preprocess__verbose': False,
 'preprocess__std': StandardScaler(),
 'preprocess__pca': PCA(n_components=1),
 'preprocess__std__copy': True,
 'preprocess__std__with_mean': True,
 'preprocess__std__with_std': True,
 'preprocess__pca__copy': True,
 'preprocess__pca__iterated_power': 'auto',
 'preprocess__pca__n_components': 1,
 

In [12]:
# Report the winning hyperparameters of the PCA + KNN search.
# The labels name the KNN parameters actually printed (n_neighbors and weights).
best_params = best_modelPCA.best_estimator_.get_params()
print('Best Number Of Princpal Components:', best_params['preprocess__pca__n_components'])
print('Best Number Of Neighbors:', best_params['classifier__n_neighbors'])
print('Best Weights:', best_params['classifier__weights'])

Best Number Of Princpal Components: 1
Best Penalty: 8
Best C: distance


In [None]:
# Classify the held-out rows with the fitted search object and tabulate
# the true quality labels against the predicted ones
predictionsPCA = best_modelPCA.predict(x_test)
confusion_matrix(y_true=y_test, y_pred=predictionsPCA)

### Kernel PCA - K Nearest Neighbors

In [13]:
# Preprocessing: concatenate the standardized features with an RBF KernelPCA projection
preprocess = FeatureUnion(transformer_list=[
    ("std", StandardScaler()),
    ("kernel_pca", KernelPCA(kernel="rbf")),
])

# Full pipeline: preprocessing followed by a k-nearest-neighbours classifier
pipe = Pipeline(steps=[
    ("preprocess", preprocess),
    ("classifier", KNeighborsClassifier(algorithm="auto", leaf_size=50)),
])

# Candidate values: KernelPCA components and gamma, k and the neighbour weighting
search_space = [{
    "preprocess__kernel_pca__n_components": [1, 2, 3],
    "preprocess__kernel_pca__gamma": np.linspace(0.03, 0.05, 10),
    "classifier__n_neighbors": [2, 4, 8],
    "classifier__weights": ['uniform', 'distance'],
}]

# 5-fold cross-validated grid search over the pipeline, using all available cores
clf = GridSearchCV(estimator=pipe, param_grid=search_space, cv=5, verbose=0, n_jobs=-1)

# Run the search on the training split, then show every parameter of the best pipeline.
# NOTE: this rebinds best_modelKPCA, replacing the logistic regression search result.
best_modelKPCA = clf.fit(x_train, y_train)
best_modelKPCA.best_estimator_.get_params()

{'memory': None,
 'steps': [('preprocess',
   FeatureUnion(transformer_list=[('std', StandardScaler()),
                                  ('kernel_pca',
                                   KernelPCA(gamma=0.05, kernel='rbf',
                                             n_components=1))])),
  ('classifier',
   KNeighborsClassifier(leaf_size=50, n_neighbors=8, weights='distance'))],
 'verbose': False,
 'preprocess': FeatureUnion(transformer_list=[('std', StandardScaler()),
                                ('kernel_pca',
                                 KernelPCA(gamma=0.05, kernel='rbf',
                                           n_components=1))]),
 'classifier': KNeighborsClassifier(leaf_size=50, n_neighbors=8, weights='distance'),
 'preprocess__n_jobs': None,
 'preprocess__transformer_list': [('std', StandardScaler()),
  ('kernel_pca', KernelPCA(gamma=0.05, kernel='rbf', n_components=1))],
 'preprocess__transformer_weights': None,
 'preprocess__verbose': False,
 'preprocess__std': Stand

In [14]:
# Report the winning hyperparameters of the KernelPCA + KNN search.
# The labels name the KNN parameters actually printed (n_neighbors and weights).
best_params = best_modelKPCA.best_estimator_.get_params()
print('Best Number Of Princpal Components:', best_params['preprocess__kernel_pca__n_components'])
print('Best Value of gamma:', best_params['preprocess__kernel_pca__gamma'])
print('Best Number Of Neighbors:', best_params['classifier__n_neighbors'])
print('Best Weights:', best_params['classifier__weights'])

Best Number Of Princpal Components: 1
Best Value of gamma: 0.05
Best Penalty: 8
Best C: distance


In [None]:
# Classify the held-out rows with the fitted search object and tabulate
# the true quality labels against the predicted ones
predictionsKPCA = best_modelKPCA.predict(x_test)
confusion_matrix(y_true=y_test, y_pred=predictionsKPCA)

## Problema 3 
### Implementação do algoritmo KNN

In [15]:
# Example of getting neighbors for an instance
from math import sqrt

def euclidean_distance(row1, row2):
	"""Return the Euclidean distance between two rows, ignoring the last position.

	Only indices 0 .. len(row1) - 2 are compared; the final element of each
	row is skipped (the callers in this notebook keep the class label there).

	Args:
		row1: Indexable sequence of numbers; its length sets how many positions are compared.
		row2: Indexable sequence of numbers with at least len(row1) - 1 elements.

	Returns:
		The square root of the summed squared differences, as a float.
	"""
	n_compared = len(row1) - 1
	squared_total = 0.0
	for idx in range(n_compared):
		delta = row1[idx] - row2[idx]
		squared_total += delta ** 2
	return sqrt(squared_total)


def get_neighbors(train, test_row, num_neighbors):
	"""Return the num_neighbors rows of train that are closest to test_row.

	Distances come from euclidean_distance(test_row, train_row). Rows with
	equal distance keep their order from train because the sort is stable.

	Args:
		train: Iterable of rows to search.
		test_row: Row to compare every training row against.
		num_neighbors: How many of the closest rows to return.

	Returns:
		A list of rows from train, nearest first.

	Raises:
		IndexError: If num_neighbors exceeds the number of rows in train.
	"""
	scored = [(candidate, euclidean_distance(test_row, candidate)) for candidate in train]
	ranked = sorted(scored, key=lambda pair: pair[1])
	return [ranked[rank][0] for rank in range(num_neighbors)]


def predict_classification(train, test_row, num_neighbors):
	"""Predict the label of test_row by majority vote among its nearest neighbours.

	The label of each neighbour is read from the last element of its row.
	When several labels share the highest vote count, the label of the
	nearest neighbour among them wins, so the result does not depend on
	set iteration order.

	Args:
		train: Iterable of rows whose last element is the class label.
		test_row: Row to classify.
		num_neighbors: Number of nearest rows that take part in the vote.

	Returns:
		The winning label, taken from one of the neighbour rows.

	Raises:
		ValueError: If there are no neighbours to vote (e.g. num_neighbors is 0).
	"""
	neighbors = get_neighbors(train, test_row, num_neighbors)
	# Count votes in neighbour order (nearest first); dict preserves insertion order
	votes = {}
	for row in neighbors:
		label = row[-1]
		votes[label] = votes.get(label, 0) + 1
	# max() returns the first key with the highest count, i.e. the nearest tied label
	prediction = max(votes, key=votes.get)
	return prediction


# Convert the frame to an array once; the last column of each row is the quality label
wine_rows = red_wine.to_numpy()
query_row = wine_rows[151]

# Show the 5 rows closest to record 151. The record itself is among the rows
# being searched, so it is a candidate neighbour too.
neighbors = get_neighbors(wine_rows, query_row, 5)
for neighbor in neighbors:
	print(neighbor)

# Predict the wine quality of record 151 and compare it with the stored label
prediction = predict_classification(wine_rows, query_row, 5)
print('\n Expected %d, Got %d.' % (query_row[-1], prediction))

[ 9.2     0.52    1.      3.4     0.61   32.     69.      0.9996  2.74
  2.      9.4     4.    ]
[9.90000e+00 5.90000e-01 7.00000e-02 3.40000e+00 1.02000e-01 3.20000e+01
 7.10000e+01 1.00015e+00 3.31000e+00 7.10000e-01 9.80000e+00 5.00000e+00]
[9.90000e+00 5.90000e-01 7.00000e-02 3.40000e+00 1.02000e-01 3.20000e+01
 7.10000e+01 1.00015e+00 3.31000e+00 7.10000e-01 9.80000e+00 5.00000e+00]
[ 8.8      0.61     0.19     4.       0.094   30.      69.       0.99787
  3.22     0.5     10.       6.     ]
[ 8.8      0.61     0.19     4.       0.094   30.      69.       0.99787
  3.22     0.5     10.       6.     ]

 Expected 4, Got 5.


In [16]:
# Render the red wine DataFrame again for reference
red_wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5
