In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split


class KNearestNeighbors():
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    There is no fitting step: the training data is stored as given and every
    query computes the distance to every training sample.

    Parameters
    ----------
    X_train : array-like, one row per training sample
    y_train : array-like of non-negative integer class labels
        (non-negative integers are required by ``np.bincount``).
    n_neighbors : int, default 5
        Number of closest training samples that vote for each query.
    weights : {'uniform', 'distance'}, default 'uniform'
        'uniform' gives every neighbour one vote; 'distance' weights each
        vote by the inverse of its distance to the query.
    """

    def __init__(self, X_train, y_train, n_neighbors=5, weights='uniform'):
        self.X_train = X_train
        self.y_train = y_train

        self.n_neighbors = n_neighbors
        self.weights = weights

    def euclidian_distance(self, a, b):
        """Return the Euclidean distance from sample ``a`` to each row of ``b``."""
        return np.sqrt(np.sum((a - b)**2, axis=1))

    def _ranked_neighbors(self, X_test):
        """Return ``(distances, indices)`` of the nearest training samples.

        Both arrays have one row per query sample, ordered from closest to
        farthest and truncated to ``n_neighbors`` columns.
        """
        # np.asarray lets pandas objects and nested lists be used directly.
        X_train = np.asarray(self.X_train)
        dist_rows = [
            self.euclidian_distance(x_test, X_train)
            for x_test in np.asarray(X_test)
        ]
        if not dist_rows:
            # No query samples: keep the historical empty 1-D result.
            return np.array([]), np.array([])

        dist = np.vstack(dist_rows)
        # A stable sort keeps equidistant neighbours in training-set order,
        # matching the tie-breaking of Python's sorted().
        order = np.argsort(dist, axis=1, kind='stable')[:, :self.n_neighbors]
        return np.take_along_axis(dist, order, axis=1), order

    def kneighbors(self, X_test):
        """Return the training-set indices of the nearest neighbours.

        The result has one row per sample in ``X_test``; each row lists up to
        ``n_neighbors`` indices, closest first.
        """
        _, neigh_ind = self._ranked_neighbors(X_test)
        return neigh_ind

    def predict(self, X_test):
        """Predict a class label for every sample in ``X_test``.

        Vote ties are resolved in favour of the smallest label (``np.argmax``
        returns the first maximum).

        Raises
        ------
        ValueError
            If ``self.weights`` is neither 'uniform' nor 'distance'.
        """
        if self.weights not in ('uniform', 'distance'):
            raise ValueError(
                "weights must be 'uniform' or 'distance', got %r"
                % (self.weights,)
            )

        dist, neigh_ind = self._ranked_neighbors(X_test)
        # Positional indexing even when y_train is a pandas Series.
        y_train = np.asarray(self.y_train)

        if self.weights == 'uniform':
            return np.array([
                np.argmax(np.bincount(y_train[neighbor]))
                for neighbor in neigh_ind
            ])

        y_pred = []
        for d_row, neighbor in zip(dist, neigh_ind):
            exact = d_row == 0
            if exact.any():
                # A zero distance would give an infinite weight; let only
                # the exact matches vote instead.
                vote_weights = exact.astype(float)
            else:
                vote_weights = 1.0 / d_row
            votes = np.bincount(y_train[neighbor], weights=vote_weights)
            y_pred.append(np.argmax(votes))
        return np.array(y_pred)

    def score(self, X_test, y_test):
        """Return ``(accuracy, y_pred, y_test)`` for the given test set.

        ``accuracy`` is the fraction of predictions equal to ``y_test``.
        An empty ``y_test`` raises ``ZeroDivisionError``.
        """
        y_pred = self.predict(X_test)
        n_correct = np.count_nonzero(y_pred == np.asarray(y_test))
        return float(n_correct) / float(len(y_test)), y_pred, y_test

# Load the red-wine quality data set from disk.
df_wine = pd.read_csv('datasets/winequality-red.csv')

# Boolean frame flagging every missing cell. Nothing is enforced here:
# inspect mask.sum() manually to see the per-column count of nulls.
mask = df_wine.isna()

# visualizing the values of each feature and the comparison to the output
#for col in df_wine.columns[:-1]:
    #df_wine.plot(x=col, y='quality', kind='scatter', figsize=(8, 5))
#plt.show()

"""
Data preprocessing - outlier treatment
eliminating the rows whose value for a feature falls outside the
[1st, 99th] percentile range of that feature
"""
# Percentile bounds used to trim outliers: for each feature, rows below the
# 1st percentile or above the 99th percentile are dropped (bounds inclusive).
l_limit_perc = 0.01
h_limit_perc = 0.99

# Features to trim, in processing order. The order matters: the filter is
# applied one column at a time, so each column's percentiles are computed
# on the rows that survived the filters of the previous columns.
outlier_columns = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]

for column in outlier_columns:
    low_limit = df_wine[column].quantile(l_limit_perc)
    high_limit = df_wine[column].quantile(h_limit_perc)
    df_wine = df_wine.loc[(df_wine[column] >= low_limit) & (df_wine[column] <= high_limit)]

# visualizing the values of each feature and the comparison to the output
#for col in df_wine.columns[:-1]:
    #df_wine.plot(x=col, y='quality', kind='scatter', figsize=(8, 5))
#plt.show()

# Feature scaling: z-score standardisation, (x - mean) / std, column by
# column. pandas' Series.std() is the sample standard deviation (ddof=1).
# NOTE(review): the mean and std are computed on the whole data set, i.e.
# before the train/test split below, so test rows influence the scaling.
columns_to_standardize = [
    'alcohol',
    'chlorides',
    'citric acid',
    'density',
    'fixed acidity',
    'free sulfur dioxide',
    'pH',
    'residual sugar',
    'sulphates',
    'total sulfur dioxide',
    'volatile acidity',
]

for column in columns_to_standardize:
    df_wine[column] = (df_wine[column]-df_wine[column].mean())/df_wine[column].std()

# Collapse the quality score into a binary target:
#   0 -> bad wine  (quality of 5 or lower)
#   1 -> good wine (quality above 5)
# Both masks are taken from the original scores before anything is
# overwritten, so the second assignment cannot be affected by the first.
is_bad_wine = df_wine['quality'] <= 5
is_good_wine = df_wine['quality'] > 5
df_wine.loc[is_bad_wine, 'quality'] = 0
df_wine.loc[is_good_wine, 'quality'] = 1

# Only displays a preview when this is the last expression of a notebook cell.
df_wine.head(10)

# Build the training and test sets. Each class is split separately with the
# same test fraction and seed, then the per-class parts are stacked.

features = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]

# Rows labelled 0 (bad wine): feature matrix and target column.
is_class_0 = df_wine['quality'] == 0
X0 = df_wine.loc[is_class_0, features]
Y0 = df_wine.loc[is_class_0, 'quality']

# Rows labelled 1 (good wine): feature matrix and target column.
is_class_1 = df_wine['quality'] == 1
X1 = df_wine.loc[is_class_1, features]
Y1 = df_wine.loc[is_class_1, 'quality']

test_percentage = 0.2

X0_train, X0_test, y0_train, y0_test = train_test_split(
    X0, Y0, test_size=test_percentage, random_state=45)
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, Y1, test_size=test_percentage, random_state=45)

# Stack class 0 on top of class 1 (row-wise concatenation).
X_train = pd.concat([X0_train, X1_train])
X_test = pd.concat([X0_test, X1_test])
y_train = pd.concat([y0_train, y1_train])
y_test = pd.concat([y0_test, y1_test])

# The classifier works on plain NumPy arrays, so drop the pandas wrappers.
if isinstance(X_train, pd.DataFrame):
    X_train, X_test = X_train.to_numpy(), X_test.to_numpy()
    y_train, y_test = y_train.to_numpy(), y_test.to_numpy()


# Number of neighbours that vote on each prediction.
k = 3
KNN_classifier = KNearestNeighbors(X_train, y_train, n_neighbors=k)

# score() returns (accuracy, predictions, true labels); y_test is rebound to
# the labels it hands back.
score_result = KNN_classifier.score(X_test, y_test)
KNN_accuracy, y_pred, y_test = score_result

# The printed label is Spanish for "KNN model precision"; the value shown is
# the accuracy returned by score().
print("Precisión del modelo KNN: ", KNN_accuracy)

Precisión del modelo KNN:  0.7228464419475655
