In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import neighbors, model_selection, preprocessing

## Carga de datos

In [3]:
# Location of the semicolon-delimited fake_bills dataset.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs on another machine after pointing it at a local copy of the CSV.
csv_path = 'C:/Users/agust/Desktop/projects/myprojects/KNN/datasets/fake_bills.csv'
df = pd.read_csv(csv_path, sep=';')
df

Unnamed: 0,is_genuine,diagonal,height_left,height_right,margin_low,margin_up,length
0,True,171.81,104.86,104.95,4.52,2.89,112.83
1,True,171.46,103.36,103.66,3.77,2.99,113.09
2,True,172.69,104.48,103.50,4.40,2.94,113.16
3,True,171.36,103.91,103.94,3.62,3.01,113.51
4,True,171.73,104.28,103.46,4.04,3.48,112.54
...,...,...,...,...,...,...,...
1495,False,171.75,104.38,104.17,4.42,3.09,111.28
1496,False,172.19,104.63,104.44,5.27,3.37,110.97
1497,False,171.80,104.01,104.12,5.51,3.36,111.95
1498,False,172.06,104.28,104.06,5.17,3.46,112.25


## Preprocesamiento de datos y análisis de valores faltantes

In [4]:
# Column dtypes and non-null counts are printed first; the summary statistics
# of the numeric columns are then rendered as the cell's output.
df.info()
numeric_summary = df.describe()
numeric_summary

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   is_genuine    1500 non-null   bool   
 1   diagonal      1500 non-null   float64
 2   height_left   1500 non-null   float64
 3   height_right  1500 non-null   float64
 4   margin_low    1463 non-null   float64
 5   margin_up     1500 non-null   float64
 6   length        1500 non-null   float64
dtypes: bool(1), float64(6)
memory usage: 71.9 KB


Unnamed: 0,diagonal,height_left,height_right,margin_low,margin_up,length
count,1500.0,1500.0,1500.0,1463.0,1500.0,1500.0
mean,171.95844,104.029533,103.920307,4.485967,3.151473,112.6785
std,0.305195,0.299462,0.325627,0.663813,0.231813,0.87273
min,171.04,103.14,102.82,2.98,2.27,109.49
25%,171.75,103.82,103.71,4.015,2.99,112.03
50%,171.96,104.04,103.92,4.31,3.14,112.96
75%,172.17,104.23,104.15,4.87,3.31,113.34
max,173.01,104.88,104.95,6.9,3.91,114.44


In [5]:
# Check for missing values: count the NaNs in each column, then add the
# per-column counts together to get the total for the whole frame.
missing_per_column = df.isnull().sum()
missing_per_column.sum()

np.int64(37)

In [6]:
# Rows that contain at least one missing value.
has_missing = df.isnull().any(axis=1)
rows_nan = df.loc[has_missing]
rows_nan

Unnamed: 0,is_genuine,diagonal,height_left,height_right,margin_low,margin_up,length
72,True,171.94,103.89,103.45,,3.25,112.79
99,True,171.93,104.07,104.18,,3.14,113.08
151,True,172.07,103.8,104.38,,3.02,112.93
197,True,171.45,103.66,103.8,,3.62,113.27
241,True,171.83,104.14,104.06,,3.02,112.36
251,True,171.8,103.26,102.82,,2.95,113.22
284,True,171.92,103.83,103.76,,3.23,113.29
334,True,171.85,103.7,103.96,,3.0,113.36
410,True,172.56,103.72,103.51,,3.12,112.95
413,True,172.3,103.66,103.5,,3.16,112.95


In [25]:
# The incomplete rows are a very small share of the frame; show it as a
# percentage of all rows.
missing_pct = len(rows_nan) * 100 / len(df)
print(missing_pct, '%')

# Because so few rows are affected, discard every row that has any missing
# value and continue with the complete rows only.
df = df.dropna(axis=0, how='any')

2.466666666666667 %


In [26]:
# Feature scaling: every column except the 'is_genuine' target is passed
# through preprocessing.scale and written back into df in place.
# NOTE(review): the scaling is computed on the full dataset, before the
# train/test split further down, so the held-out rows influence the scaling.
# Consider fitting the scaler on the training subset only.
is_feature = df.columns != 'is_genuine'
df.iloc[:, is_feature] = preprocessing.scale(df.iloc[:, is_feature])
df

Unnamed: 0,is_genuine,diagonal,height_left,height_right,margin_low,margin_up,length
0,True,-0.488594,2.766807,3.173766,0.051286,-1.136982,0.177842
1,True,-1.634809,-2.241490,-0.806851,-1.078937,-0.704806,0.475692
2,True,2.393318,1.498039,-1.300571,-0.129549,-0.920894,0.555882
3,True,-1.962299,-0.405114,0.057159,-1.304982,-0.618370,0.956834
4,True,-0.750586,0.830266,-1.424001,-0.672057,1.412861,-0.154375
...,...,...,...,...,...,...,...
1495,False,-0.685088,1.164152,0.766882,-0.099410,-0.272629,-1.597801
1496,False,0.755868,1.998868,1.600034,1.181509,0.937466,-1.952929
1497,False,-0.521343,-0.071228,0.612594,1.543181,0.894249,-0.830265
1498,False,0.330131,0.830266,0.427449,1.030813,1.326425,-0.486592


## División y Cross-validation:

In [27]:
# Feature matrix (every column except the target) and target vector.
X = df.drop(columns='is_genuine')
y = df['is_genuine']

In [28]:
# Keep 60% of the rows for training; the hyperparameter search defined below
# is later fitted on that training subset only.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, train_size=0.6, random_state=42
)

# Base model
clf = neighbors.KNeighborsClassifier()

# Hyperparameter space: k from 1 to 30, with uniform or distance weighting
pg = dict(
    n_neighbors=range(1, 31),
    weights=['uniform', 'distance'],
)

# Grid search over the space above; scoring=None and cv=None leave both
# settings at the scikit-learn defaults.
gs = model_selection.GridSearchCV(
    estimator=clf,
    param_grid=pg,
    scoring=None,
    cv=None,
)



In [29]:
# Run the grid search on the training subset.
# fit() returns the fitted search object itself, so `params` is simply
# another name for `gs`.
gs.fit(X_train, y_train)
params = gs

In [32]:
# Best hyperparameter combination and its corresponding score, one per line.
print(params.best_params_, params.best_score_, sep='\n')

{'n_neighbors': 8, 'weights': 'uniform'}
0.9908766233766233


## Modelado y predicciones:

In [34]:
# Build the final model from the hyperparameters selected by the grid search.
# Every selected parameter is forwarded (n_neighbors AND weights), so the
# classifier matches the configuration whose score was reported instead of
# silently falling back to the default weighting.
clf = neighbors.KNeighborsClassifier(**params.best_params_)
clf.fit(X_train, y_train)

In [35]:
# Predicted labels for the held-out test rows.
y_predict = clf.predict(X_test)

# Evaluation: accuracy on the test subset (in line with the cross-validated
# score reported by the grid search).
test_accuracy = clf.score(X_test, y_test)
test_accuracy

0.9897610921501706