In [1]:
from sklearn.datasets.samples_generator import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import SGDRegressor
from scipy.spatial.distance import cdist
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn import metrics
import pandas as pd
import numpy as np

# Classificador não supervisionado K-Means - Iris.csv

In [24]:
df = pd.read_csv("iris.csv", sep=",")
df.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [25]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [26]:
df.species.unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

### Kmeans

In [28]:
X = df.drop(["species"], axis=1)
kmeans = KMeans(n_clusters = 3)
kmeans.fit(X)
y_kmeans = kmeans.predict(X)
df['kmeans'] = y_kmeans
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,kmeans
0,5.1,3.5,1.4,0.2,setosa,1
1,4.9,3.0,1.4,0.2,setosa,1
2,4.7,3.2,1.3,0.2,setosa,1
3,4.6,3.1,1.5,0.2,setosa,1
4,5.0,3.6,1.4,0.2,setosa,1
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,2
146,6.3,2.5,5.0,1.9,virginica,0
147,6.5,3.0,5.2,2.0,virginica,2
148,6.2,3.4,5.4,2.3,virginica,2


In [31]:
df.loc[(df.species=="virginica")&(df.kmeans==0)];

In [None]:
virginica=2 setosa=1, versicolor=0

In [39]:
df.loc[(df.species=="virginica"),"species"]=2
df.loc[(df.species=="setosa"),"species"]=1
df.loc[(df.species=="versicolor"),"species"]=0

### Acertos

In [40]:
len(df[df.species==df.kmeans])/150

0.8933333333333333

# Classificador Supervisionado

In [44]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,kmeans
0,5.1,3.5,1.4,0.2,1,1
1,4.9,3.0,1.4,0.2,1,1
2,4.7,3.2,1.3,0.2,1,1
3,4.6,3.1,1.5,0.2,1,1
4,5.0,3.6,1.4,0.2,1,1


In [45]:
df = df.drop(["kmeans"],axis=1)

In [46]:
def calc_rss(y, predicted):
    return float(((predicted - y) ** 2).sum())

def calc_r2(y, predicted):
    return r2_score(predicted,y) 

In [47]:
X = df.drop(["species"], axis=1)
y = df["species"]

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, shuffle=True)

In [48]:
pknn = Pipeline([('scl', StandardScaler()), ('rgs', KNeighborsRegressor(n_neighbors=5))])
ptree = Pipeline([('scl', StandardScaler()), ('rgs', DecisionTreeRegressor())])
psgdr = Pipeline([('scl', StandardScaler()), ('rgs', SGDRegressor())])

pipelines = [pknn, ptree, psgdr]

pipe_names = {0:"KNN", 1:"Árvore de Decisão", 2:"SGDR"}

for pipe in pipelines:
    pipe.fit(X_train, y_train)

print("    Test Accuracy\n")
scores = []
for idx, val in enumerate(pipelines):
    sc = val.score(X_test, y_test)
    scores.append(sc)
    print("%s: %.3f" % (pipe_names[idx], sc))
    
max_ind = scores.index(max(scores))
print("\nRegression with best accuracy: ", pipe_names[max_ind])

    Test Accuracy

KNN: 0.743
Árvore de Decisão: 0.714
SGDR: 0.229

Regression with best accuracy:  KNN


In [49]:
pred_test = pipelines[max_ind].predict(X_test)

print("RSS = ", calc_rss(y_test, pred_test))
print("R2 = ", calc_r2(y_test, pred_test))

RSS =  7.200000000000001
R2 =  0.6288489736070381
