In [1]:
import pandas as pd
import numpy as np
from os import listdir
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.feature_selection import RFECV
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.preprocessing import MinMaxScaler,Normalizer,StandardScaler,MaxAbsScaler, LabelEncoder

#import graphviz

  from numpy.core.umath_tests import inner1d


In [2]:
#Transformando a coluna de sexo de testo pra numero e considerando ela como dado categorico
df = pd.read_csv("abalone_dataset.csv")
#df['sex'] = LabelEncoder().fit_transform(df['sex'].tolist())
#df['sex'] = df['sex'].astype('category')
df = df.drop(columns = "sex")

In [3]:
# Preprocessamento da base
preps = [MinMaxScaler,Normalizer,StandardScaler,MaxAbsScaler]
# Modelos a serem testados
models = [SVC,LogisticRegression,MLPClassifier,RandomForestClassifier,DecisionTreeClassifier]
# Pipeline para testar todos os modelos com todos os preprocessamento
pipes = [make_pipeline(prepo(),model()) for model in models for prepo in preps]

In [4]:
results = []
for pipe in pipes:
    res = np.median(cross_validate(pipe,df.drop(columns="type"),df["type"],scoring="accuracy",cv=10)["test_score"])
    results.append(np.append(np.array(pipe.steps)[:,0],res))

In [5]:
df.dtypes

length            float64
diameter          float64
height            float64
whole_weight      float64
shucked_weight    float64
viscera_weight    float64
shell_weight      float64
type                int64
dtype: object

In [6]:
pd.DataFrame(results,columns=["Preprocessing","Model","Median-Accuracy"]).sort_values(by="Median-Accuracy",ascending=False)

Unnamed: 0,Preprocessing,Model,Median-Accuracy
10,standardscaler,mlpclassifier,0.660809
2,standardscaler,svc,0.656047
6,standardscaler,logisticregression,0.656007
8,minmaxscaler,mlpclassifier,0.646497
9,normalizer,mlpclassifier,0.645367
11,maxabsscaler,mlpclassifier,0.642173
13,normalizer,randomforestclassifier,0.635769
4,minmaxscaler,logisticregression,0.634185
7,maxabsscaler,logisticregression,0.632588
5,normalizer,logisticregression,0.632177


# Pegamos o melhor modelo e preprocessamento, para testar no GridSearch

In [7]:
# Pipeline para o grid search
pipe = make_pipeline(StandardScaler(),LogisticRegression())
# Dicionario de parametros a serem testados pelo grid search
logparameters = {'logisticregression__penalty':['l2'], 'logisticregression__solver':('newton-cg', 'lbfgs', 'sag', 'saga'), 'logisticregression__C': (np.arange(10,100,10)), 'logisticregression__multi_class':['multinomial'], 'logisticregression__max_iter':[1000]}
mlpparameters = {'mlpclassifier__activation':['identity','logistic','tanh','relu'], 'mlpclassifier__solver':('adam', 'lbfgs','sgd'), 'mlpclassifier__alpha':(np.arange(0.1,1,0.1)), 'mlpclassifier__learning_rate':('constant','invscaling','adaptive'), 'mlpclassifier__max_iter':(np.arange(100,1000,100))}

In [8]:
# GridSearch com cros validation, testa o modelo com todas as combinações de parametros passadas no dicionario,
# e classifica a melhor de acordo com uma metrica que escolhermos, nesse caso a acuracia.
clf = GridSearchCV(pipe,logparameters,scoring="accuracy", cv=10, n_jobs=-1)

In [9]:
%%time
#clf.fit(df.drop(columns="type"),df["type"])

CPU times: user 6 µs, sys: 0 ns, total: 6 µs
Wall time: 12.4 µs


# Seleção de atributos recursivamente
- selecionamos a melhor combinação de hiperparametros do modelo com o grid search
- aplicamos a seleção de atributos nesse modelo

In [10]:
#log = clf.best_estimator_.steps[1][1]

In [11]:
#seletor = RFECV(log, cv=10, scoring='accuracy')

In [12]:
#seletor.fit(df.drop(columns="type"),df["type"])

In [13]:
#Atributos selecionados
#df.drop(columns="type").columns[seletor.get_support()]

In [14]:
# plt.figure()
# plt.xlabel("Number of features selected")
# plt.ylabel("Cross validation score (accuracy)")
# plt.plot(range(1, len(seletor.grid_scores_) + 1), seletor.grid_scores_)
# plt.show()

In [15]:
# Gerando vetor resposta pra enviar ao servidor

In [16]:
mlp = MLPClassifier(activation='tanh', alpha=0.30000000000000004,hidden_layer_sizes=(16,), learning_rate='invscaling',learning_rate_init=0.001, max_iter=100, solver='lbfgs')

In [17]:
bla = cross_validate(mlp,df.drop(columns="type"),df["type"],scoring="accuracy",cv=10, n_jobs=-1)

In [18]:
bla



{'fit_time': array([0.77127862, 0.82356572, 0.7920053 , 0.82797742, 0.81627989,
        0.66173005, 0.6902442 , 0.76810503, 0.42508197, 0.43153501]),
 'score_time': array([0.00817609, 0.00139642, 0.00130033, 0.01150179, 0.00128698,
        0.00120401, 0.00122333, 0.00118756, 0.00121856, 0.00082231]),
 'test_score': array([0.66666667, 0.64649682, 0.69745223, 0.66453674, 0.68690096,
        0.65495208, 0.64217252, 0.61022364, 0.68589744, 0.68269231]),
 'train_score': array([0.66205183, 0.6682044 , 0.65649397, 0.66690316, 0.66193686,
        0.66796736, 0.66938631, 0.67577155, 0.65957447, 0.66276596])}

In [19]:
df = pd.read_csv("abalone_dataset.csv")
df.head()

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,type
0,2,0.535,0.42,0.15,0.6995,0.2575,0.153,0.24,3
1,1,0.51,0.38,0.115,0.5155,0.215,0.1135,0.166,1
2,1,0.185,0.13,0.045,0.029,0.012,0.0075,0.0095,1
3,2,0.55,0.45,0.17,0.81,0.317,0.157,0.22,3
4,1,0.535,0.415,0.15,0.5765,0.3595,0.135,0.225,1


In [20]:
df.dtypes

sex                 int64
length            float64
diameter          float64
height            float64
whole_weight      float64
shucked_weight    float64
viscera_weight    float64
shell_weight      float64
type                int64
dtype: object

In [21]:
df = df.drop(columns="sex")
type = df["type"]

In [22]:
scaler = MinMaxScaler(feature_range=(0.1,0.9))
dfscaled = scaler.fit_transform(df.drop(columns="type"), df["type"])
dfscaled = pd.DataFrame(dfscaled, columns=df.columns[:-1])

In [23]:
dfscaled = pd.merge(dfscaled,pd.DataFrame(type),right_index=True,left_index=True)

In [24]:
mlp.fit(dfscaled.drop(columns="type"),dfscaled["type"])
teste = pd.read_csv("abalone_app.csv")
#teste['sex'] = LabelEncoder().fit_transform(teste['sex'].tolist())
#teste['sex'] = teste['sex'].astype('category')
teste = teste.drop(columns="sex")
testescaled = scaler.fit_transform(teste)
testescaled = pd.DataFrame(testescaled,columns=teste.columns)
pd.Series(mlp.predict(testescaled)).to_csv("respostas.csv",index=False)

In [25]:
dfscaled.describe()

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,type
count,3132.0,3132.0,3132.0,3132.0,3132.0,3132.0,3132.0,3132.0
mean,0.582586,0.571752,0.314778,0.331411,0.290665,0.287333,0.28664,1.991379
std,0.130547,0.133915,0.060903,0.13871,0.119152,0.115395,0.110983,0.824561
min,0.1,0.1,0.1,0.1,0.1,0.1,0.1,1.0
25%,0.505405,0.489916,0.270874,0.223074,0.197108,0.195194,0.200448,1.0
50%,0.602703,0.590756,0.317476,0.322419,0.27727,0.276432,0.278176,2.0
75%,0.678378,0.671429,0.356311,0.422897,0.367115,0.362936,0.356801,3.0
max,0.9,0.9,0.9,0.9,0.9,0.9,0.9,3.0


In [26]:
testescaled.describe()

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight
count,1045.0,1045.0,1045.0,1045.0,1045.0,1045.0,1045.0
mean,0.591616,0.575028,0.188845,0.365109,0.354826,0.368113,0.321333
std,0.155763,0.158308,0.035081,0.155107,0.156292,0.160694,0.126171
min,0.1,0.1,0.1,0.1,0.1,0.1,0.1
25%,0.496694,0.479798,0.172072,0.240954,0.230998,0.241522,0.219545
50%,0.615702,0.60101,0.19009,0.355893,0.337478,0.355179,0.318182
75%,0.708264,0.689899,0.208108,0.467521,0.458319,0.475435,0.407273
max,0.9,0.9,0.9,0.9,0.9,0.9,0.9


In [27]:
y_pred = pd.read_csv("respostas.csv").values

In [28]:
y_pred

array([[1],
       [2],
       [2],
       ...,
       [1],
       [1],
       [2]])

In [29]:
import requests

print('\n - Lendo o arquivo com o dataset sobre abalone')

# abalone = pd.read_csv('abalone_min_max.csv')

# # Criando X and y par ao algorítmo de aprendizagem de máquina.
# print(' - Criando X e y para o algoritmo de aprendizagem a partir do arquivo')
# X,Y = abalone[abalone.columns[:-1]],abalone[abalone.columns[-1]]
# Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, stratify = Y, random_state=66, test_size=0.10)

# # Ciando o modelo preditivo para a base trabalhada
# print(' - Criando modelo preditivo')
# svm = SVC(kernel='rbf',gamma=5, C=100)
# svm.fit(Xtrain,Ytrain)

# #realizando previsões com o arquivo de
# print(' - Aplicando modelo e enviando para o servidor')
# abalone_app = pd.read_csv('abalone_app_min_max.csv')
# y_pred = svm.predict(abalone_app)
y_pred = pd.read_csv("respostas.csv")

# Enviando previsões realizadas com o modelo para o servidor
URL = "https://aydanomachado.com/mlclass/03_Validation.php"

#TODO Substituir pela sua chave aqui
DEV_KEY = 'Ponte de Safena'

# json para ser enviado para o servidor
y_pred1 =pd.Series(np.array(y_pred).transpose()[0])
data = {'dev_key':DEV_KEY,
        'predictions':y_pred.to_json(orient='values')}



 - Lendo o arquivo com o dataset sobre abalone


In [30]:
y_pred1

0       1
1       2
2       2
3       2
4       1
5       1
6       2
7       2
8       3
9       2
10      2
11      2
12      1
13      1
14      3
15      1
16      1
17      2
18      2
19      1
20      1
21      1
22      1
23      2
24      1
25      2
26      2
27      1
28      3
29      2
       ..
1014    2
1015    3
1016    2
1017    1
1018    3
1019    1
1020    1
1021    2
1022    3
1023    2
1024    1
1025    1
1026    1
1027    1
1028    1
1029    2
1030    2
1031    2
1032    1
1033    3
1034    1
1035    1
1036    2
1037    1
1038    1
1039    1
1040    1
1041    1
1042    1
1043    2
Length: 1044, dtype: int64

In [31]:
y_pred1.to_json(orient='values')

'[1,2,2,2,1,1,2,2,3,2,2,2,1,1,3,1,1,2,2,1,1,1,1,2,1,2,2,1,3,2,1,1,3,1,1,1,3,2,1,1,1,3,1,2,3,3,3,2,1,1,3,3,3,2,3,2,1,1,1,2,3,2,2,1,1,2,2,2,1,1,3,1,1,1,1,1,1,1,1,2,1,1,2,1,2,1,1,1,1,1,1,3,1,2,1,2,2,1,1,2,1,1,2,2,1,1,3,2,2,1,2,1,2,1,2,1,1,1,1,1,1,2,3,1,2,1,1,1,2,2,1,2,1,1,1,2,2,1,1,1,3,1,1,1,1,1,1,1,1,3,1,1,1,1,2,2,1,1,1,1,2,3,1,3,1,3,2,1,1,2,1,1,2,2,2,2,1,1,1,1,1,2,1,1,2,1,3,1,2,2,2,1,3,3,1,1,2,3,1,1,1,1,1,1,1,1,1,2,2,1,1,2,1,3,2,1,1,2,1,3,1,1,1,1,2,1,2,1,2,1,1,2,1,1,1,2,1,1,1,2,3,1,3,1,1,3,2,1,1,1,3,2,1,1,2,1,3,2,1,2,1,2,2,1,2,2,3,1,1,2,1,1,2,1,1,1,2,1,1,2,1,1,1,1,2,2,1,2,1,2,1,1,1,2,2,1,1,1,2,1,1,1,1,3,2,1,2,2,2,1,2,1,3,1,2,2,1,1,2,2,1,1,1,2,2,1,1,1,3,1,3,1,1,2,1,1,1,1,1,1,2,2,1,1,2,1,1,1,1,1,3,1,2,2,2,2,2,1,2,3,1,2,1,2,1,1,2,2,3,1,1,3,1,2,2,1,1,1,1,1,2,1,2,3,1,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,2,2,1,2,1,3,1,2,1,1,2,1,3,1,2,2,1,1,1,2,1,3,3,1,2,1,1,2,2,1,1,1,1,1,1,1,2,1,2,1,2,1,1,2,1,1,1,1,2,1,2,3,2,1,1,1,1,1,1,2,2,1,2,2,1,2,2,2,1,1,2,2,1,1,2,1,2,1,2,2,1,1,1,1,1,1,2,3,1,3,1,2,3,2,2,1,2,2,