In [1]:
import pandas as pd
import numpy as np
from os import listdir
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.feature_selection import RFECV
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.preprocessing import MinMaxScaler,Normalizer,StandardScaler,MaxAbsScaler, LabelEncoder

#import graphviz

In [2]:
#Transformando a coluna de sexo de testo pra numero e considerando ela como dado categorico
df = pd.read_csv("abalone_dataset.csv")
# df['sex'] = LabelEncoder().fit_transform(df['sex'].tolist())
# df['sex'] = df['sex'].astype('category')
df = df.drop(columns = "sex")

In [3]:
# Preprocessamento da base
preps = [MinMaxScaler,Normalizer,StandardScaler,MaxAbsScaler]
# Modelos a serem testados
models = [SVC,LogisticRegression,MLPClassifier,RandomForestClassifier,DecisionTreeClassifier]
# Pipeline para testar todos os modelos com todos os preprocessamento
pipes = [make_pipeline(prepo(),model()) for model in models for prepo in preps]

In [4]:
# results = []
# for pipe in pipes:
#     res = np.median(cross_validate(pipe,df.drop(columns="type"),df["type"],scoring="accuracy",cv=10)["test_score"])
#     results.append(np.append(np.array(pipe.steps)[:,0],res))

In [5]:
df.dtypes

length            float64
diameter          float64
height            float64
whole_weight      float64
shucked_weight    float64
viscera_weight    float64
shell_weight      float64
type                int64
dtype: object

In [6]:
# pd.DataFrame(results,columns=["Preprocessing","Model","Median-Accuracy"]).sort_values(by="Median-Accuracy",ascending=False)

# Pegamos o melhor modelo e preprocessamento, para testar no GridSearch

In [7]:
# Pipeline para o grid search
pipe = make_pipeline(MinMaxScaler(feature_range=(0.1,1.0)),MLPClassifier())
# Dicionario de parametros a serem testados pelo grid search
logparameters = {'logisticregression__penalty':['l2'], 'logisticregression__solver':('newton-cg', 'lbfgs', 'sag', 'saga'), 'logisticregression__C': (np.arange(10,100,10)), 'logisticregression__multi_class':['multinomial'], 'logisticregression__max_iter':[1000]}
mlpparameters = {'mlpclassifier__activation':['tanh'], 'mlpclassifier__solver': ['lbfgs'], 'mlpclassifier__max_iter': [100], 'mlpclassifier__alpha': [0.3], 'mlpclassifier__hidden_layer_sizes':[(16,8)]}

In [8]:
# GridSearch com cros validation, testa o modelo com todas as combinações de parametros passadas no dicionario,
# e classifica a melhor de acordo com uma metrica que escolhermos, nesse caso a acuracia.
clf = GridSearchCV(pipe,mlpparameters,scoring="accuracy", cv=10, n_jobs=-1)

In [9]:
%%time
clf.fit(df.drop(columns="type"),df["type"])

CPU times: user 1.41 s, sys: 1.49 s, total: 2.91 s
Wall time: 10.5 s


GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0.1, 1.0))), ('mlpclassifier', MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       lea...=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'mlpclassifier__activation': ['tanh'], 'mlpclassifier__solver': ['lbfgs'], 'mlpclassifier__max_iter': [100], 'mlpclassifier__alpha': [0.3], 'mlpclassifier__hidden_layer_sizes': [(16, 8)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

# Seleção de atributos recursivamente
- selecionamos a melhor combinação de hiperparametros do modelo com o grid search
- aplicamos a seleção de atributos nesse modelo

In [10]:
clf.best_score_

0.6590038314176245

In [11]:
clf.best_estimator_

Pipeline(memory=None,
     steps=[('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0.1, 1.0))), ('mlpclassifier', MLPClassifier(activation='tanh', alpha=0.3, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(16, 8), learning_rate='constant',
       learn...True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False))])

In [12]:
#seletor = RFECV(log, cv=10, scoring='accuracy')

In [13]:
#seletor.fit(df.drop(columns="type"),df["type"])

In [14]:
#Atributos selecionados
#df.drop(columns="type").columns[seletor.get_support()]

In [15]:
# plt.figure()
# plt.xlabel("Number of features selected")
# plt.ylabel("Cross validation score (accuracy)")
# plt.plot(range(1, len(seletor.grid_scores_) + 1), seletor.grid_scores_)
# plt.show()

In [16]:
# Gerando vetor resposta pra enviar ao servidor

In [17]:
#mlp = MLPClassifier(activation='tanh', alpha=0.30000000000000004,hidden_layer_sizes=(16,), learning_rate='invscaling',learning_rate_init=0.1, max_iter=100, solver='lbfgs')

In [18]:
#bla = cross_validate(mlp,df.drop(columns="type"),df["type"],scoring="accuracy",cv=10, n_jobs=-1)

In [19]:
# df = pd.read_csv("abalone_dataset.csv")
# df.head()

# df.dtypes

# df = df.drop(columns="sex")
# type = df["type"]

scaler = MinMaxScaler(feature_range=(0.1,1.0))
# dfscaled = scaler.fit_transform(df.drop(columns="type"), df["type"])
# dfscaled = pd.DataFrame(dfscaled, columns=df.columns[:-1])

# dfscaled = pd.merge(dfscaled,pd.DataFrame(type),right_index=True,left_index=True)

In [20]:
#mlp.fit(dfscaled.drop(columns="type"),dfscaled["type"])
teste = pd.read_csv("abalone_app.csv")
# teste['sex'] = LabelEncoder().fit_transform(teste['sex'].tolist())
#teste['sex'] = teste['sex'].astype('category')
teste = teste.drop(columns="sex")
testescaled = scaler.fit_transform(teste)
testescaled = pd.DataFrame(testescaled,columns=teste.columns)
pd.Series(clf.best_estimator_.predict(testescaled)).to_csv("respostas.csv",index=False)

In [21]:
y_pred = pd.read_csv("respostas.csv").values

In [22]:
y_pred

array([[2],
       [2],
       [3],
       ...,
       [2],
       [3],
       [2]])

In [23]:
import requests

print('\n - Lendo o arquivo com o dataset sobre abalone')

# abalone = pd.read_csv('abalone_min_max.csv')

# # Criando X and y par ao algorítmo de aprendizagem de máquina.
# print(' - Criando X e y para o algoritmo de aprendizagem a partir do arquivo')
# X,Y = abalone[abalone.columns[:-1]],abalone[abalone.columns[-1]]
# Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, stratify = Y, random_state=66, test_size=0.10)

# # Ciando o modelo preditivo para a base trabalhada
# print(' - Criando modelo preditivo')
# svm = SVC(kernel='rbf',gamma=5, C=100)
# svm.fit(Xtrain,Ytrain)

# #realizando previsões com o arquivo de
# print(' - Aplicando modelo e enviando para o servidor')
# abalone_app = pd.read_csv('abalone_app_min_max.csv')
# y_pred = svm.predict(abalone_app)
y_pred = pd.read_csv("respostas.csv")

# Enviando previsões realizadas com o modelo para o servidor
URL = "https://aydanomachado.com/mlclass/03_Validation.php"

#TODO Substituir pela sua chave aqui
DEV_KEY = 'Ponte de Safena'

# json para ser enviado para o servidor
y_pred1 =pd.Series(np.array(y_pred).transpose()[0])
data = {'dev_key':DEV_KEY,
        'predictions':y_pred.to_json(orient='values')}

# Enviando requisição e salvando o objeto resposta
r = requests.post(url = URL, data = data)

# Extraindo e imprimindo o texto da resposta
pastebin_url = r.text
print(" - Resposta do servidor:\n", r.text, "\n")


 - Lendo o arquivo com o dataset sobre abalone
 - Resposta do servidor:
 {"error":{"code":102,"message":"Espere ao menos 12 horas entre dois envios, tempo restante 00 dias 00 horas 50 minutos 42 segundos"}} 



In [24]:
y_pred1.to_json(orient='values')

'[2,2,3,3,2,3,2,3,3,3,2,2,3,2,3,2,3,3,2,2,2,2,2,2,2,2,2,3,3,2,2,3,3,2,2,2,3,2,2,2,3,3,3,2,3,3,3,3,2,2,3,3,3,2,3,3,2,3,2,3,3,2,3,2,2,2,2,3,1,2,3,2,2,3,3,3,2,2,2,3,2,2,2,2,3,2,2,2,3,3,2,3,2,2,2,3,2,2,1,2,3,2,3,2,2,3,3,2,2,2,2,2,3,3,2,2,2,2,2,2,2,2,3,3,3,2,3,1,3,2,1,3,2,2,3,2,3,2,2,2,3,2,2,3,2,2,2,1,2,3,3,2,2,2,2,3,2,3,2,2,2,3,2,3,2,3,2,2,2,2,2,3,2,2,2,2,2,2,2,1,2,2,2,2,2,2,3,3,2,2,2,2,3,3,2,2,2,3,1,2,2,1,2,2,1,3,2,2,3,1,2,3,2,3,2,1,2,3,3,3,2,2,2,1,3,2,2,2,3,3,1,3,2,3,2,3,3,2,2,2,3,2,3,3,3,3,2,2,2,2,3,2,2,2,3,2,3,2,3,2,2,2,3,2,2,3,3,2,2,2,2,2,2,2,2,2,3,3,3,2,2,2,2,2,3,2,2,2,2,3,2,2,2,3,2,2,2,2,2,2,2,2,2,3,3,2,3,2,2,2,2,3,3,2,2,2,3,1,3,3,2,2,2,2,3,3,3,2,3,2,3,3,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,1,3,2,3,2,2,3,2,2,2,3,3,3,2,2,3,3,2,2,2,2,3,1,1,3,2,2,2,2,1,3,2,2,2,2,3,3,2,2,2,2,2,2,3,3,3,2,3,2,1,2,2,2,3,2,2,2,2,2,3,3,2,2,2,3,3,3,2,2,2,3,2,2,2,2,3,3,2,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,2,3,2,3,3,3,3,3,3,3,2,2,3,2,3,2,2,2,3,3,3,3,2,2,2,2,2,2,2,1,2,2,3,2,2,3,3,2,2,3,2,3,3,3,2,3,2,2,3,2,2,2,2,2,