In [22]:
import pandas as pd
import numpy as np
from os import listdir
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.feature_selection import RFECV
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.preprocessing import MinMaxScaler,Normalizer,StandardScaler,MaxAbsScaler, LabelEncoder

#import graphviz

In [23]:
#Transformando a coluna de sexo de testo pra numero e considerando ela como dado categorico
df = pd.read_csv("abalone_dataset.csv")
#df['sex'] = LabelEncoder().fit_transform(df['sex'].tolist())
#df['sex'] = df['sex'].astype('category')
df = df.drop(columns = "sex")

In [24]:
# Preprocessamento da base
preps = [MinMaxScaler,Normalizer,StandardScaler,MaxAbsScaler]
# Modelos a serem testados
models = [SVC,LogisticRegression,MLPClassifier,RandomForestClassifier,DecisionTreeClassifier]
# Pipeline para testar todos os modelos com todos os preprocessamento
pipes = [make_pipeline(prepo(),model()) for model in models for prepo in preps]

In [25]:
results = []
for pipe in pipes:
    res = np.median(cross_validate(pipe,df.drop(columns="type"),df["type"],scoring="accuracy",cv=10)["test_score"])
    results.append(np.append(np.array(pipe.steps)[:,0],res))











In [26]:
df.dtypes

length            float64
diameter          float64
height            float64
whole_weight      float64
shucked_weight    float64
viscera_weight    float64
shell_weight      float64
type                int64
dtype: object

In [27]:
pd.DataFrame(results,columns=["Preprocessing","Model","Median-Accuracy"]).sort_values(by="Median-Accuracy",ascending=False)

Unnamed: 0,Preprocessing,Model,Median-Accuracy
10,standardscaler,mlpclassifier,0.674077
11,maxabsscaler,mlpclassifier,0.656561
2,standardscaler,svc,0.656047
6,standardscaler,logisticregression,0.656007
9,normalizer,mlpclassifier,0.652863
8,minmaxscaler,mlpclassifier,0.647524
4,minmaxscaler,logisticregression,0.634185
7,maxabsscaler,logisticregression,0.632588
5,normalizer,logisticregression,0.632177
13,normalizer,randomforestclassifier,0.630407


# Pegamos o melhor modelo e preprocessamento, para testar no GridSearch

In [28]:
# Pipeline para o grid search
pipe = make_pipeline(StandardScaler(),MLPClassifier())
# Dicionario de parametros a serem testados pelo grid search
logparameters = {'logisticregression__penalty':['l2'], 'logisticregression__solver':('newton-cg', 'lbfgs', 'sag', 'saga'), 'logisticregression__C': (np.arange(10,100,10)), 'logisticregression__multi_class':['multinomial'], 'logisticregression__max_iter':[1000]}
mlpparameters = {'mlpclassifier__activation':['identity','logistic','tanh','relu'], 'mlpclassifier__solver':('adam', 'lbfgs','sgd'), 'mlpclassifier__alpha':(np.arange(0.1,1,0.1)), 'mlpclassifier__learning_rate':('constant','invscaling','adaptive'), 'mlpclassifier__max_iter':(np.arange(100,1000,100))}

In [29]:
# GridSearch com cros validation, testa o modelo com todas as combinações de parametros passadas no dicionario,
# e classifica a melhor de acordo com uma metrica que escolhermos, nesse caso a acuracia.
clf = GridSearchCV(pipe,mlpparameters,scoring="accuracy", cv=10, n_jobs=-1)

In [30]:
%%time
clf.fit(df.drop(columns="type"),df["type"])



CPU times: user 1min 37s, sys: 3.47 s, total: 1min 41s
Wall time: 3h 51min 24s


GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('mlpclassifier', MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
 ...=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'mlpclassifier__activation': ['identity', 'logistic', 'tanh', 'relu'], 'mlpclassifier__solver': ('adam', 'lbfgs', 'sgd'), 'mlpclassifier__alpha': array([ 0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9]), 'mlpclassifier__learning_rate': ('constant', 'invscaling', 'adaptive'), 'mlpclassifier__max_iter': array([100, 200, 300, 400, 500, 600, 700, 800, 900])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn

# Seleção de atributos recursivamente
- selecionamos a melhor combinação de hiperparametros do modelo com o grid search
- aplicamos a seleção de atributos nesse modelo

In [31]:
log = clf.best_estimator_.steps[1][1]

In [36]:
log

MLPClassifier(activation='tanh', alpha=0.30000000000000004, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='invscaling',
       learning_rate_init=0.001, max_iter=100, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [32]:
seletor = RFECV(log, cv=10, scoring='accuracy')

In [33]:
seletor.fit(df.drop(columns="type"),df["type"])

RuntimeError: The classifier does not expose "coef_" or "feature_importances_" attributes

In [None]:
#Atributos selecionados
df.drop(columns="type").columns[seletor.get_support()]

In [None]:
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (accuracy)")
plt.plot(range(1, len(seletor.grid_scores_) + 1), seletor.grid_scores_)
plt.show()

In [None]:
# Gerando vetor resposta pra enviar ao servidor

In [None]:
teste = pd.read_csv("abalone_app.csv")
#teste['sex'] = LabelEncoder().fit_transform(teste['sex'].tolist())
#teste['sex'] = teste['sex'].astype('category')
teste = teste.drop(columns="sex")
base = teste[teste.columns[seletor.get_support()]]

In [None]:
base

In [None]:
pd.Series(seletor.estimator_.predict(base)).to_csv("respostas.csv",index=False)