In [1]:
import pandas as pd
import numpy as np
from os import listdir
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.feature_selection import RFECV
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.preprocessing import MinMaxScaler,Normalizer,StandardScaler,MaxAbsScaler, LabelEncoder

#import graphviz

  from numpy.core.umath_tests import inner1d


In [2]:
# Load the abalone dataset. An earlier approach (kept commented out below)
# label-encoded the `sex` column and treated it as categorical; the active
# code drops the column instead.
df = pd.read_csv("abalone_dataset.csv")
#df['sex'] = LabelEncoder().fit_transform(df['sex'].tolist())
#df['sex'] = df['sex'].astype('category')
df = df.drop(columns = "sex")

In [3]:
# Candidate preprocessing steps (feature scalers / normalisers).
preps = [
    MinMaxScaler,
    Normalizer,
    StandardScaler,
    MaxAbsScaler,
]
# Candidate classifiers, each instantiated with its default hyper-parameters.
models = [
    SVC,
    LogisticRegression,
    MLPClassifier,
    RandomForestClassifier,
    DecisionTreeClassifier,
]
# One two-step pipeline per (classifier, preprocessing) pair. The classifier is
# the outer loop, so pipelines sharing a classifier are adjacent in the list.
pipes = [
    make_pipeline(prep_cls(), model_cls())
    for model_cls in models
    for prep_cls in preps
]

In [4]:
# Score every pipeline with 10-fold cross-validation and keep the median
# accuracy across folds.
# The feature matrix and target are the same for every pipeline, so they are
# built once instead of on each iteration.
X = df.drop(columns="type")
y = df["type"]

results = []
for pipe in pipes:
    fold_scores = cross_validate(pipe, X, y, scoring="accuracy", cv=10)["test_score"]
    # make_pipeline names each step after its lower-cased class name, so the
    # step names identify the preprocessing and the model. Each row is
    # [preprocessing, model, median accuracy].
    step_names = [name for name, _ in pipe.steps]
    results.append(step_names + [np.median(fold_scores)])

In [5]:
# Show the dtype of each remaining column.
df.dtypes

length            float64
diameter          float64
height            float64
whole_weight      float64
shucked_weight    float64
viscera_weight    float64
shell_weight      float64
type                int64
dtype: object

In [6]:
# Tabulate the cross-validation results and rank them, best median accuracy first.
ranking = pd.DataFrame(results, columns=["Preprocessing", "Model", "Median-Accuracy"])
ranking.sort_values(by="Median-Accuracy", ascending=False)

Unnamed: 0,Preprocessing,Model,Median-Accuracy
10,standardscaler,mlpclassifier,0.663473
2,standardscaler,svc,0.656047
6,standardscaler,logisticregression,0.656007
11,maxabsscaler,mlpclassifier,0.644799
8,minmaxscaler,mlpclassifier,0.643207
9,normalizer,mlpclassifier,0.643197
4,minmaxscaler,logisticregression,0.634185
12,minmaxscaler,randomforestclassifier,0.633178
7,maxabsscaler,logisticregression,0.632588
5,normalizer,logisticregression,0.632177


# Pegamos o melhor modelo e pré-processamento para testar no GridSearch

In [7]:
# Pipeline tuned by the grid search: standardise the features, then fit a
# logistic regression.
pipe = make_pipeline(StandardScaler(), LogisticRegression())

# Hyper-parameter grids. Keys use the "<step name>__<parameter>" form so that
# each value is routed to the matching pipeline step.
logparameters = {
    'logisticregression__penalty': ['l2'],
    'logisticregression__solver': ('newton-cg', 'lbfgs', 'sag', 'saga'),
    'logisticregression__C': np.arange(10, 100, 10),
    'logisticregression__multi_class': ['multinomial'],
    'logisticregression__max_iter': [1000],
}
# Grid for a pipeline whose final step is an MLPClassifier. `pipe` above has no
# `mlpclassifier` step, so this grid cannot be used with it as written.
mlpparameters = {
    'mlpclassifier__activation': ['identity', 'logistic', 'tanh', 'relu'],
    'mlpclassifier__solver': ('adam', 'lbfgs', 'sgd'),
    'mlpclassifier__alpha': np.arange(0.1, 1, 0.1),
    'mlpclassifier__learning_rate': ('constant', 'invscaling', 'adaptive'),
    'mlpclassifier__max_iter': np.arange(100, 1000, 100),
}

In [8]:
# Exhaustive grid search with 10-fold cross-validation: every parameter
# combination in `logparameters` is evaluated on `pipe` and ranked by the
# chosen metric (accuracy here). n_jobs=-1 runs the fits on all CPU cores.
clf = GridSearchCV(
    estimator=pipe,
    param_grid=logparameters,
    scoring="accuracy",
    cv=10,
    n_jobs=-1,
)

In [9]:
%%time
#clf.fit(df.drop(columns="type"),df["type"])

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.96 µs


# Seleção de atributos recursivamente
- selecionamos a melhor combinação de hiperparâmetros do modelo com o grid search
- aplicamos a seleção de atributos nesse modelo

In [10]:
# Pull the tuned logistic regression (the second pipeline step) out of the best
# pipeline found by the grid search.
best_pipeline = clf.best_estimator_
log = best_pipeline.named_steps["logisticregression"]

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [None]:
# Recursive feature elimination with 10-fold cross-validation, scored by
# accuracy, wrapped around the tuned logistic regression.
seletor = RFECV(estimator=log, cv=10, scoring="accuracy")

In [None]:
# Fit the selector on every feature column (everything except the `type` target).
seletor.fit(X=df.drop(columns="type"), y=df["type"])

In [None]:
# Names of the features kept by the selector.
feature_names = df.drop(columns="type").columns
feature_names[seletor.get_support()]

In [None]:
# Plot the cross-validated accuracy against the number of features kept.
# Newer scikit-learn releases expose the per-step scores through `cv_results_`
# and no longer provide `grid_scores_`; older releases only have
# `grid_scores_`. Prefer the new attribute and fall back to the old one.
if hasattr(seletor, "cv_results_"):
    mean_scores = seletor.cv_results_["mean_test_score"]
else:
    mean_scores = seletor.grid_scores_

fig, ax = plt.subplots()
ax.set_xlabel("Number of features selected")
ax.set_ylabel("Cross validation score (accuracy)")
# The x axis counts 1..n, which matches the RFECV defaults used to build
# `seletor` (one feature removed per step, down to a single feature).
ax.plot(range(1, len(mean_scores) + 1), mean_scores)
plt.show()

In [None]:
# Gerando vetor resposta pra enviar ao servidor

In [11]:
# Final MLP configuration: one hidden layer of 16 units, tanh activation,
# L-BFGS solver.
mlp = MLPClassifier(
    activation='tanh',
    # Same float that np.arange(0.1, 1, 0.1) produces for its third element;
    # presumably copied from a grid-search result. TODO confirm.
    alpha=0.30000000000000004,
    hidden_layer_sizes=(16,),
    learning_rate='invscaling',
    learning_rate_init=0.001,
    max_iter=100,
    solver='lbfgs',
    # Fixed seed so the weight initialisation, and therefore the scores and
    # the exported predictions, are reproducible between runs.
    random_state=42,
)

In [12]:
# 10-fold cross-validated accuracy of the MLP on the feature columns, with the
# folds evaluated in parallel on all CPU cores.
bla = cross_validate(
    estimator=mlp,
    X=df.drop(columns="type"),
    y=df["type"],
    scoring="accuracy",
    cv=10,
    n_jobs=-1,
)

In [13]:
# Display the dictionary returned by cross_validate.
bla



{'fit_time': array([0.79939818, 0.75017953, 0.80165863, 0.7483623 , 0.79470658,
        0.74921656, 0.91039515, 0.72543001, 0.43985629, 0.50829792]),
 'score_time': array([0.00160551, 0.00317621, 0.00148869, 0.00354409, 0.00202608,
        0.00123692, 0.00111008, 0.00125289, 0.00109529, 0.00108957]),
 'test_score': array([0.66666667, 0.65605096, 0.68152866, 0.66134185, 0.66453674,
        0.63578275, 0.65495208, 0.60702875, 0.68589744, 0.69871795]),
 'train_score': array([0.66347178, 0.66926899, 0.66501065, 0.66761263, 0.66371054,
        0.66442001, 0.66725789, 0.67364314, 0.66489362, 0.66134752])}

In [14]:
# Reload the dataset from disk (the `sex` column is present again) and preview
# the first rows.
df = pd.read_csv("abalone_dataset.csv")
df.head()

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,type
0,2,0.535,0.42,0.15,0.6995,0.2575,0.153,0.24,3
1,1,0.51,0.38,0.115,0.5155,0.215,0.1135,0.166,1
2,1,0.185,0.13,0.045,0.029,0.012,0.0075,0.0095,1
3,2,0.55,0.45,0.17,0.81,0.317,0.157,0.22,3
4,1,0.535,0.415,0.15,0.5765,0.3595,0.135,0.225,1


In [15]:
# Show the dtype of each column of the reloaded frame.
df.dtypes

sex                 int64
length            float64
diameter          float64
height            float64
whole_weight      float64
shucked_weight    float64
viscera_weight    float64
shell_weight      float64
type                int64
dtype: object

In [16]:
# Drop the `sex` column and keep the target column in its own variable.
df = df.drop(columns="sex")
# NOTE(review): this name shadows Python's built-in `type` for the rest of the
# session; a later cell reads it, so it cannot be renamed here in isolation.
type = df["type"]

In [17]:
# Scale every feature into the range [0.1, 0.9]. The fitted `scaler` is used
# again further down on the application data.
features = df.drop(columns="type")
scaler = MinMaxScaler(feature_range=(0.1, 0.9))
# Take the column labels from the frame that was actually scaled rather than
# by position in `df`, and keep its index so later index-based joins line up.
dfscaled = pd.DataFrame(
    scaler.fit_transform(features),
    columns=features.columns,
    index=features.index,
)

In [18]:
# Re-attach the target column to the scaled features, matching rows by index.
# The target is read straight from `df` instead of through a variable named
# `type`, so this cell does not depend on a name that shadows the built-in.
dfscaled = pd.merge(dfscaled, df[["type"]], left_index=True, right_index=True)

In [19]:
# Train the final model on the scaled training features.
mlp.fit(dfscaled.drop(columns="type"), dfscaled["type"])

# Load the application set and drop `sex`, as was done for the training data.
teste = pd.read_csv("abalone_app.csv")
teste = teste.drop(columns="sex")

# Apply the scaler that was fitted on the training data. Re-fitting it here
# would rescale the application rows with their own min/max, so the model
# would see inputs on a different scale from the one it was trained on.
testescaled = scaler.transform(teste)
testescaled = pd.DataFrame(testescaled, columns=teste.columns)

# Write one prediction per row.
# NOTE(review): the default of Series.to_csv's `header` argument differs
# between pandas versions; confirm the file layout matches what the reader of
# respostas.csv expects.
pd.Series(mlp.predict(testescaled)).to_csv("respostas.csv",index=False)

In [20]:
# Read the saved predictions back from disk as a NumPy array.
y_pred = pd.read_csv("respostas.csv").values

In [21]:
# Display the predictions that were read back.
y_pred

array([[2],
       [2],
       [3],
       ...,
       [1],
       [1],
       [2]])