# Recomendação de Parâmetros em Workflows Científicos

## *KNN Recommender*

### 1 - Lendos os dados

In [116]:
import pandas as pd
#lendo dados originais
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,num_aligns,length,model1,prob1,model2,prob2,erro
0,10.0,237.0,WAG+G,1588.458801,WAG+G,1588.458801,False
1,10.0,237.0,WAG+G,1588.458801,WAG+G,1588.458801,False
2,10.0,237.0,WAG+G,1588.458801,WAG+G,1588.458801,False
3,10.0,237.0,WAG+G,1588.458801,WAG+G,1588.458801,False
4,10.0,237.0,WAG+G,1588.458801,WAG+G,1588.458801,False


### 2 - Pré-Processamento

In [117]:
#selecionando apenas os dados de execução de sucesso
data[data['erro'] == False]
data = data.drop('erro', 1)
#forço as colunas de prob serem strings para o one hot encoding levá-las em consideração
data['prob1'] = data['prob1'].astype(str)
data['prob2'] = data['prob2'].astype(str)
data['num_aligns'] = data['num_aligns'].astype(str)
data['length'] = data['length'].astype(str)
#convertendo features categoricas em binarias
data = pd.get_dummies(data)
data.head()

Unnamed: 0,num_aligns_10.0,num_aligns_11.0,num_aligns_9.0,length_1039.0,length_117.0,length_120.0,length_125.0,length_129.0,length_142.0,length_147.0,...,prob2_635.874188478,prob2_655.23910026,prob2_657.617722881,prob2_703.934317936,prob2_787.499472235,prob2_804.670440867,prob2_804.952769924,prob2_836.781299518,prob2_917.715801421,prob2_991.427723496
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [118]:
#convertendo data em transposta
data = data.transpose()
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
num_aligns_10.0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
num_aligns_11.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
num_aligns_9.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
length_1039.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
length_117.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 3 - Treinandoo o modelo KNN

In [119]:
from sklearn.neighbors import NearestNeighbors
import numpy as np
#treinando o KNN
model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(data)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

### 4 - Realizando recomendações

In [120]:
print('Data shape: %s' % data.shape[0] )
#escolho um valor de parametro aletório (row de data)
query_index = np.random.choice(data.shape[0])
print('Query Index: %i' % query_index )
#recupero o valor deste paramêtro para todas as execuções(no caso binario, 
#1 todas as execuções que usaram o parametro e 0 cc)
feature_base = data.iloc[query_index, :].values.reshape(1, -1)
# as execuções que tiveram esse parâmetro
#print('Parameter Feature Base: %s' % feature_base )
# a distância é calculada com relação as execuções que tiveram a query e as outras features juntas
distances, indices = model_knn.kneighbors(feature_base, n_neighbors = 6)

for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(data.index[query_index]) )
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, data.index[indices.flatten()[i]], distances.flatten()[i]))

Data shape: 541
Query Index: 108
Recommendations for length_479.0:

1: length_479.0, with distance of 0.0:
2: prob2_3482.44000279, with distance of 0.0:
3: model2_WAG+I, with distance of 0.8525580438451028:
4: model1_RtREV+I+F, with distance of 0.8709005551264195:
5: num_aligns_10.0, with distance of 0.9489023869692403:
