In [4]:
import numpy as np
import pandas as pd
import xgboost as xgb

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import mean_squared_error

In [6]:
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

In [7]:
# Select the faculty attributes that are of interest.

tp_cols = [
    "siape",
    "formacao",
    "tipo_jornada_trabalho",
    "vinculo",
    "lotacao",
    "admissao",
    "categoria",
    "classe_funcional",
]

# The file is semicolon-separated; only the columns listed above are kept.
tp_df = pd.read_csv("./perfis/docentes.csv", sep=";").loc[:, tp_cols]

tp_df

Unnamed: 0,siape,formacao,tipo_jornada_trabalho,vinculo,lotacao,admissao,categoria,classe_funcional
0,1543339,MESTRADO,Dedicação exclusiva,Ativo Permanente,NÚCLEO DE EDUCAÇÃO DA INFÂNCIA,2006/07/24 00:00:00.000000000,PROFESSOR DE ENSINO BASICO TECNICO E TECNOLOGICO,DIV ...
1,1554468,DOUTORADO,Dedicação exclusiva,Ativo Permanente,ESCOLA AGRÍCOLA DE JUNDIAÍ,2008/09/12 00:00:00.000000000,PROFESSOR DE ENSINO BASICO TECNICO E TECNOLOGICO,DV ...
2,1177821,MESTRADO,Dedicação exclusiva,Ativo Permanente,ESCOLA DE MÚSICA,1998/04/28 00:00:00.000000000,PROFESSOR DE ENSINO BASICO TECNICO E TECNOLOGICO,DIV ...
3,2360824,MESTRADO,Dedicação exclusiva,Ativo Permanente,ESCOLA AGRÍCOLA DE JUNDIAÍ,2017/01/25 00:00:00.000000000,PROFESSOR DE ENSINO BASICO TECNICO E TECNOLOGICO,DIII ...
4,2364334,DOUTORADO,Dedicação exclusiva,Ativo Permanente,ESCOLA AGRÍCOLA DE JUNDIAÍ,2009/10/13 00:00:00.000000000,PROFESSOR DE ENSINO BASICO TECNICO E TECNOLOGICO,DIV ...
...,...,...,...,...,...,...,...,...
2765,4246363,DOUTORADO,Dedicação exclusiva,Professor Visitante,INSTITUTO DE POLÍTICAS PÚBLICAS,2023/05/23 00:00:00.000000000,PROFESSOR MAGISTERIO SUPERIOR - VISITANTE,Adjunto ...
2766,3304576,DOUTORADO,Dedicação exclusiva,Professor Visitante,ESCOLA DE CIÊNCIAS E TECNOLOGIA,2022/08/12 00:00:00.000000000,PROFESSOR MAGISTERIO SUPERIOR - VISITANTE,Titular ...
2767,1056188,DOUTORADO,Dedicação exclusiva,Professor Visitante,DEPARTAMENTO DE BIOLOGIA CELULAR E GENÉTICA,2022/10/03 00:00:00.000000000,PROFESSOR MAGISTERIO SUPERIOR - VISITANTE,A ...
2768,3330361,DOUTORADO,Dedicação exclusiva,Professor Visitante,INSTITUTO METROPOLE DIGITAL,2023/02/15 00:00:00.000000000,PROFESSOR MAGISTERIO SUPERIOR - VISITANTE,A ...


In [8]:
# Show the dtype pandas assigned to each selected faculty column.
tp_df.dtypes

siape                     int64
formacao                 object
tipo_jornada_trabalho    object
vinculo                  object
lotacao                  object
admissao                 object
categoria                object
classe_funcional         object
dtype: object

In [9]:
# Summary statistics of the faculty table (describe() defaults to numeric columns).
tp_df.describe()

Unnamed: 0,siape
count,2770.0
mean,2114588.0
std,1142222.0
min,12746.0
25%,1297595.0
50%,1810985.0
75%,2722937.0
max,9350807.0


In [10]:
# Select the attributes we want to predict. "siape" is included so the tables can be joined.

qualis = [
    "siape",
    "revista_a1",
    "revista_a2",
    "revista_b1",
    "revista_b2",
    "revista_b3",
    "revista_b4",
    "revista_b5",
    "revista_c",
]

# Read one indicator file per year (2010 through 2020), keeping only the columns above.
ti_df_list = []
for year in range(2010, 2021):
    ti_df_y = pd.read_csv(
        "./indicadores/indicadores-pesquisa-" + str(year) + ".csv", sep=";"
    ).loc[:, qualis]
    ti_df_list.append(ti_df_y)

# Stack the yearly tables and add up each siape's counts across all years.
ti_df = pd.concat(ti_df_list).groupby("siape", as_index=False).sum()
ti_df.describe()

Unnamed: 0,siape,revista_a1,revista_a2,revista_b1,revista_b2,revista_b3,revista_b4,revista_b5,revista_c
count,2756.0,2756.0,2756.0,2756.0,2756.0,2756.0,2756.0,2756.0,2756.0
mean,2112227.0,2.634978,1.933599,1.75,1.269231,0.818215,0.568578,0.05479,0.889332
std,1137681.0,6.073799,3.917333,5.375914,2.891393,2.413658,1.77848,0.330389,3.855806
min,12746.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1296285.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1808676.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2721404.0,3.0,2.0,2.0,1.0,1.0,0.0,0.0,1.0
max,9350807.0,71.0,51.0,99.0,46.0,41.0,32.0,8.0,124.0


In [11]:
# Join both tables, keeping only the rows whose siape appears in both.
# The siape key is dropped once the join is done.

df = pd.merge(tp_df, ti_df, on="siape", how="inner").drop(columns="siape")

df

Unnamed: 0,formacao,tipo_jornada_trabalho,vinculo,lotacao,admissao,categoria,classe_funcional,revista_a1,revista_a2,revista_b1,revista_b2,revista_b3,revista_b4,revista_b5,revista_c
0,MESTRADO,Dedicação exclusiva,Ativo Permanente,NÚCLEO DE EDUCAÇÃO DA INFÂNCIA,2006/07/24 00:00:00.000000000,PROFESSOR DE ENSINO BASICO TECNICO E TECNOLOGICO,DIV ...,0,0,0,0,0,0,0,0
1,DOUTORADO,Dedicação exclusiva,Ativo Permanente,ESCOLA AGRÍCOLA DE JUNDIAÍ,2008/09/12 00:00:00.000000000,PROFESSOR DE ENSINO BASICO TECNICO E TECNOLOGICO,DV ...,1,1,5,0,0,4,0,0
2,MESTRADO,Dedicação exclusiva,Ativo Permanente,ESCOLA DE MÚSICA,1998/04/28 00:00:00.000000000,PROFESSOR DE ENSINO BASICO TECNICO E TECNOLOGICO,DIV ...,0,0,0,0,0,0,0,0
3,MESTRADO,Dedicação exclusiva,Ativo Permanente,ESCOLA AGRÍCOLA DE JUNDIAÍ,2017/01/25 00:00:00.000000000,PROFESSOR DE ENSINO BASICO TECNICO E TECNOLOGICO,DIII ...,0,0,0,0,0,0,0,0
4,DOUTORADO,Dedicação exclusiva,Ativo Permanente,ESCOLA AGRÍCOLA DE JUNDIAÍ,2009/10/13 00:00:00.000000000,PROFESSOR DE ENSINO BASICO TECNICO E TECNOLOGICO,DIV ...,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2748,DOUTORADO,Dedicação exclusiva,Professor Visitante,INSTITUTO DE POLÍTICAS PÚBLICAS,2023/05/23 00:00:00.000000000,PROFESSOR MAGISTERIO SUPERIOR - VISITANTE,Adjunto ...,0,0,0,1,0,0,0,1
2749,DOUTORADO,Dedicação exclusiva,Professor Visitante,ESCOLA DE CIÊNCIAS E TECNOLOGIA,2022/08/12 00:00:00.000000000,PROFESSOR MAGISTERIO SUPERIOR - VISITANTE,Titular ...,8,4,7,0,2,0,0,0
2750,DOUTORADO,Dedicação exclusiva,Professor Visitante,DEPARTAMENTO DE BIOLOGIA CELULAR E GENÉTICA,2022/10/03 00:00:00.000000000,PROFESSOR MAGISTERIO SUPERIOR - VISITANTE,A ...,4,2,5,0,0,0,0,0
2751,DOUTORADO,Dedicação exclusiva,Professor Visitante,INSTITUTO METROPOLE DIGITAL,2023/02/15 00:00:00.000000000,PROFESSOR MAGISTERIO SUPERIOR - VISITANTE,A ...,44,1,0,0,0,0,0,0


In [12]:
# converter a data de admissao para quantos semestres o professor está na universidade.

def num_semestres(data: str, data_atual: str) -> int:
    """Return how many semester boundaries lie between admission and a reference date.

    A year is split into two halves: months 1-6 and months 7-12. The result is
    the difference between the half-year index of ``data_atual`` and the
    half-year index of ``data``.

    Args:
        data: Admission date starting with ``"YYYY/MM/DD"``; anything after the
            first 10 characters (e.g. a time component) is ignored.
        data_atual: Reference date formatted as ``"DD/MM/YYYY"``.

    Returns:
        The number of semesters elapsed (negative if ``data`` is later than
        ``data_atual``).

    Raises:
        ValueError: If the year or month fields are not numeric.
    """
    data = data[:10]
    ano_admissao, mes_admissao = int(data[:4]), int(data[5:7])
    mes_atual, ano_atual = int(data_atual[3:5]), int(data_atual[6:])

    # 0 for the first half of the year (Jan-Jun), 1 for the second (Jul-Dec).
    # July belongs to the second half on BOTH sides of the comparison; the
    # previous `> 7` test misclassified July admissions when the reference date
    # was in the first half, returning 2*anos + 1 instead of 2*anos - 1.
    metade_admissao = 1 if mes_admissao >= 7 else 0
    metade_atual = 1 if mes_atual >= 7 else 0

    return 2 * (ano_atual - ano_admissao) + (metade_atual - metade_admissao)

# Reference date, formatted as "DD/MM/YYYY", against which admissions are measured.
data_atual = "18/10/2023"

# Add the semester count derived from "admissao", then keep (and reorder) the
# columns used from here on. "admissao" and "revista_b5" are not listed, so
# they are dropped at this point.
df = df.assign(
    num_semestres=df["admissao"].apply(num_semestres, args=(data_atual,))
)[
    [
        "formacao",
        "tipo_jornada_trabalho",
        "vinculo",
        "categoria",
        "classe_funcional",
        "lotacao",
        "num_semestres",
        "revista_a1",
        "revista_a2",
        "revista_b1",
        "revista_b2",
        "revista_b3",
        "revista_b4",
        "revista_c",
    ]
]
df

Unnamed: 0,formacao,tipo_jornada_trabalho,vinculo,categoria,classe_funcional,lotacao,num_semestres,revista_a1,revista_a2,revista_b1,revista_b2,revista_b3,revista_b4,revista_c
0,MESTRADO,Dedicação exclusiva,Ativo Permanente,PROFESSOR DE ENSINO BASICO TECNICO E TECNOLOGICO,DIV ...,NÚCLEO DE EDUCAÇÃO DA INFÂNCIA,34,0,0,0,0,0,0,0
1,DOUTORADO,Dedicação exclusiva,Ativo Permanente,PROFESSOR DE ENSINO BASICO TECNICO E TECNOLOGICO,DV ...,ESCOLA AGRÍCOLA DE JUNDIAÍ,30,1,1,5,0,0,4,0
2,MESTRADO,Dedicação exclusiva,Ativo Permanente,PROFESSOR DE ENSINO BASICO TECNICO E TECNOLOGICO,DIV ...,ESCOLA DE MÚSICA,51,0,0,0,0,0,0,0
3,MESTRADO,Dedicação exclusiva,Ativo Permanente,PROFESSOR DE ENSINO BASICO TECNICO E TECNOLOGICO,DIII ...,ESCOLA AGRÍCOLA DE JUNDIAÍ,13,0,0,0,0,0,0,0
4,DOUTORADO,Dedicação exclusiva,Ativo Permanente,PROFESSOR DE ENSINO BASICO TECNICO E TECNOLOGICO,DIV ...,ESCOLA AGRÍCOLA DE JUNDIAÍ,28,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2748,DOUTORADO,Dedicação exclusiva,Professor Visitante,PROFESSOR MAGISTERIO SUPERIOR - VISITANTE,Adjunto ...,INSTITUTO DE POLÍTICAS PÚBLICAS,1,0,0,0,1,0,0,1
2749,DOUTORADO,Dedicação exclusiva,Professor Visitante,PROFESSOR MAGISTERIO SUPERIOR - VISITANTE,Titular ...,ESCOLA DE CIÊNCIAS E TECNOLOGIA,2,8,4,7,0,2,0,0
2750,DOUTORADO,Dedicação exclusiva,Professor Visitante,PROFESSOR MAGISTERIO SUPERIOR - VISITANTE,A ...,DEPARTAMENTO DE BIOLOGIA CELULAR E GENÉTICA,2,4,2,5,0,0,0,0
2751,DOUTORADO,Dedicação exclusiva,Professor Visitante,PROFESSOR MAGISTERIO SUPERIOR - VISITANTE,A ...,INSTITUTO METROPOLE DIGITAL,1,44,1,0,0,0,0,0


In [13]:
# Show the dtype of each column after the merge and the num_semestres conversion.
df.dtypes

formacao                 object
tipo_jornada_trabalho    object
vinculo                  object
categoria                object
classe_funcional         object
lotacao                  object
num_semestres             int64
revista_a1                int64
revista_a2                int64
revista_b1                int64
revista_b2                int64
revista_b3                int64
revista_b4                int64
revista_c                 int64
dtype: object

In [14]:
# Total number of rows.
print("existem", len(df), "entradas diferentes;")

# Number of distinct values per column, printed with each column's own label.
for coluna, rotulo in [
    ("formacao", "formacoes diferentes;"),
    ("tipo_jornada_trabalho", "tipos de jornada de trabalho diferentes;"),
    ("vinculo", "tipos de vinculo diferentes;"),
    ("classe_funcional", "classes funcionais diferentes;"),
    ("categoria", "categorias diferentes;"),
    ("lotacao", "lotacoes diferentes;"),
    ("num_semestres", "datas de admissao diferentes;"),
]:
    print("existem", len(df[coluna].unique()), rotulo)

existem 2753 entradas diferentes;
existem 6 formacoes diferentes;
existem 3 tipos de jornada de trabalho diferentes;
existem 8 tipos de vinculo diferentes;
existem 18 classes funcionais diferentes;
existem 7 categorias diferentes;
existem 135 lotacoes diferentes;
existem 89 datas de admissao diferentes;


In [15]:
# Row count for each distinct "formacao" value, in order of first appearance.
for formacao in df["formacao"].unique():
    print(formacao, (df["formacao"] == formacao).sum(), sep=": ")

MESTRADO: 431
DOUTORADO: 2189
ESPECIALIZAÇÃO: 120
GRADUAÇÃO: 11
PÓS-DOUTORADO: 1
DESCONHECIDA: 1


In [16]:
# Drop professors whose education level is unknown.

df = df.loc[df["formacao"] != "DESCONHECIDA"]

# Re-print the per-level counts to confirm the removal.
for formacao in df["formacao"].unique():
    print(formacao, (df["formacao"] == formacao).sum(), sep=": ")

MESTRADO: 431
DOUTORADO: 2189
ESPECIALIZAÇÃO: 120
GRADUAÇÃO: 11
PÓS-DOUTORADO: 1


In [17]:
# Encode education level as an ordinal rank:
# post-doc > doctorate > master's > specialization > undergraduate.

nivel_formacao = {"GRADUAÇÃO":1, "ESPECIALIZAÇÃO":2, "MESTRADO":3,"DOUTORADO":4,"PÓS-DOUTORADO":5}

# Rebind `df` to a new frame instead of calling replace(..., inplace=True) on
# a column pulled out of a filtered frame: that chained in-place form raises
# SettingWithCopyWarning and is not guaranteed to write back into `df`.
# The explicit cast makes the column numeric regardless of how replace()
# infers the result dtype, and fails loudly if a level is missing from the dict.
df = df.assign(formacao=df["formacao"].replace(nivel_formacao).astype("int64"))

# Per-rank row counts, to check against the counts before encoding.
for formacao in df["formacao"].unique():
    print(formacao, ": ", len(df[df["formacao"] == formacao]), sep="")

3: 431
4: 2189
2: 120
1: 11
5: 1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["formacao"].replace(nivel_formacao, inplace=True)


In [18]:
# Row count for each distinct working-hours regime, in order of first appearance.
for tipo_jornada_trabalho in df["tipo_jornada_trabalho"].unique():
    print(
        tipo_jornada_trabalho,
        (df["tipo_jornada_trabalho"] == tipo_jornada_trabalho).sum(),
        sep=": ",
    )

Dedicação exclusiva           : 2155
20 horas semanais             : 307
40 horas semanais             : 290


In [19]:
# Row count for each distinct "vinculo" value, in order of first appearance.
for vinculo in df["vinculo"].unique():
    print(vinculo, (df["vinculo"] == vinculo).sum(), sep=": ")

Ativo Permanente: 2376
Celetista: 1
Colaborador PCCTAE e Magistério Federal: 2
Excedente de lotação: 3
Exercicio provisorio: 18
Professor Substituto: 260
Professor Temporario: 50
Professor Visitante: 42


In [20]:
# Very few people have these "vinculo" values, so those rows are removed.

df = df[
    ~df["vinculo"].isin(
        [
            "Celetista",
            "Colaborador PCCTAE e Magistério Federal",
            "Excedente de lotação",
        ]
    )
]

# Re-print the per-value counts to confirm the removal.
for vinculo in df["vinculo"].unique():
    print(vinculo, (df["vinculo"] == vinculo).sum(), sep=": ")

Ativo Permanente: 2376
Exercicio provisorio: 18
Professor Substituto: 260
Professor Temporario: 50
Professor Visitante: 42


In [21]:
# Row count for each distinct "categoria" value, in order of first appearance.
for categoria in df["categoria"].unique():
    print(categoria, (df["categoria"] == categoria).sum(), sep=": ")

PROFESSOR DE ENSINO BASICO TECNICO E TECNOLOGICO: 215
PROFESSOR DO MAGISTERIO SUPERIOR: 2178
PROFESSOR 3 GRAU                        : 1
PROF ENS BAS TEC TECNOLOGICO-SUBSTITUTO: 29
PROFESSOR DO MAGISTERIO SUPERIOR - SUBSTITUTO: 231
PROFESSOR DO MAGISTERIO SUPERIOR - TEMPORARIO: 50
PROFESSOR MAGISTERIO SUPERIOR - VISITANTE: 42


Veja que a informação de "SUBSTITUTO", "TEMPORARIO" e "VISITANTE" já consta na coluna "vinculo". Então, irei retirá-la da coluna "categoria", assim como o único professor de terceiro grau.

In [22]:
# Map the substitute / temporary / visiting variants onto their base category.
retirar_vinculo = {"PROF ENS BAS TEC TECNOLOGICO-SUBSTITUTO":"PROFESSOR DE ENSINO BASICO TECNICO E TECNOLOGICO",
                   "PROFESSOR DO MAGISTERIO SUPERIOR - SUBSTITUTO":"PROFESSOR DO MAGISTERIO SUPERIOR",
                   "PROFESSOR DO MAGISTERIO SUPERIOR - TEMPORARIO":"PROFESSOR DO MAGISTERIO SUPERIOR",
                   "PROFESSOR MAGISTERIO SUPERIOR - VISITANTE":"PROFESSOR DO MAGISTERIO SUPERIOR"}

# Drop the "PROFESSOR 3 GRAU" rows (the literal keeps the padding found in the data).
df = df[df["categoria"] != "PROFESSOR 3 GRAU                        "]

# Rebind `df` to a new frame instead of calling replace(..., inplace=True) on
# a column pulled out of a filtered frame: that chained in-place form raises
# SettingWithCopyWarning and is not guaranteed to write back into `df`.
# Values that are not keys of the dict are left unchanged by replace().
df = df.assign(categoria=df["categoria"].replace(retirar_vinculo))

# Per-category row counts after the merge of variants.
for categoria in df["categoria"].unique():
    print(categoria, ": ", len(df[df["categoria"] == categoria]), sep="")

PROFESSOR DE ENSINO BASICO TECNICO E TECNOLOGICO: 244
PROFESSOR DO MAGISTERIO SUPERIOR: 2501


Verifique que a soma de elementos de uma mesma categoria se manteve. Agora, iremos ranquear os professores com base em sua classe funcional:

In [23]:
# Row count for each distinct "classe_funcional" value, in order of first appearance.
for classe_funcional in df["classe_funcional"].unique():
    print(
        classe_funcional,
        (df["classe_funcional"] == classe_funcional).sum(),
        sep=": ",
    )

DIV                                                                                                 : 75
DV                                                                                                  : 32
DIII                                                                                                : 78
DI                                                                                                  : 47
D                                                                                                   : 7
DII                                                                                                 : 4
Classe A - Adjunto A                                                                                : 141
Classe C - Adjunto                                                                                  : 757
Classe A - Auxiliar                                                                                 : 46
Classe E - Titular                                     

Como podemos ver, há 17 professores com classes não informadas, e algumas classes aparecem repetidas sob nomes diferentes. Ainda, há 20 professores de classe A sem uma subclasse. O mapeamento, segundo a [PROGESP](https://progesp.ufrn.br/secao/carreira), se dá da seguinte maneira:

| Original | Mapeamento |
|-|-|
|DV|1|
|DIV|2|
|DIII|3|
|DII|4|
|DI|5|
|Classe E - Titular<br>Titular|6|
|Classe D - Associado<br>D|7|
|Classe C - Adjunto<br>Adjunto|8|
|Classe B - Assistente|9|
|Classe A - Auxiliar<br>Auxiliar|10|
|Classe A - Assistente A<br>A|11|
|Classe A - Adjunto A|12|

É importante notar também que iremos retirar os professores sem classe funcional informada. Professores de classe A sem denominação específica serão tratados como Assistentes (valor 11), pois esse é o valor médio da classe A no mapeamento (10 a 12).

In [24]:
# Ordinal code for each functional class. The keys keep the trailing padding
# found in the raw data, so they must match the stored values exactly.
# Several spellings share a code (e.g. "Classe E - Titular" and "Titular").
mapear_class_func = {
    "DV                                                                                                  ":1,
    "DIV                                                                                                 ":2,
    "DIII                                                                                                ":3,
    "DII                                                                                                 ":4,
    "DI                                                                                                  ":5,
    "Classe E - Titular                                                                                  ":6,
    "Titular                                                                                             ":6,
    "Classe D - Associado                                                                                ":7,
    "D                                                                                                   ":7,
    "Classe C - Adjunto                                                                                  ":8,
    "Adjunto                                                                                             ":8,
    "Classe B - Assistente                                                                               ":9,
    "Classe A - Auxiliar                                                                                 ":10,
    "Auxiliar                                                                                            ":10,
    "Classe A - Assistente A                                                                             ":11,
    "A                                                                                                   ":11,
    "Classe A - Adjunto A                                                                                ":12,
}

# Drop rows whose functional class was not informed.
df = df[df["classe_funcional"] != "Não Informada                                                                                       "]

# Rebind `df` to a new frame instead of calling replace(..., inplace=True) on
# a column pulled out of a filtered frame: that chained in-place form raises
# SettingWithCopyWarning and is not guaranteed to write back into `df`.
# The explicit cast makes the column numeric regardless of how replace()
# infers the result dtype, and fails loudly if a class is missing from the dict.
df = df.assign(
    classe_funcional=df["classe_funcional"].replace(mapear_class_func).astype("int64")
)

In [25]:
# Row count for each functional-class code after the encoding step.
for classe_funcional in df["classe_funcional"].unique():
    print(
        classe_funcional,
        (df["classe_funcional"] == classe_funcional).sum(),
        sep=": ",
    )

2: 75
1: 32
3: 78
5: 47
7: 841
4: 4
12: 141
8: 766
10: 324
6: 320
9: 59
11: 42


In [26]:
# Renumber the rows 0..n-1 after the filtering steps (the old index is discarded),
# then summarise the numeric columns.
df = df.reset_index(drop=True)
df.describe()

Unnamed: 0,formacao,classe_funcional,num_semestres,revista_a1,revista_a2,revista_b1,revista_b2,revista_b3,revista_b4,revista_c
count,2729.0,2729.0,2729.0,2729.0,2729.0,2729.0,2729.0,2729.0,2729.0,2729.0
mean,3.744229,7.521803,25.477831,2.646757,1.939538,1.755955,1.270429,0.814584,0.569073,0.89007
std,0.548749,2.113959,20.856943,6.09593,3.929876,5.398132,2.900249,2.394411,1.780601,3.868675
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,7.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.0,7.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,4.0,8.0,30.0,3.0,2.0,2.0,1.0,1.0,0.0,1.0
max,5.0,12.0,97.0,71.0,51.0,99.0,46.0,41.0,32.0,124.0


In [27]:
# Summary (count / unique / top / freq) of the remaining string-typed columns.
df.describe(include="object")

Unnamed: 0,tipo_jornada_trabalho,vinculo,categoria,lotacao
count,2729,2729,2729,2729
unique,3,5,2,134
top,Dedicação exclusiva,Ativo Permanente,PROFESSOR DO MAGISTERIO SUPERIOR,ESCOLA AGRÍCOLA DE JUNDIAÍ
freq,2133,2376,2486,119


In [28]:
# df.drop("lotacao", axis=1, inplace=True)

In [29]:
# Despite its name, `keep` lists the TARGET columns (publication counts);
# they are removed from the feature matrix here.
keep = [
    "revista_a1",
    "revista_a2",
    "revista_b1",
    "revista_b2",
    "revista_b3",
    "revista_b4",
    "revista_c",
]

# Feature matrix: every column of df except the targets.
X = df.drop(columns=keep).copy()
X

Unnamed: 0,formacao,tipo_jornada_trabalho,vinculo,categoria,classe_funcional,lotacao,num_semestres
0,3,Dedicação exclusiva,Ativo Permanente,PROFESSOR DE ENSINO BASICO TECNICO E TECNOLOGICO,2,NÚCLEO DE EDUCAÇÃO DA INFÂNCIA,34
1,4,Dedicação exclusiva,Ativo Permanente,PROFESSOR DE ENSINO BASICO TECNICO E TECNOLOGICO,1,ESCOLA AGRÍCOLA DE JUNDIAÍ,30
2,3,Dedicação exclusiva,Ativo Permanente,PROFESSOR DE ENSINO BASICO TECNICO E TECNOLOGICO,2,ESCOLA DE MÚSICA,51
3,3,Dedicação exclusiva,Ativo Permanente,PROFESSOR DE ENSINO BASICO TECNICO E TECNOLOGICO,3,ESCOLA AGRÍCOLA DE JUNDIAÍ,13
4,4,Dedicação exclusiva,Ativo Permanente,PROFESSOR DE ENSINO BASICO TECNICO E TECNOLOGICO,2,ESCOLA AGRÍCOLA DE JUNDIAÍ,28
...,...,...,...,...,...,...,...
2724,4,Dedicação exclusiva,Professor Visitante,PROFESSOR DO MAGISTERIO SUPERIOR,8,INSTITUTO DE POLÍTICAS PÚBLICAS,1
2725,4,Dedicação exclusiva,Professor Visitante,PROFESSOR DO MAGISTERIO SUPERIOR,6,ESCOLA DE CIÊNCIAS E TECNOLOGIA,2
2726,4,Dedicação exclusiva,Professor Visitante,PROFESSOR DO MAGISTERIO SUPERIOR,11,DEPARTAMENTO DE BIOLOGIA CELULAR E GENÉTICA,2
2727,4,Dedicação exclusiva,Professor Visitante,PROFESSOR DO MAGISTERIO SUPERIOR,11,INSTITUTO METROPOLE DIGITAL,1


In [30]:
# One independent copy of each target column, in the same order as `keep`.
ya1, ya2, yb1, yb2, yb3, yb4, yc = (df[col].copy() for col in keep)

[ya1, ya2, yb1, yb2, yb3, yb4, yc]

[0        0
 1        1
 2        0
 3        0
 4        0
         ..
 2724     0
 2725     8
 2726     4
 2727    44
 2728     4
 Name: revista_a1, Length: 2729, dtype: int64,
 0       0
 1       1
 2       0
 3       0
 4       0
        ..
 2724    0
 2725    4
 2726    2
 2727    1
 2728    0
 Name: revista_a2, Length: 2729, dtype: int64,
 0       0
 1       5
 2       0
 3       0
 4       0
        ..
 2724    0
 2725    7
 2726    5
 2727    0
 2728    0
 Name: revista_b1, Length: 2729, dtype: int64,
 0       0
 1       0
 2       0
 3       0
 4       1
        ..
 2724    1
 2725    0
 2726    0
 2727    0
 2728    0
 Name: revista_b2, Length: 2729, dtype: int64,
 0       0
 1       0
 2       0
 3       0
 4       0
        ..
 2724    0
 2725    2
 2726    0
 2727    0
 2728    0
 Name: revista_b3, Length: 2729, dtype: int64,
 0       0
 1       4
 2       0
 3       0
 4       0
        ..
 2724    0
 2725    0
 2726    0
 2727    0
 2728    0
 Name: revista_b4, Length: 2

In [31]:
# One-hot encode the nominal features; the remaining columns pass through unchanged.
X_encoded = pd.get_dummies(
    data=X,
    columns=[
        "tipo_jornada_trabalho",
        "vinculo",
        "categoria",
        "lotacao",
    ],
)

X_encoded

Unnamed: 0,formacao,classe_funcional,num_semestres,tipo_jornada_trabalho_20 horas semanais,tipo_jornada_trabalho_40 horas semanais,tipo_jornada_trabalho_Dedicação exclusiva,vinculo_Ativo Permanente,vinculo_Exercicio provisorio,vinculo_Professor Substituto,vinculo_Professor Temporario,...,"lotacao_SECRETARIA DE EDUCAÇÃO BÁSICA, TÉCNICA E TECNOLÓGICA DA UFRN",lotacao_SECRETARIA DE GESTÃO DE PROJETOS,lotacao_SECRETARIA DE GOVERNANÇA INSTITUCIONAL,lotacao_SECRETARIA DE INCLUSÃO E ACESSIBILIDADE- SIA,lotacao_SECRETARIA DE RELAÇOES INTERNACIONAIS,lotacao_SUPERINTENDENCIA DE COMUNICACAO,lotacao_SUPERINTENDÊNCIA DE INFRAESTRUTURA,lotacao_SUPERINTENDÊNCIA DE TECNOLOGIA DA INFORMAÇÃO,lotacao_SUPERINTENDÊNCIA DO HUOL - EBSERH,lotacao_UNIVERSIDADE FEDERAL DO RIO GRANDE DO NORTE
0,3,2,34,False,False,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,4,1,30,False,False,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,3,2,51,False,False,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,3,3,13,False,False,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,4,2,28,False,False,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2724,4,8,1,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2725,4,6,2,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2726,4,11,2,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2727,4,11,1,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [32]:
# Mean of each target over the full data set.
[sum(alvo) / len(alvo) for alvo in (ya1, ya2, yb1, yb2, yb3, yb4, yc)]

[2.646757053865885,
 1.939538292414804,
 1.7559545621106631,
 1.2704287284719677,
 0.8145840967387321,
 0.5690729204836936,
 0.8900696225723709]

In [33]:
# Target series paired with the name used as the dictionary key below.
target_vars = [(ya1, "ya1"), (ya2, "ya2"), (yb1, "yb1"), (yb2, "yb2"), (yb3, "yb3"), (yb4, "yb4"), (yc, "yc")]

# Maps target name -> (X_train, X_test, y_train, y_test).
train_test_sets = {}

# Split once per target. The same features and random_state are passed every
# time, so only the y halves differ in content between targets.
# NOTE: X_train / X_test / y_train / y_test remain bound to the LAST split
# (yc) after the loop finishes.
for var, var_name in target_vars:
    X_train, X_test, y_train, y_test = train_test_split(X_encoded, var, random_state=42)
    train_test_sets[var_name] = (X_train, X_test, y_train, y_test)

# Unpack the stored splits into one set of named variables per target.
X_train_ya1, X_test_ya1, ya1_train, ya1_test = train_test_sets["ya1"]
X_train_ya2, X_test_ya2, ya2_train, ya2_test = train_test_sets["ya2"]
X_train_yb1, X_test_yb1, yb1_train, yb1_test = train_test_sets["yb1"]
X_train_yb2, X_test_yb2, yb2_train, yb2_test = train_test_sets["yb2"]
X_train_yb3, X_test_yb3, yb3_train, yb3_test = train_test_sets["yb3"]
X_train_yb4, X_test_yb4, yb4_train, yb4_test = train_test_sets["yb4"]
X_train_yc, X_test_yc, yc_train, yc_test = train_test_sets["yc"]


In [34]:
# Mean of each target over its training split.
[
    sum(alvo) / len(alvo)
    for alvo in (ya1_train, ya2_train, yb1_train, yb2_train, yb3_train, yb4_train, yc_train)
]

[2.639784946236559,
 1.9472140762463344,
 1.6715542521994136,
 1.1901270772238515,
 0.8093841642228738,
 0.5650048875855328,
 0.9496578690127078]

In [35]:
# Mean of each target over its test split.
[
    sum(alvo) / len(alvo)
    for alvo in (ya1_test, ya2_test, yb1_test, yb2_test, yb3_test, yb4_test, yc_test)
]

[2.6676427525622253,
 1.916544655929722,
 2.0087847730600292,
 1.5109809663250366,
 0.8301610541727672,
 0.5812591508052709,
 0.7115666178623719]

In [36]:
# Hyper-parameter ranges explored by every BayesSearchCV run below.
param_space = dict(
    max_depth=Integer(1, 20),
    n_estimators=Integer(10, 1000),
    reg_lambda=Real(0, 10),
    eta=Real(0.01, 1),
    gamma=Real(0, 7),
)

In [37]:
# Bayesian hyper-parameter search for the ya1 (revista_a1) regressor:
# 32 sampled configurations, 5-fold CV, scored by negated RMSE.
reg_xgb_ya1 = xgb.XGBRegressor()

xgb_bayes_ya1 = BayesSearchCV(
    estimator=reg_xgb_ya1,
    search_spaces=param_space,
    n_iter=32,
    scoring="neg_root_mean_squared_error",
    n_jobs=8,
    cv=5,
    verbose=True,
    random_state=42,
)

xgb_bayes_ya1.fit(X_train_ya1, ya1_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


In [38]:
# Bayesian hyper-parameter search for the ya2 (revista_a2) regressor:
# 32 sampled configurations, 5-fold CV, scored by negated RMSE.
reg_xgb_ya2 = xgb.XGBRegressor()

xgb_bayes_ya2 = BayesSearchCV(
    estimator=reg_xgb_ya2,
    search_spaces=param_space,
    n_iter=32,
    scoring="neg_root_mean_squared_error",
    n_jobs=8,
    cv=5,
    verbose=True,
    random_state=42,
)

xgb_bayes_ya2.fit(X_train_ya2, ya2_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


In [39]:
# Bayesian hyper-parameter search for the yb1 (revista_b1) regressor:
# 32 sampled configurations, 5-fold CV, scored by negated RMSE.
reg_xgb_yb1 = xgb.XGBRegressor()

xgb_bayes_yb1 = BayesSearchCV(
    estimator=reg_xgb_yb1,
    search_spaces=param_space,
    n_iter=32,
    scoring="neg_root_mean_squared_error",
    n_jobs=8,
    cv=5,
    verbose=True,
    random_state=42,
)

xgb_bayes_yb1.fit(X_train_yb1, yb1_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


In [40]:
# Bayesian hyper-parameter search for the yb2 (revista_b2) regressor:
# 32 sampled configurations, 5-fold CV, scored by negated RMSE.
reg_xgb_yb2 = xgb.XGBRegressor()

xgb_bayes_yb2 = BayesSearchCV(
    estimator=reg_xgb_yb2,
    search_spaces=param_space,
    n_iter=32,
    scoring="neg_root_mean_squared_error",
    n_jobs=8,
    cv=5,
    verbose=True,
    random_state=42,
)

xgb_bayes_yb2.fit(X_train_yb2, yb2_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


In [41]:
# Hyper-parameter search for the yb3 target: an XGBoost regressor with
# default settings is tuned over `param_space` (defined outside this cell).
reg_xgb_yb3 = xgb.XGBRegressor()

# 32 optimisation steps, each scored by 5-fold cross-validation with the
# "neg_root_mean_squared_error" scorer; the seed is fixed at 42.
xgb_bayes_yb3 = BayesSearchCV(
    estimator=reg_xgb_yb3,
    search_spaces=param_space,
    n_iter=32,
    cv=5,
    scoring="neg_root_mean_squared_error",
    random_state=42,
    n_jobs=8,
    verbose=True,
)

# Run the search on the yb3 training split.
xgb_bayes_yb3.fit(X_train_yb3, y=yb3_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


In [42]:
# Hyper-parameter search for the yb4 target: an XGBoost regressor with
# default settings is tuned over `param_space` (defined outside this cell).
reg_xgb_yb4 = xgb.XGBRegressor()

# 32 optimisation steps, each scored by 5-fold cross-validation with the
# "neg_root_mean_squared_error" scorer; the seed is fixed at 42.
xgb_bayes_yb4 = BayesSearchCV(
    estimator=reg_xgb_yb4,
    search_spaces=param_space,
    n_iter=32,
    cv=5,
    scoring="neg_root_mean_squared_error",
    random_state=42,
    n_jobs=8,
    verbose=True,
)

# Run the search on the yb4 training split.
xgb_bayes_yb4.fit(X_train_yb4, y=yb4_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


In [43]:
# Hyper-parameter search for the yc target: an XGBoost regressor with
# default settings is tuned over `param_space` (defined outside this cell).
reg_xgb_yc = xgb.XGBRegressor()

# 32 optimisation steps, each scored by 5-fold cross-validation with the
# "neg_root_mean_squared_error" scorer; the seed is fixed at 42.
xgb_bayes_yc = BayesSearchCV(
    estimator=reg_xgb_yc,
    search_spaces=param_space,
    n_iter=32,
    cv=5,
    scoring="neg_root_mean_squared_error",
    random_state=42,
    n_jobs=8,
    verbose=True,
)

# Run the search on the yc training split.
xgb_bayes_yc.fit(X_train_yc, y=yc_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


In [44]:
# For each target, in the order ya1, ya2, yb1, yb2, yb3, yb4, yc:
#   * print the winning hyper-parameters, then the best CV score with its
#     sign flipped (the searches shown for yb2..yc use the
#     "neg_root_mean_squared_error" scorer, so the flipped value is an RMSE;
#     presumably the earlier searches do too - confirm against their cells);
#   * keep that search's best_estimator_ under its own name for later cells.
# A single print with sep="\n" emits the same two lines as two prints would.

print(xgb_bayes_ya1.best_params_, -xgb_bayes_ya1.best_score_, sep="\n")
estimator_ya1 = xgb_bayes_ya1.best_estimator_

print(xgb_bayes_ya2.best_params_, -xgb_bayes_ya2.best_score_, sep="\n")
estimator_ya2 = xgb_bayes_ya2.best_estimator_

print(xgb_bayes_yb1.best_params_, -xgb_bayes_yb1.best_score_, sep="\n")
estimator_yb1 = xgb_bayes_yb1.best_estimator_

print(xgb_bayes_yb2.best_params_, -xgb_bayes_yb2.best_score_, sep="\n")
estimator_yb2 = xgb_bayes_yb2.best_estimator_

print(xgb_bayes_yb3.best_params_, -xgb_bayes_yb3.best_score_, sep="\n")
estimator_yb3 = xgb_bayes_yb3.best_estimator_

print(xgb_bayes_yb4.best_params_, -xgb_bayes_yb4.best_score_, sep="\n")
estimator_yb4 = xgb_bayes_yb4.best_estimator_

print(xgb_bayes_yc.best_params_, -xgb_bayes_yc.best_score_, sep="\n")
estimator_yc = xgb_bayes_yc.best_estimator_

OrderedDict([('eta', 0.6819801039220021), ('gamma', 7.0), ('max_depth', 1), ('n_estimators', 178), ('reg_lambda', 0.7441291807581816)])
5.070935799081243
OrderedDict([('eta', 0.23795487705752305), ('gamma', 0.396483686417526), ('max_depth', 1), ('n_estimators', 1000), ('reg_lambda', 9.942953172304561)])
3.5111916498146747
OrderedDict([('eta', 0.0767425263728436), ('gamma', 5.684946091773975), ('max_depth', 18), ('n_estimators', 34), ('reg_lambda', 6.095441348912608)])
4.251398255522526
OrderedDict([('eta', 0.013594004182195795), ('gamma', 5.724810137646261), ('max_depth', 15), ('n_estimators', 262), ('reg_lambda', 5.786643362283849)])
2.484272186331002
OrderedDict([('eta', 0.39683716415545056), ('gamma', 7.0), ('max_depth', 1), ('n_estimators', 416), ('reg_lambda', 9.775113804233674)])
2.2985392668740716
OrderedDict([('eta', 0.01), ('gamma', 7.0), ('max_depth', 20), ('n_estimators', 323), ('reg_lambda', 10.0)])
1.5857074556695365
OrderedDict([('eta', 0.01), ('gamma', 7.0), ('max_depth'

In [45]:
# Hold-out evaluation: predict each target's test split with its selected
# estimator and print the RMSE (square root of the mean squared error), one
# line per target in the order ya1, ya2, yb1, yb2, yb3, yb4, yc.
#
# mean_squared_error is called as (y_true, y_pred), the parameter order
# scikit-learn documents. The value is the same either way for plain MSE,
# but the documented order keeps the call correct if the metric is later
# replaced by an asymmetric one or sample weights are added.

predicoes_ya1 = estimator_ya1.predict(X_test_ya1)
print(np.sqrt(mean_squared_error(ya1_test, predicoes_ya1)))

predicoes_ya2 = estimator_ya2.predict(X_test_ya2)
print(np.sqrt(mean_squared_error(ya2_test, predicoes_ya2)))

predicoes_yb1 = estimator_yb1.predict(X_test_yb1)
print(np.sqrt(mean_squared_error(yb1_test, predicoes_yb1)))

predicoes_yb2 = estimator_yb2.predict(X_test_yb2)
print(np.sqrt(mean_squared_error(yb2_test, predicoes_yb2)))

predicoes_yb3 = estimator_yb3.predict(X_test_yb3)
print(np.sqrt(mean_squared_error(yb3_test, predicoes_yb3)))

predicoes_yb4 = estimator_yb4.predict(X_test_yb4)
print(np.sqrt(mean_squared_error(yb4_test, predicoes_yb4)))

predicoes_yc = estimator_yc.predict(X_test_yc)
print(np.sqrt(mean_squared_error(yc_test, predicoes_yc)))

5.939133634285067
3.375217650139085
5.350164057834889
3.1885227192079357
2.012453922553288
1.9316194348028335
2.4180963299695573


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  

In [46]:
# Side-by-side view for ya1: pair each observed test value with the
# prediction at the same position, as (observed, predicted) tuples.
pred_list = list(predicoes_ya1)
test_list = list(ya1_test)
complist = [
    (observed, pred_list[pos])
    for pos, observed in enumerate(test_list)
]
complist

[(0, 1.6544214),
 (8, 7.742039),
 (0, 2.189793),
 (8, 2.1524053),
 (1, 0.9669367),
 (0, -0.20766462),
 (11, 4.661025),
 (0, -0.4518466),
 (1, -0.35034743),
 (4, 4.6611633),
 (1, 3.7931108),
 (5, 6.0076065),
 (1, 1.7208542),
 (3, 3.7942326),
 (3, 11.790098),
 (1, 1.6642836),
 (0, 0.8186923),
 (0, 2.169525),
 (0, 1.164871),
 (46, 5.015874),
 (0, 3.170341),
 (0, -0.58087105),
 (0, 2.189793),
 (1, 6.102502),
 (1, 5.490651),
 (0, 2.5031083),
 (4, 0.54121786),
 (0, 1.2558607),
 (1, 1.0175874),
 (0, 6.034958),
 (0, 2.6680322),
 (0, 1.5127999),
 (0, 2.1524053),
 (0, 1.5986435),
 (0, -0.3879854),
 (1, 2.5124419),
 (3, 4.2812314),
 (0, 2.1524053),
 (1, 2.1524053),
 (15, 5.0054884),
 (0, 2.1524053),
 (2, 4.0561666),
 (1, 2.1524053),
 (0, 1.4611434),
 (1, -0.20766462),
 (18, 20.063534),
 (0, 1.7137239),
 (0, -0.09987513),
 (0, 2.716701),
 (0, -1.1331767),
 (0, 1.677288),
 (0, -0.94504267),
 (0, 0.9688235),
 (0, 0.45726445),
 (0, 0.828889),
 (0, 0.061199486),
 (0, 0.33460346),
 (0, 4.113098),
 (0, 

In [47]:
# Side-by-side view for ya2: pair each observed test value with the
# prediction at the same position, as (observed, predicted) tuples.
pred_list = list(predicoes_ya2)
test_list = list(ya2_test)
complist = [
    (observed, pred_list[pos])
    for pos, observed in enumerate(test_list)
]
complist

[(1, 0.3754376),
 (4, 3.960713),
 (2, 0.8912915),
 (3, 1.8856685),
 (0, 0.91466457),
 (0, -0.15808249),
 (9, 2.412734),
 (0, 0.050864726),
 (0, 0.76828074),
 (0, 3.7035556),
 (0, 3.5765655),
 (3, 2.5659642),
 (1, 0.40382892),
 (2, 2.1043515),
 (5, 8.5441475),
 (0, 1.5465772),
 (0, 1.3629595),
 (6, 1.5465772),
 (0, 1.3299563),
 (14, 3.910461),
 (0, 1.3541044),
 (1, -0.51117694),
 (0, 1.5247695),
 (2, 4.4269595),
 (2, 5.3409266),
 (2, 2.492778),
 (4, 0.848662),
 (2, 0.8505134),
 (0, 1.5002146),
 (0, 2.7486184),
 (0, 1.8856685),
 (0, 1.8856685),
 (2, 1.8856685),
 (0, 1.500721),
 (1, -0.28929603),
 (0, 1.0228771),
 (5, 3.9620216),
 (2, 1.532574),
 (5, 2.0964077),
 (4, 3.8517795),
 (0, 1.8856685),
 (1, 2.1868234),
 (8, 1.8856685),
 (0, 1.0718005),
 (0, -0.15808249),
 (2, 7.6842136),
 (0, -0.53207463),
 (0, -0.70136726),
 (0, 3.209096),
 (0, -1.3296324),
 (1, 0.3412611),
 (0, -0.072298236),
 (1, 1.3963557),
 (0, 0.8146434),
 (0, 0.9847631),
 (0, -0.53207463),
 (0, 0.2205848),
 (6, 6.6647935)

In [48]:
# Side-by-side view for yb1: pair each observed test value with the
# prediction at the same position, as (observed, predicted) tuples.
pred_list = list(predicoes_yb1)
test_list = list(yb1_test)
complist = [
    (observed, pred_list[pos])
    for pos, observed in enumerate(test_list)
]
complist

[(0, 0.27145958),
 (7, 0.6519885),
 (0, 0.7205555),
 (1, 1.3723118),
 (0, 0.66657203),
 (0, 0.23549947),
 (6, 2.5315666),
 (0, 0.22211653),
 (0, 0.29010534),
 (0, 1.0012683),
 (1, 3.7577221),
 (0, 0.6900773),
 (0, 0.8930736),
 (0, 1.3788916),
 (3, 1.1051788),
 (1, 5.7627754),
 (0, 0.8018064),
 (0, 0.95534575),
 (0, 0.60691404),
 (3, 1.5779246),
 (0, 0.73096704),
 (1, 0.22211653),
 (14, 2.7583807),
 (0, 1.5779246),
 (5, 1.7291154),
 (0, 1.5779246),
 (9, 0.67121243),
 (3, 3.673317),
 (1, 1.084197),
 (0, 1.5130695),
 (2, 1.1601341),
 (2, 1.1710583),
 (0, 1.1601341),
 (0, 0.83180714),
 (1, 0.67121243),
 (1, 0.83180714),
 (9, 5.608441),
 (0, 1.1052563),
 (21, 36.766426),
 (1, 1.2189316),
 (2, 1.3723118),
 (0, 1.1601341),
 (1, 6.8561473),
 (2, 0.4940416),
 (0, 0.22211653),
 (0, 1.0594339),
 (0, 0.6339479),
 (0, 0.24135172),
 (3, 1.5423143),
 (0, 0.27145958),
 (37, 2.0821571),
 (0, 0.8709474),
 (0, 0.811098),
 (0, 1.0768529),
 (0, 0.83180714),
 (0, 0.6012102),
 (0, 0.27145958),
 (0, 1.1526284

In [49]:
# Side-by-side view for yb2: pair each observed test value with the
# prediction at the same position, as (observed, predicted) tuples.
pred_list = list(predicoes_yb2)
test_list = list(yb2_test)
complist = [
    (observed, pred_list[pos])
    for pos, observed in enumerate(test_list)
]
complist

[(0, 0.3080121),
 (0, 0.77247417),
 (5, 0.6890715),
 (0, 0.81774443),
 (0, 0.4006054),
 (0, 0.3037108),
 (3, 4.013461),
 (0, 0.2903435),
 (0, 0.25301144),
 (0, 0.8418446),
 (2, 3.7718432),
 (0, 0.6612491),
 (1, 3.4605267),
 (4, 0.61599815),
 (8, 0.97944564),
 (3, 1.3219881),
 (2, 0.92117405),
 (0, 0.8168411),
 (0, 0.36316624),
 (1, 1.8655862),
 (1, 0.92450094),
 (0, 0.3037108),
 (5, 2.3077445),
 (1, 1.2215021),
 (4, 1.9969674),
 (3, 1.4520954),
 (8, 1.2867467),
 (4, 3.943832),
 (0, 0.92117405),
 (0, 1.8161957),
 (1, 1.0076436),
 (1, 0.94613606),
 (2, 1.0076436),
 (0, 0.76410985),
 (0, 0.6547131),
 (0, 0.911894),
 (11, 8.160089),
 (0, 0.7933142),
 (3, 3.655504),
 (0, 1.2724022),
 (0, 1.0076436),
 (3, 1.0076436),
 (0, 4.0303845),
 (1, 0.22983178),
 (1, 0.3037108),
 (2, 0.5201057),
 (0, 0.20307928),
 (0, 0.20307928),
 (1, 1.0076436),
 (0, 0.20307928),
 (8, 0.70379627),
 (0, 0.4371537),
 (1, 1.3194852),
 (2, 0.52191854),
 (0, 0.5145752),
 (0, 0.20307928),
 (0, 0.25301144),
 (0, 0.911894),


In [50]:
# Side-by-side view for yb3: pair each observed test value with the
# prediction at the same position, as (observed, predicted) tuples.
pred_list = list(predicoes_yb3)
test_list = list(yb3_test)
complist = [
    (observed, pred_list[pos])
    for pos, observed in enumerate(test_list)
]
complist

[(0, 0.31590325),
 (2, 1.1978548),
 (0, 0.46143058),
 (0, 0.7979511),
 (0, 0.5043288),
 (0, 0.31590325),
 (5, 1.3421016),
 (0, -0.10761082),
 (0, 0.5781252),
 (0, 1.3105289),
 (2, 0.5210029),
 (0, 0.5210029),
 (1, 0.5210029),
 (1, 0.40832862),
 (3, 1.3605589),
 (1, 0.5210029),
 (0, 0.5210029),
 (0, 0.5210029),
 (0, 0.5043288),
 (1, 1.5843982),
 (0, 0.7479211),
 (0, -0.10761082),
 (1, 0.866967),
 (1, 1.1669137),
 (0, 0.95202),
 (2, 1.6303735),
 (5, 0.34875652),
 (0, 0.7479211),
 (0, 0.5710329),
 (0, 1.1669137),
 (2, 0.7479211),
 (1, 0.7979511),
 (2, 0.7479211),
 (0, 0.7979511),
 (0, 0.34875652),
 (0, 0.7979511),
 (2, 0.7479211),
 (1, 0.7479211),
 (6, 2.6951747),
 (0, 1.3605589),
 (3, 0.7979511),
 (0, 0.7979511),
 (0, 0.7979511),
 (0, 0.70466936),
 (0, 0.20322923),
 (0, 1.3605589),
 (0, -0.078979746),
 (0, 0.19553183),
 (0, 1.9720318),
 (0, -0.13855185),
 (0, 2.55156),
 (0, 0.2383533),
 (3, 0.5210029),
 (2, 0.064635366),
 (0, 0.7979511),
 (0, -0.078979746),
 (0, 0.005063529),
 (0, 0.7979

In [51]:
# Side-by-side view for yb4: pair each observed test value with the
# prediction at the same position, as (observed, predicted) tuples.
pred_list = list(predicoes_yb4)
test_list = list(yb4_test)
complist = [
    (observed, pred_list[pos])
    for pos, observed in enumerate(test_list)
]
complist

[(0, 0.2187954),
 (0, 0.3952857),
 (0, 0.3474573),
 (0, 0.990039),
 (3, 0.5865002),
 (0, 0.2187954),
 (0, 0.9201729),
 (0, 0.46053895),
 (0, 0.23324159),
 (4, 0.4064731),
 (3, 0.36915997),
 (0, 0.3474573),
 (0, 0.3474573),
 (0, 0.3362699),
 (1, 1.6723902),
 (0, 0.36915997),
 (0, 0.36915997),
 (5, 0.36915997),
 (0, 0.3982599),
 (13, 0.8382749),
 (0, 0.42221588),
 (0, 0.2187954),
 (0, 0.3362699),
 (7, 0.7884308),
 (0, 0.79429656),
 (0, 0.7884308),
 (3, 0.3362699),
 (0, 0.42221588),
 (0, 0.36915997),
 (0, 0.79429656),
 (0, 0.42221588),
 (0, 0.49531367),
 (0, 0.42221588),
 (0, 0.52076805),
 (0, 0.3362699),
 (0, 0.52076805),
 (1, 0.42221588),
 (0, 0.42221588),
 (1, 1.0891188),
 (0, 0.8382749),
 (0, 0.49531367),
 (0, 0.42221588),
 (0, 9.514479),
 (0, 0.4064731),
 (0, 0.2187954),
 (0, 0.5896965),
 (0, 0.23910733),
 (0, 0.2187954),
 (6, 1.7221359),
 (0, 0.2187954),
 (2, 0.6128405),
 (0, 0.2187954),
 (1, 0.36190343),
 (0, 0.2187954),
 (0, 0.42980567),
 (0, 0.23087253),
 (0, 0.2187954),
 (0, 0.5

In [52]:
# Side-by-side view for yc: pair each observed test value with the
# prediction at the same position, as (observed, predicted) tuples.
pred_list = list(predicoes_yc)
test_list = list(yc_test)
complist = [
    (observed, pred_list[pos])
    for pos, observed in enumerate(test_list)
]
complist

[(0, 0.37555546),
 (0, 0.79886574),
 (0, 0.64915556),
 (0, 0.65919656),
 (1, 0.9946021),
 (0, 0.36574942),
 (0, 1.7350303),
 (0, 0.36574942),
 (0, 0.37555546),
 (0, 0.80867165),
 (0, 1.0314583),
 (1, 0.64915556),
 (0, 0.64915556),
 (3, 0.63934964),
 (0, 1.637487),
 (0, 0.65919656),
 (0, 0.65919656),
 (1, 0.65919656),
 (3, 0.84821296),
 (2, 0.89702636),
 (0, 0.65919656),
 (0, 0.36574942),
 (4, 0.64915556),
 (0, 0.85174423),
 (3, 0.85174423),
 (0, 0.85174423),
 (5, 0.63934964),
 (0, 0.65919656),
 (2, 0.65919656),
 (4, 1.0433902),
 (1, 0.65919656),
 (0, 0.65919656),
 (0, 0.65919656),
 (0, 0.65919656),
 (0, 0.63934964),
 (0, 0.65919656),
 (5, 1.0314583),
 (0, 0.65919656),
 (9, 5.009901),
 (0, 0.89702636),
 (0, 0.65919656),
 (0, 0.65919656),
 (1, 0.78790027),
 (0, 0.7729654),
 (0, 0.36574942),
 (0, 0.89702636),
 (0, 0.37555546),
 (0, 0.37555546),
 (0, 0.65919656),
 (0, 0.37555546),
 (8, 4.435337),
 (0, 0.36574942),
 (0, 0.64915556),
 (0, 0.37555546),
 (0, 0.65919656),
 (0, 0.37555546),
 (0,