In [72]:
from datetime import datetime, date
import pandas as pd
import plotly.express as px

from pyspark.ml import Pipeline
from pyspark.sql import Row
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorIndexer, VectorAssembler

### Modelos utilizados ###

from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier

spark = SparkSession.builder.getOrCreate()

df = spark.read.option("delimiter", ";").option("header", True).csv('Data/inmet_filtered_A401_H_2000-05-12_2023-05-16.csv')

dfTransformado = df.withColumn('PRECIPITACAO TOTAL, HORARIO(mm)', regexp_replace('PRECIPITACAO TOTAL, HORARIO(mm)', ',', '.').cast(DoubleType()))

dfTransformado = dfTransformado.drop(
           'PRESSAO ATMOSFERICA REDUZIDA NIVEL DO MAR, AUT(mB)',
           'PRESSAO ATMOSFERICA REDUZIDA NIVEL DO MAR, AUT(mB)',
           'PRESSAO ATMOSFERICA MAX.NA HORA ANT. (AUT)(mB)',
           'PRESSAO ATMOSFERICA MIN. NA HORA ANT. (AUT)(mB)',
           'PRESSAO ATMOSFERICA AO NIVEL DA ESTACAO, HORARIA(mB)',
           'TEMPERATURA DA CPU DA ESTACAO(°C)',
           'TEMPERATURA DO AR - BULBO SECO, HORARIA(°C)',
           'TEMPERATURA DO PONTO DE ORVALHO(°C)',
           'TEMPERATURA MAXIMA NA HORA ANT. (AUT)(°C)',
           'TEMPERATURA MINIMA NA HORA ANT. (AUT)(°C)',
           'TEMPERATURA ORVALHO MAX. NA HORA ANT. (AUT)(°C)',
           'TEMPERATURA ORVALHO MIN. NA HORA ANT. (AUT)(°C)',
           'TENSAO DA BATERIA DA ESTACAO(V)',
           'UMIDADE REL. MAX. NA HORA ANT. (AUT)(%)',
           'UMIDADE REL. MIN. NA HORA ANT. (AUT)(%)',
           'UMIDADE RELATIVA DO AR, HORARIA(%)',
           'VENTO, DIRECAO HORARIA (gr)(° (gr))',
           'VENTO, RAJADA MAXIMA(m/s)',
           'VENTO, VELOCIDADE HORARIA(m/s)',
           'Unnamed: 22',
           '_c22'
)

df = dfTransformado.withColumn("Inicio_Semana",date_sub(next_day(col("Data Medicao"),"sunday"),7))\
                    .groupBy("Inicio_Semana").agg\
                        (sum("PRECIPITACAO TOTAL, HORARIO(mm)").cast("float").alias("Total de Chuvas(mm)"),\
                         sum("PRECIPITACAO TOTAL, HORARIO(mm)").cast("float").alias("Média diária de chuvas(mm)"))\
                    .orderBy("Inicio_Semana")

df = df.select('Inicio_Semana','Total de Chuvas(mm)',col('Média diária de chuvas(mm)')/ 7 )

df = df.withColumnRenamed("(Média diária de chuvas(mm) / 7)","Média diária de chuvas(mm)")

df1 = df.withColumn('Semana_Ano',weekofyear(df.Inicio_Semana))
df1.toPandas()

Unnamed: 0,Inicio_Semana,Total de Chuvas(mm),Média diária de chuvas(mm),Semana_Ano
0,2000-05-07,,,18
1,2000-05-14,,,19
2,2000-05-21,,,20
3,2000-05-28,,,21
4,2000-06-04,,,22
...,...,...,...,...
1197,2023-04-16,6.600000,0.942857,15
1198,2023-04-23,94.800003,13.542858,16
1199,2023-04-30,49.400002,7.057143,17
1200,2023-05-07,117.800003,16.828572,18


In [73]:
#Farol da Barra - SSA FB 100 | Farol da Barra - SSA FB 200 | Porto da Barra - SSA PB 100 | Santa Maria - SSA SM 100

#dfCompleto = df1.withColumn('Farol da Barra - SSA FB 100', when(rand() > 0.5, 'Imprópria').otherwise('Própria'))\
#                .withColumn("Farol da Barra - SSA FB 200", when(rand() > 0.5, 'Imprópria').otherwise('Própria'))\
#                .withColumn("Porto da Barra - SSA PB 100", when(rand() > 0.5, 'Imprópria').otherwise('Própria'))\
#                .withColumn("Santa Maria - SSA SM 100", when(rand() > 0.5, 'Imprópria').otherwise('Própria'))

#dfCompleto = df1.withColumn('Farol da Barra - SSA FB 100', lit(None))\
#                .withColumn("Farol da Barra - SSA FB 200", lit(None))\
#                .withColumn("Porto da Barra - SSA PB 100", lit(None))\
#                .withColumn("Santa Maria - SSA SM 100", lit(None))


# Criando a coluna ano #
dfCompleto = dfCompleto.withColumn("Ano", substring(dfCompleto.Inicio_Semana, 1,4))

dfCompleto.toPandas()

Unnamed: 0,Inicio_Semana,Total de Chuvas(mm),Média diária de chuvas(mm),Semana_Ano,Farol da Barra - SSA FB 100,Farol da Barra - SSA FB 200,Porto da Barra - SSA PB 100,Santa Maria - SSA SM 100,Ano
0,2000-05-07,,,18,,,,,2000
1,2000-05-14,,,19,,,,,2000
2,2000-05-21,,,20,,,,,2000
3,2000-05-28,,,21,,,,,2000
4,2000-06-04,,,22,,,,,2000
...,...,...,...,...,...,...,...,...,...
1197,2023-04-16,6.600000,0.942857,15,,,,,2023
1198,2023-04-23,94.800003,13.542858,16,,,,,2023
1199,2023-04-30,49.400002,7.057143,17,,,,,2023
1200,2023-05-07,117.800003,16.828572,18,,,,,2023


In [74]:
inema = spark.read.option("header",True).csv("Data/inema_filtered_balneabilidade_farol_barra.csv")

inema = inema.withColumnRenamed("01/2007","numero_boletim").withColumnRenamed("Farol da Barra - SSA FB 100","ponto_codigo").withColumnRenamed("Indisponível","categoria")

inema.toPandas()

Unnamed: 0,numero_boletim,categoria,ponto_codigo
0,01/2007,Indisponível,Farol da Barra - SSA FB 200
1,01/2007,Indisponível,Porto da Barra - SSA PB 100
2,01/2007,Indisponível,Santa Maria - SSA SM 100
3,02/2007,Indisponível,Farol da Barra - SSA FB 100
4,02/2007,Indisponível,Farol da Barra - SSA FB 200
...,...,...,...
2826,20/2023,Imprópria,Santa Maria - SSA SM 100
2827,21/2023,Imprópria,Farol da Barra - SSA FB 100
2828,21/2023,Imprópria,Farol da Barra - SSA FB 200
2829,21/2023,Própria,Porto da Barra - SSA PB 100


In [75]:
inema = inema.withColumn("Ano",substring(inema.numero_boletim, 4,7)).withColumn("Semana_Ano", substring(inema.numero_boletim, 1,2))

In [76]:
inema.toPandas()

Unnamed: 0,numero_boletim,categoria,ponto_codigo,ano,semana
0,01/2007,Indisponível,Farol da Barra - SSA FB 200,2007,01
1,01/2007,Indisponível,Porto da Barra - SSA PB 100,2007,01
2,01/2007,Indisponível,Santa Maria - SSA SM 100,2007,01
3,02/2007,Indisponível,Farol da Barra - SSA FB 100,2007,02
4,02/2007,Indisponível,Farol da Barra - SSA FB 200,2007,02
...,...,...,...,...,...
2826,20/2023,Imprópria,Santa Maria - SSA SM 100,2023,20
2827,21/2023,Imprópria,Farol da Barra - SSA FB 100,2023,21
2828,21/2023,Imprópria,Farol da Barra - SSA FB 200,2023,21
2829,21/2023,Própria,Porto da Barra - SSA PB 100,2023,21


In [4]:
### Descritiva dos dados ###

dfCompleto.describe().toPandas()

23/07/06 11:29:53 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

Unnamed: 0,summary,Total de Chuvas(mm),Média diária de chuvas(mm),Semana_Ano,Farol da Barra - SSA FB 100,Farol da Barra - SSA FB 200,Porto da Barra - SSA PB 100,Santa Maria - SSA SM 100,ano
0,count,1115.0,1115.0,1202.0,1202,1202,1202,1202,1202.0
1,mean,30.80591932758355,4.400845618226224,26.57487520798669,,,,,2011.3569051580696
2,stddev,40.63941721415994,5.80563103059428,15.058303598679691,,,,,6.66529953498471
3,min,0.0,0.0,1.0,Imprópria,Imprópria,Imprópria,Imprópria,2000.0
4,max,364.8,52.1142839704241,53.0,Própria,Própria,Própria,Própria,2023.0


In [5]:
### Contabilizando os Nulls por coluna ###

dfCompleto.select([count(when(isnull(c), c)).alias(c) for c in dfCompleto.columns]).toPandas()

Unnamed: 0,Inicio_Semana,Total de Chuvas(mm),Média diária de chuvas(mm),Semana_Ano,Farol da Barra - SSA FB 100,Farol da Barra - SSA FB 200,Porto da Barra - SSA PB 100,Santa Maria - SSA SM 100,ano
0,0,87,87,0,0,0,0,0,0


In [6]:
### Removendo os Nulls ####

dfCompleto = dfCompleto.replace('?', None).dropna(how='any')

In [7]:
dfCompleto.toPandas()

Unnamed: 0,Inicio_Semana,Total de Chuvas(mm),Média diária de chuvas(mm),Semana_Ano,Farol da Barra - SSA FB 100,Farol da Barra - SSA FB 200,Porto da Barra - SSA PB 100,Santa Maria - SSA SM 100,ano
0,2000-10-01,0.000000,0.000000,39,Imprópria,Própria,Própria,Própria,2000
1,2000-10-08,0.200000,0.028571,40,Imprópria,Imprópria,Imprópria,Própria,2000
2,2000-10-15,0.800000,0.114286,41,Própria,Imprópria,Própria,Própria,2000
3,2000-10-22,3.600000,0.514286,42,Própria,Imprópria,Própria,Própria,2000
4,2000-10-29,4.600000,0.657143,43,Própria,Própria,Imprópria,Própria,2000
...,...,...,...,...,...,...,...,...,...
1110,2023-04-16,6.600000,0.942857,15,Própria,Própria,Própria,Própria,2023
1111,2023-04-23,94.800003,13.542858,16,Própria,Imprópria,Imprópria,Própria,2023
1112,2023-04-30,49.400002,7.057143,17,Própria,Própria,Própria,Própria,2023
1113,2023-05-07,117.800003,16.828572,18,Imprópria,Imprópria,Imprópria,Imprópria,2023


In [8]:
### Transformando os valores qualitativos em numéricos => 0 = Própria | 1 = Imprópria | 2 = Indisponível ###

dfCompleto = StringIndexer(
    inputCol='Farol da Barra - SSA FB 100', 
    outputCol='Farol_100', 
    handleInvalid='keep').fit(dfCompleto).transform(dfCompleto)

dfCompleto = StringIndexer(
    inputCol='Farol da Barra - SSA FB 200', 
    outputCol='Farol_200', 
    handleInvalid='keep').fit(dfCompleto).transform(dfCompleto)

dfCompleto = StringIndexer(
    inputCol='Porto da Barra - SSA PB 100', 
    outputCol='Porto_100', 
    handleInvalid='keep').fit(dfCompleto).transform(dfCompleto)

dfCompleto = StringIndexer(
    inputCol='Santa Maria - SSA SM 100', 
    outputCol='Santa Maria_100', 
    handleInvalid='keep').fit(dfCompleto).transform(dfCompleto)

In [9]:
### Removendo as colunas após transformação ###

dfCompleto = dfCompleto.drop('Farol da Barra - SSA FB 100','Farol da Barra - SSA FB 200','Porto da Barra - SSA PB 100','Santa Maria - SSA SM 100')

In [10]:
dfCompleto.toPandas()

Unnamed: 0,Inicio_Semana,Total de Chuvas(mm),Média diária de chuvas(mm),Semana_Ano,ano,Farol_100,Farol_200,Porto_100,Santa Maria_100
0,2000-10-01,0.000000,0.000000,39,2000,1.0,1.0,1.0,0.0
1,2000-10-08,0.200000,0.028571,40,2000,1.0,0.0,0.0,0.0
2,2000-10-15,0.800000,0.114286,41,2000,0.0,0.0,1.0,0.0
3,2000-10-22,3.600000,0.514286,42,2000,0.0,0.0,1.0,0.0
4,2000-10-29,4.600000,0.657143,43,2000,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
1110,2023-04-16,6.600000,0.942857,15,2023,0.0,1.0,1.0,0.0
1111,2023-04-23,94.800003,13.542858,16,2023,0.0,0.0,0.0,0.0
1112,2023-04-30,49.400002,7.057143,17,2023,0.0,1.0,1.0,0.0
1113,2023-05-07,117.800003,16.828572,18,2023,1.0,0.0,0.0,1.0


In [11]:
informacoes_necessarias = ['Total de Chuvas(mm)','Média diária de chuvas(mm)']

assembler = VectorAssembler(inputCols=informacoes_necessarias, outputCol='informacoes')

dataset = assembler.transform(dfCompleto)

In [12]:
dataset.toPandas()

Unnamed: 0,Inicio_Semana,Total de Chuvas(mm),Média diária de chuvas(mm),Semana_Ano,ano,Farol_100,Farol_200,Porto_100,Santa Maria_100,informacoes
0,2000-10-01,0.000000,0.000000,39,2000,1.0,1.0,1.0,0.0,"(0.0, 0.0)"
1,2000-10-08,0.200000,0.028571,40,2000,1.0,0.0,0.0,0.0,"[0.20000000298023224, 0.028571428997176036]"
2,2000-10-15,0.800000,0.114286,41,2000,0.0,0.0,1.0,0.0,"[0.800000011920929, 0.11428571598870414]"
3,2000-10-22,3.600000,0.514286,42,2000,0.0,0.0,1.0,0.0,"[3.5999999046325684, 0.5142857006617955]"
4,2000-10-29,4.600000,0.657143,43,2000,0.0,1.0,0.0,0.0,"[4.599999904632568, 0.6571428435189384]"
...,...,...,...,...,...,...,...,...,...,...
1110,2023-04-16,6.600000,0.942857,15,2023,0.0,1.0,1.0,0.0,"[6.599999904632568, 0.9428571292332241]"
1111,2023-04-23,94.800003,13.542858,16,2023,0.0,0.0,0.0,0.0,"[94.80000305175781, 13.542857578822545]"
1112,2023-04-30,49.400002,7.057143,17,2023,0.0,1.0,1.0,0.0,"[49.400001525878906, 7.057143075125558]"
1113,2023-05-07,117.800003,16.828572,18,2023,1.0,0.0,0.0,1.0,"[117.80000305175781, 16.82857186453683]"


In [13]:
### Dividindo o dataser em treino (80%) e teste (20%) ###

(treino, teste) = dataset.randomSplit([0.8,0.2])

In [14]:
### Definindo os modelos ###

gbt = GBTClassifier(labelCol="Farol_200", featuresCol="informacoes", maxIter=10)
dt = DecisionTreeClassifier(labelCol='Farol_200',featuresCol='informacoes')
rf = RandomForestClassifier(labelCol='Farol_200',featuresCol='informacoes',maxDepth=5)

In [67]:
### Treinando os modelos #### 

gbtModel = gbt.fit(treino) ### Gradient Boosted Tree Classifier

gbtModel.write().overwrite().save('models/gbt')

##model.write().overwrite().save(basePath + "/model")

rfModel = rf.fit(treino) ### Random Forest Classifier

rfModel.write().overwrite().save('models/rf')

dtModel = dt.fit(treino) ### Decision Tree classfier

dtModel.write().overwrite().save('models/dt')

In [71]:
model2 = RandomForestClassificationModel.load('models/rf')

NameError: name 'RandomForestClassificationModel' is not defined

In [60]:
### Testando os modelos ###

gbtPredicao = gbtModel.transform(teste)

gbtPredicao.toPandas()

Unnamed: 0,Inicio_Semana,Total de Chuvas(mm),Média diária de chuvas(mm),Semana_Ano,ano,Farol_100,Farol_200,Porto_100,Santa Maria_100,informacoes,rawPrediction,probability,prediction
0,2000-10-29,4.600000,0.657143,43,2000,0.0,1.0,0.0,0.0,"[4.599999904632568, 0.6571428435189384]","[0.044258656142341445, -0.044258656142341445]","[0.522114890196365, 0.47788510980363497]",0.0
1,2000-12-10,79.599998,11.371428,49,2000,0.0,1.0,1.0,1.0,"[79.5999984741211, 11.37142835344587]","[-0.14414400187956028, 0.14414400187956028]","[0.4284230436576145, 0.5715769563423855]",1.0
2,2001-02-04,11.200000,1.600000,5,2001,0.0,1.0,0.0,0.0,"[11.199999809265137, 1.5999999727521623]","[0.007238707197632764, -0.007238707197632764]","[0.5036192903834474, 0.49638070961655256]",0.0
3,2001-02-25,38.000000,5.428571,8,2001,0.0,1.0,1.0,0.0,"[38.0, 5.428571428571429]","[0.2844274266918683, -0.2844274266918683]","[0.6384988965773151, 0.36150110342268493]",0.0
4,2001-03-25,2.800000,0.400000,12,2001,1.0,1.0,0.0,0.0,"[2.799999952316284, 0.3999999931880406]","[0.11816854185755793, -0.11816854185755793]","[0.5588107846589144, 0.44118921534108557]",0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
224,2023-04-09,12.800000,1.828571,14,2023,1.0,0.0,1.0,0.0,"[12.800000190734863, 1.8285714558192663]","[0.0897525890087046, -0.0897525890087046]","[0.5447561807837401, 0.4552438192162599]",0.0
225,2023-04-16,6.600000,0.942857,15,2023,0.0,1.0,1.0,0.0,"[6.599999904632568, 0.9428571292332241]","[-0.024840482656625206, 0.024840482656625206]","[0.48758231267620294, 0.5124176873237971]",1.0
226,2023-04-23,94.800003,13.542858,16,2023,0.0,0.0,0.0,0.0,"[94.80000305175781, 13.542857578822545]","[-0.0012868590224174175, 0.0012868590224174175]","[0.49935657084396545, 0.5006434291560345]",1.0
227,2023-05-07,117.800003,16.828572,18,2023,1.0,0.0,0.0,1.0,"[117.80000305175781, 16.82857186453683]","[-0.1864720442076026, 0.1864720442076026]","[0.40782981802622337, 0.5921701819737766]",1.0


In [17]:
rfPredicao = rfModel.transform(teste)

rfPredicao.toPandas()

Unnamed: 0,Inicio_Semana,Total de Chuvas(mm),Média diária de chuvas(mm),Semana_Ano,ano,Farol_100,Farol_200,Porto_100,Santa Maria_100,informacoes,rawPrediction,probability,prediction
0,2000-10-29,4.600000,0.657143,43,2000,0.0,1.0,0.0,0.0,"[4.599999904632568, 0.6571428435189384]","[11.033605078364957, 8.966394921635043, 0.0]","[0.5516802539182478, 0.4483197460817522, 0.0]",0.0
1,2000-12-10,79.599998,11.371428,49,2000,0.0,1.0,1.0,1.0,"[79.5999984741211, 11.37142835344587]","[9.156471726368826, 10.843528273631174, 0.0]","[0.45782358631844133, 0.5421764136815587, 0.0]",1.0
2,2001-02-04,11.200000,1.600000,5,2001,0.0,1.0,0.0,0.0,"[11.199999809265137, 1.5999999727521623]","[9.858967321002918, 10.141032678997082, 0.0]","[0.4929483660501459, 0.5070516339498541, 0.0]",1.0
3,2001-02-25,38.000000,5.428571,8,2001,0.0,1.0,1.0,0.0,"[38.0, 5.428571428571429]","[11.874087674182167, 8.12591232581783, 0.0]","[0.5937043837091085, 0.40629561629089156, 0.0]",0.0
4,2001-03-25,2.800000,0.400000,12,2001,1.0,1.0,0.0,0.0,"[2.799999952316284, 0.3999999931880406]","[10.547000106457288, 9.452999893542712, 0.0]","[0.5273500053228644, 0.4726499946771356, 0.0]",0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
224,2023-04-09,12.800000,1.828571,14,2023,1.0,0.0,1.0,0.0,"[12.800000190734863, 1.8285714558192663]","[10.292207715853051, 9.707792284146949, 0.0]","[0.5146103857926525, 0.48538961420734744, 0.0]",0.0
225,2023-04-16,6.600000,0.942857,15,2023,0.0,1.0,1.0,0.0,"[6.599999904632568, 0.9428571292332241]","[9.860137683379408, 10.139862316620592, 0.0]","[0.4930068841689704, 0.5069931158310296, 0.0]",1.0
226,2023-04-23,94.800003,13.542858,16,2023,0.0,0.0,0.0,0.0,"[94.80000305175781, 13.542857578822545]","[9.076895862595912, 10.923104137404088, 0.0]","[0.4538447931297956, 0.5461552068702045, 0.0]",1.0
227,2023-05-07,117.800003,16.828572,18,2023,1.0,0.0,0.0,1.0,"[117.80000305175781, 16.82857186453683]","[8.555926825495217, 11.444073174504785, 0.0]","[0.42779634127476085, 0.5722036587252393, 0.0]",1.0


In [18]:
dtPredicao = dtModel.transform(teste)

dtPredicao.toPandas()

Unnamed: 0,Inicio_Semana,Total de Chuvas(mm),Média diária de chuvas(mm),Semana_Ano,ano,Farol_100,Farol_200,Porto_100,Santa Maria_100,informacoes,rawPrediction,probability,prediction
0,2000-10-29,4.600000,0.657143,43,2000,0.0,1.0,0.0,0.0,"[4.599999904632568, 0.6571428435189384]","[120.0, 104.0, 0.0]","[0.5357142857142857, 0.4642857142857143, 0.0]",0.0
1,2000-12-10,79.599998,11.371428,49,2000,0.0,1.0,1.0,1.0,"[79.5999984741211, 11.37142835344587]","[12.0, 16.0, 0.0]","[0.42857142857142855, 0.5714285714285714, 0.0]",1.0
2,2001-02-04,11.200000,1.600000,5,2001,0.0,1.0,0.0,0.0,"[11.199999809265137, 1.5999999727521623]","[120.0, 104.0, 0.0]","[0.5357142857142857, 0.4642857142857143, 0.0]",0.0
3,2001-02-25,38.000000,5.428571,8,2001,0.0,1.0,1.0,0.0,"[38.0, 5.428571428571429]","[136.0, 113.0, 0.0]","[0.5461847389558233, 0.4538152610441767, 0.0]",0.0
4,2001-03-25,2.800000,0.400000,12,2001,1.0,1.0,0.0,0.0,"[2.799999952316284, 0.3999999931880406]","[120.0, 104.0, 0.0]","[0.5357142857142857, 0.4642857142857143, 0.0]",0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
224,2023-04-09,12.800000,1.828571,14,2023,1.0,0.0,1.0,0.0,"[12.800000190734863, 1.8285714558192663]","[120.0, 104.0, 0.0]","[0.5357142857142857, 0.4642857142857143, 0.0]",0.0
225,2023-04-16,6.600000,0.942857,15,2023,0.0,1.0,1.0,0.0,"[6.599999904632568, 0.9428571292332241]","[120.0, 104.0, 0.0]","[0.5357142857142857, 0.4642857142857143, 0.0]",0.0
226,2023-04-23,94.800003,13.542858,16,2023,0.0,0.0,0.0,0.0,"[94.80000305175781, 13.542857578822545]","[14.0, 14.0, 0.0]","[0.5, 0.5, 0.0]",0.0
227,2023-05-07,117.800003,16.828572,18,2023,1.0,0.0,0.0,1.0,"[117.80000305175781, 16.82857186453683]","[11.0, 16.0, 0.0]","[0.4074074074074074, 0.5925925925925926, 0.0]",1.0


In [19]:
### Avaliando os modelos ### 

### Definindo os avaliadores ###

### Documentação => https://spark.apache.org/docs/latest/mllib-evaluation-metrics.html ###

### https://towardsdatascience.com/the-f1-score-bec2bbc38aa6 ###

acuracia = MulticlassClassificationEvaluator(labelCol='Farol_100',predictionCol='prediction',metricName='accuracy')
f1 = MulticlassClassificationEvaluator(labelCol='Farol_100',predictionCol='prediction',metricName='f1')
precisaoPonderada = MulticlassClassificationEvaluator(labelCol='Farol_100',predictionCol='prediction',metricName='weightedPrecision')
weightedRecall = MulticlassClassificationEvaluator(labelCol='Farol_100',predictionCol='prediction',metricName='weightedRecall')

### Resultados da Acurácia ###

gbtAcuracia = acuracia.evaluate(gbtPredicao)
rfAcuracia = acuracia.evaluate(rfPredicao)
dtAcuracia = acuracia.evaluate(dtPredicao)
print('Acurácia do teste Árvore de Decisão (Gradiente Boosting) = ', gbtAcuracia)
print('Acurácia do teste Árvore Aleatória = ', rfAcuracia)
print('Acurácia do teste Árvore de Decisão = ', dtAcuracia)

Acurácia do teste Árvore de Decisão (Gradiente Boosting) =  0.519650655021834
Acurácia do teste Árvore Aleatória =  0.5502183406113537
Acurácia do teste Árvore de Decisão =  0.5458515283842795


In [20]:
### Resultados do F1 ###

gbtF1 = f1.evaluate(gbtPredicao)
rfF1 = f1.evaluate(rfPredicao)
dtF1 = f1.evaluate(dtPredicao)
print('F1 do teste Árvore de Decisão (Gradiente Boosting) = ', gbtF1)
print('F1 do teste Árvore Aleatória = ', rfF1)
print('F1 do teste Árvore de Decisão = ', dtF1)

F1 do teste Árvore de Decisão (Gradiente Boosting) =  0.5207545680254609
F1 do teste Árvore Aleatória =  0.5509950612465586
F1 do teste Árvore de Decisão =  0.5381315788746827


In [21]:
### Resultados do Precisão Ponderada ###

gbtPP = precisaoPonderada.evaluate(gbtPredicao)
rfPP = precisaoPonderada.evaluate(rfPredicao)
dtPP = precisaoPonderada.evaluate(dtPredicao)
print('Precisão Ponderada do teste Árvore de Decisão (Gradiente Boosting) = ', gbtPP)
print('Precisão Ponderada do teste Árvore Aleatória = ', rfPP)
print('Precisão Ponderada do teste Árvore de Decisão = ', dtPP)

Precisão Ponderada do teste Árvore de Decisão (Gradiente Boosting) =  0.5231206337712148
Precisão Ponderada do teste Árvore Aleatória =  0.5522934786531452
Precisão Ponderada do teste Árvore de Decisão =  0.5387452784158107


In [22]:
### Resultados do weightedRecall ###

### Recall é a razão entre o número de positivos verdadeiros (pv) e a soma dos positivos verdadeiros (pv) e falsos negativos (fn) => pv/(pv+fn)

gbtWR = weightedRecall.evaluate(gbtPredicao)
rfWR = weightedRecall.evaluate(rfPredicao)
dtWR = weightedRecall.evaluate(dtPredicao)
print('Recall do teste Árvore de Decisão (Gradiente Boosting) = ', gbtWR)
print('Recall do teste Árvore Aleatória = ', rfWR)
print('Recall do teste Árvore de Decisão = ', dtWR)

Recall do teste Árvore de Decisão (Gradiente Boosting) =  0.519650655021834
Recall do teste Árvore Aleatória =  0.5502183406113537
Recall do teste Árvore de Decisão =  0.5458515283842795
