In [1]:
from datetime import datetime, date
import pandas as pd
import plotly.express as px

from pyspark.ml import Pipeline
from pyspark.sql import Row
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorIndexer, VectorAssembler

### Modelos utilizados ###

from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier

spark = SparkSession.builder.getOrCreate()

df = spark.read.option("delimiter", ";").option("header", True).csv('Data/inmet_filtered_A401_H_2000-05-12_2023-05-16.csv')

dfTransformado = df.withColumn('PRECIPITACAO TOTAL, HORARIO(mm)', regexp_replace('PRECIPITACAO TOTAL, HORARIO(mm)', ',', '.').cast(DoubleType()))

dfTransformado = dfTransformado.drop(
           'PRESSAO ATMOSFERICA REDUZIDA NIVEL DO MAR, AUT(mB)',
           'PRESSAO ATMOSFERICA REDUZIDA NIVEL DO MAR, AUT(mB)',
           'PRESSAO ATMOSFERICA MAX.NA HORA ANT. (AUT)(mB)',
           'PRESSAO ATMOSFERICA MIN. NA HORA ANT. (AUT)(mB)',
           'PRESSAO ATMOSFERICA AO NIVEL DA ESTACAO, HORARIA(mB)',
           'TEMPERATURA DA CPU DA ESTACAO(°C)',
           'TEMPERATURA DO AR - BULBO SECO, HORARIA(°C)',
           'TEMPERATURA DO PONTO DE ORVALHO(°C)',
           'TEMPERATURA MAXIMA NA HORA ANT. (AUT)(°C)',
           'TEMPERATURA MINIMA NA HORA ANT. (AUT)(°C)',
           'TEMPERATURA ORVALHO MAX. NA HORA ANT. (AUT)(°C)',
           'TEMPERATURA ORVALHO MIN. NA HORA ANT. (AUT)(°C)',
           'TENSAO DA BATERIA DA ESTACAO(V)',
           'UMIDADE REL. MAX. NA HORA ANT. (AUT)(%)',
           'UMIDADE REL. MIN. NA HORA ANT. (AUT)(%)',
           'UMIDADE RELATIVA DO AR, HORARIA(%)',
           'VENTO, DIRECAO HORARIA (gr)(° (gr))',
           'VENTO, RAJADA MAXIMA(m/s)',
           'VENTO, VELOCIDADE HORARIA(m/s)',
           'Unnamed: 22',
           '_c22'
)

df = dfTransformado.withColumn("Inicio_Semana",date_sub(next_day(col("Data Medicao"),"sunday"),7))\
                    .groupBy("Inicio_Semana").agg\
                        (sum("PRECIPITACAO TOTAL, HORARIO(mm)").cast("float").alias("Total de Chuvas(mm)"),\
                         sum("PRECIPITACAO TOTAL, HORARIO(mm)").cast("float").alias("Média diária de chuvas(mm)"))\
                    .orderBy("Inicio_Semana")

df = df.select('Inicio_Semana','Total de Chuvas(mm)',col('Média diária de chuvas(mm)')/ 7 )

df = df.withColumnRenamed("(Média diária de chuvas(mm) / 7)","Média diária de chuvas(mm)")

dfCompleto = df.withColumn('Semana_Ano',weekofyear(df.Inicio_Semana))
dfCompleto.toPandas()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/06 20:51:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

Unnamed: 0,Inicio_Semana,Total de Chuvas(mm),Média diária de chuvas(mm),Semana_Ano
0,2000-05-07,,,18
1,2000-05-14,,,19
2,2000-05-21,,,20
3,2000-05-28,,,21
4,2000-06-04,,,22
...,...,...,...,...
1197,2023-04-16,6.600000,0.942857,15
1198,2023-04-23,94.800003,13.542858,16
1199,2023-04-30,49.400002,7.057143,17
1200,2023-05-07,117.800003,16.828572,18


In [2]:
#Farol da Barra - SSA FB 100 | Farol da Barra - SSA FB 200 | Porto da Barra - SSA PB 100 | Santa Maria - SSA SM 100

#dfCompleto = df1.withColumn('Farol da Barra - SSA FB 100', when(rand() > 0.5, 'Imprópria').otherwise('Própria'))\
#                .withColumn("Farol da Barra - SSA FB 200", when(rand() > 0.5, 'Imprópria').otherwise('Própria'))\
#                .withColumn("Porto da Barra - SSA PB 100", when(rand() > 0.5, 'Imprópria').otherwise('Própria'))\
#                .withColumn("Santa Maria - SSA SM 100", when(rand() > 0.5, 'Imprópria').otherwise('Própria'))

#dfCompleto = df1.withColumn('Farol da Barra - SSA FB 100', lit(None))\
#                .withColumn("Farol da Barra - SSA FB 200", lit(None))\
#                .withColumn("Porto da Barra - SSA PB 100", lit(None))\
#                .withColumn("Santa Maria - SSA SM 100", lit(None))


# Criando a coluna ano #
dfCompleto = dfCompleto.withColumn("Ano", substring(dfCompleto.Inicio_Semana, 1,4))

dfCompleto.toPandas()

                                                                                

Unnamed: 0,Inicio_Semana,Total de Chuvas(mm),Média diária de chuvas(mm),Semana_Ano,Ano
0,2000-05-07,,,18,2000
1,2000-05-14,,,19,2000
2,2000-05-21,,,20,2000
3,2000-05-28,,,21,2000
4,2000-06-04,,,22,2000
...,...,...,...,...,...
1197,2023-04-16,6.600000,0.942857,15,2023
1198,2023-04-23,94.800003,13.542858,16,2023
1199,2023-04-30,49.400002,7.057143,17,2023
1200,2023-05-07,117.800003,16.828572,18,2023


In [3]:
inema = spark.read.option("header",True).csv("Data/inema_filtered_balneabilidade_farol_barra.csv")

inema = inema.withColumnRenamed("01/2007","numero_boletim").withColumnRenamed("Farol da Barra - SSA FB 100","ponto_codigo").withColumnRenamed("Indisponível","categoria")

inema = inema.withColumn("Ano",substring(inema.numero_boletim, 4,7)).withColumn("Semana_Ano", substring(inema.numero_boletim, 1,2))

inema.toPandas()

Unnamed: 0,numero_boletim,categoria,ponto_codigo,Ano,Semana_Ano
0,01/2007,Indisponível,Farol da Barra - SSA FB 200,2007,01
1,01/2007,Indisponível,Porto da Barra - SSA PB 100,2007,01
2,01/2007,Indisponível,Santa Maria - SSA SM 100,2007,01
3,02/2007,Indisponível,Farol da Barra - SSA FB 100,2007,02
4,02/2007,Indisponível,Farol da Barra - SSA FB 200,2007,02
...,...,...,...,...,...
2826,20/2023,Imprópria,Santa Maria - SSA SM 100,2023,20
2827,21/2023,Imprópria,Farol da Barra - SSA FB 100,2023,21
2828,21/2023,Imprópria,Farol da Barra - SSA FB 200,2023,21
2829,21/2023,Própria,Porto da Barra - SSA PB 100,2023,21


In [4]:
dfCompleto = dfCompleto.join(inema,["Ano","Semana_Ano"])

dfCompleto = dfCompleto.drop("numero_boletim","Ano","Semana_Ano")

dfCompleto.toPandas()

                                                                                

Unnamed: 0,Inicio_Semana,Total de Chuvas(mm),Média diária de chuvas(mm),categoria,ponto_codigo
0,2008-01-20,,,Indisponível,Santa Maria - SSA SM 100
1,2008-01-20,,,Indisponível,Porto da Barra - SSA PB 100
2,2008-01-20,,,Indisponível,Farol da Barra - SSA FB 200
3,2008-01-20,,,Indisponível,Farol da Barra - SSA FB 100
4,2008-08-10,17.400000,2.485714,Indisponível,Santa Maria - SSA SM 100
...,...,...,...,...,...
2738,2022-10-02,0.200000,0.028571,Própria,Farol da Barra - SSA FB 100
2739,2022-04-17,274.399994,39.199999,Indisponível,Santa Maria - SSA SM 100
2740,2022-04-17,274.399994,39.199999,Indisponível,Porto da Barra - SSA PB 100
2741,2022-04-17,274.399994,39.199999,Indisponível,Farol da Barra - SSA FB 200


In [5]:
### Descritiva dos dados ###

dfCompleto.describe().toPandas()

23/07/06 20:51:39 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


Unnamed: 0,summary,Total de Chuvas(mm),Média diária de chuvas(mm),categoria,ponto_codigo
0,count,2631.0,2631.0,2743,2743
1,mean,29.430330725370457,4.204332960767204,,
2,stddev,36.37486935212821,5.196409907446881,,
3,min,0.0,0.0,Imprópria,Farol da Barra - SSA FB 100
4,max,299.6,42.800000871930806,Própria,Santa Maria - SSA SM 100


In [6]:
### Contabilizando os Nulls por coluna ###

dfCompleto.select([count(when(isnull(c), c)).alias(c) for c in dfCompleto.columns]).toPandas()

Unnamed: 0,Inicio_Semana,Total de Chuvas(mm),Média diária de chuvas(mm),categoria,ponto_codigo
0,0,112,112,0,0


In [7]:
### Removendo os Nulls ####

dfCompleto = dfCompleto.replace('?', None).dropna(how='any')

In [8]:
dfCompleto.toPandas()

Unnamed: 0,Inicio_Semana,Total de Chuvas(mm),Média diária de chuvas(mm),categoria,ponto_codigo
0,2008-08-10,17.400000,2.485714,Indisponível,Santa Maria - SSA SM 100
1,2008-08-10,17.400000,2.485714,Indisponível,Porto da Barra - SSA PB 100
2,2008-08-10,17.400000,2.485714,Indisponível,Farol da Barra - SSA FB 200
3,2008-08-10,17.400000,2.485714,Indisponível,Farol da Barra - SSA FB 100
4,2007-08-05,22.400000,3.200000,Indisponível,Santa Maria - SSA SM 100
...,...,...,...,...,...
2626,2022-10-02,0.200000,0.028571,Própria,Farol da Barra - SSA FB 100
2627,2022-04-17,274.399994,39.199999,Indisponível,Santa Maria - SSA SM 100
2628,2022-04-17,274.399994,39.199999,Indisponível,Porto da Barra - SSA PB 100
2629,2022-04-17,274.399994,39.199999,Indisponível,Farol da Barra - SSA FB 200


In [9]:
### Transformando os valores qualitativos em numéricos => 0 = Própria | 1 = Imprópria | 2 = Indisponível ###

dfCompleto = StringIndexer(
    inputCol='categoria', 
    outputCol='Categoria_Indexada', 
    handleInvalid='keep').fit(dfCompleto).transform(dfCompleto)

#dfCompleto = StringIndexer(
#    inputCol='Farol da Barra - SSA FB 200', 
#    outputCol='Farol_200', 
#    handleInvalid='keep').fit(dfCompleto).transform(dfCompleto)

#dfCompleto = StringIndexer(
#    inputCol='Porto da Barra - SSA PB 100', 
#    outputCol='Porto_100', 
#    handleInvalid='keep').fit(dfCompleto).transform(dfCompleto)

#dfCompleto = StringIndexer(
#    inputCol='Santa Maria - SSA SM 100', 
#   outputCol='Santa Maria_100', 
#   handleInvalid='keep').fit(dfCompleto).transform(dfCompleto)

In [10]:
### Removendo a coluna após transformação ###

dfCompleto = dfCompleto.drop('categoria')

dfCompleto = dfCompleto.filter(dfCompleto.Categoria_Indexada < 2)

dfCompleto.toPandas()

Unnamed: 0,Inicio_Semana,Total de Chuvas(mm),Média diária de chuvas(mm),ponto_codigo,Categoria_Indexada
0,2007-07-08,17.0,2.428571,Santa Maria - SSA SM 100,0.0
1,2007-07-08,17.0,2.428571,Porto da Barra - SSA PB 100,0.0
2,2007-07-08,17.0,2.428571,Farol da Barra - SSA FB 200,0.0
3,2007-07-08,17.0,2.428571,Farol da Barra - SSA FB 100,0.0
4,2008-06-01,0.2,0.028571,Santa Maria - SSA SM 100,0.0
...,...,...,...,...,...
2234,2023-02-26,17.0,2.428571,Farol da Barra - SSA FB 100,0.0
2235,2022-10-02,0.2,0.028571,Santa Maria - SSA SM 100,1.0
2236,2022-10-02,0.2,0.028571,Porto da Barra - SSA PB 100,0.0
2237,2022-10-02,0.2,0.028571,Farol da Barra - SSA FB 200,0.0


In [11]:
dfCompleto.toPandas()

Porto100 = dfCompleto.filter(dfCompleto.ponto_codigo == "Porto da Barra - SSA PB 100")
Farol100 = dfCompleto.filter(dfCompleto.ponto_codigo == "Farol da Barra - SSA FB 100")
Farol200 = dfCompleto.filter(dfCompleto.ponto_codigo == "Farol da Barra - SSA FB 200")
SM100 = dfCompleto.filter(dfCompleto.ponto_codigo == "Santa Maria - SSA SM 100")

In [12]:
Porto100.toPandas()

Unnamed: 0,Inicio_Semana,Total de Chuvas(mm),Média diária de chuvas(mm),ponto_codigo,Categoria_Indexada
0,2007-07-08,17.000000,2.428571,Porto da Barra - SSA PB 100,0.0
1,2008-06-01,0.200000,0.028571,Porto da Barra - SSA PB 100,0.0
2,2008-06-22,58.799999,8.400000,Porto da Barra - SSA PB 100,0.0
3,2008-07-20,25.400000,3.628571,Porto da Barra - SSA PB 100,0.0
4,2008-08-17,35.200001,5.028572,Porto da Barra - SSA PB 100,0.0
...,...,...,...,...,...
558,2023-04-23,94.800003,13.542858,Porto da Barra - SSA PB 100,1.0
559,2023-01-08,3.800000,0.542857,Porto da Barra - SSA PB 100,1.0
560,2022-01-02,9.600000,1.371429,Porto da Barra - SSA PB 100,0.0
561,2023-02-26,17.000000,2.428571,Porto da Barra - SSA PB 100,0.0


In [13]:
informacoes_necessarias = ['Total de Chuvas(mm)','Média diária de chuvas(mm)']

assembler = VectorAssembler(inputCols=informacoes_necessarias, outputCol='informacoes')

dfPorto100 = assembler.transform(Porto100)

dfFarol100 = assembler.transform(Farol100)

dfFarol200 = assembler.transform(Farol200)

dfSM100 = assembler.transform(SM100)


In [30]:
dfFarol200.toPandas()

Unnamed: 0,Inicio_Semana,Total de Chuvas(mm),Média diária de chuvas(mm),ponto_codigo,Categoria_Indexada,informacoes
0,2007-07-08,17.000000,2.428571,Farol da Barra - SSA FB 200,0.0,"[17.0, 2.4285714285714284]"
1,2008-06-01,0.200000,0.028571,Farol da Barra - SSA FB 200,0.0,"[0.20000000298023224, 0.028571428997176036]"
2,2008-06-22,58.799999,8.400000,Farol da Barra - SSA FB 200,0.0,"[58.79999923706055, 8.39999989100865]"
3,2008-07-20,25.400000,3.628571,Farol da Barra - SSA FB 200,0.0,"[25.399999618530273, 3.6285713740757535]"
4,2008-08-17,35.200001,5.028572,Farol da Barra - SSA FB 200,0.0,"[35.20000076293945, 5.028571537562779]"
...,...,...,...,...,...,...
554,2023-04-23,94.800003,13.542858,Farol da Barra - SSA FB 200,1.0,"[94.80000305175781, 13.542857578822545]"
555,2023-01-08,3.800000,0.542857,Farol da Barra - SSA FB 200,1.0,"[3.799999952316284, 0.5428571360451835]"
556,2022-01-02,9.600000,1.371429,Farol da Barra - SSA FB 200,0.0,"[9.600000381469727, 1.3714286259242467]"
557,2023-02-26,17.000000,2.428571,Farol da Barra - SSA FB 200,1.0,"[17.0, 2.4285714285714284]"


In [15]:
### Dividindo o dataser em treino (80%) e teste (20%) ###

(treinoFarol200, testeFarol200) = dfFarol200.randomSplit([0.8,0.2])

treinoFarol200.toPandas()

Unnamed: 0,Inicio_Semana,Total de Chuvas(mm),Média diária de chuvas(mm),ponto_codigo,Categoria_Indexada,informacoes
0,2007-02-04,61.599998,8.800000,Farol da Barra - SSA FB 200,0.0,"[61.599998474121094, 8.799999782017299]"
1,2007-02-11,76.199997,10.885714,Farol da Barra - SSA FB 200,0.0,"[76.19999694824219, 10.885713849748884]"
2,2007-07-01,44.400002,6.342857,Farol da Barra - SSA FB 200,0.0,"[44.400001525878906, 6.342857360839844]"
3,2007-07-15,3.200000,0.457143,Farol da Barra - SSA FB 200,0.0,"[3.200000047683716, 0.45714286395481657]"
4,2007-07-22,24.000000,3.428571,Farol da Barra - SSA FB 200,0.0,"[24.0, 3.4285714285714284]"
...,...,...,...,...,...,...
462,2023-04-16,6.600000,0.942857,Farol da Barra - SSA FB 200,1.0,"[6.599999904632568, 0.9428571292332241]"
463,2023-04-23,94.800003,13.542858,Farol da Barra - SSA FB 200,1.0,"[94.80000305175781, 13.542857578822545]"
464,2023-04-30,49.400002,7.057143,Farol da Barra - SSA FB 200,1.0,"[49.400001525878906, 7.057143075125558]"
465,2023-05-07,117.800003,16.828572,Farol da Barra - SSA FB 200,1.0,"[117.80000305175781, 16.82857186453683]"


In [16]:
### Definindo os modelos ###

gbt = GBTClassifier(labelCol="Categoria_Indexada", featuresCol="informacoes", maxIter=10)
dt = DecisionTreeClassifier(labelCol='Categoria_Indexada',featuresCol='informacoes')
rf = RandomForestClassifier(labelCol='Categoria_Indexada',featuresCol='informacoes',maxDepth=5)

In [17]:
### Treinando os modelos #### 

gbtModel = gbt.fit(treinoFarol200) ### Gradient Boosted Tree Classifier

rfModel = rf.fit(treinoFarol200) ### Random Forest Classifier

dtModel = dt.fit(treinoFarol200) ### Decision Tree classfier

rfModel.write().overwrite().save('models/rf')

gbtModel.write().overwrite().save('models/gbt')

dtModel.write().overwrite().save('models/dt')

In [18]:
### Testando os modelos ###

gbtPredicao = gbtModel.transform(testeFarol200)

gbtPredicao.toPandas()

23/07/06 20:52:00 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/07/06 20:52:00 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


Unnamed: 0,Inicio_Semana,Total de Chuvas(mm),Média diária de chuvas(mm),ponto_codigo,Categoria_Indexada,informacoes,rawPrediction,probability,prediction
0,2007-07-08,17.000000,2.428571,Farol da Barra - SSA FB 200,0.0,"[17.0, 2.4285714285714284]","[0.5852166773011103, -0.5852166773011103]","[0.7632233376755987, 0.2367766623244013]",0.0
1,2008-06-15,3.200000,0.457143,Farol da Barra - SSA FB 200,0.0,"[3.200000047683716, 0.45714286395481657]","[0.40849402868492457, -0.40849402868492457]","[0.6935966145123624, 0.30640338548763757]",0.0
2,2008-07-06,67.199997,9.600000,Farol da Barra - SSA FB 200,0.0,"[67.19999694824219, 9.599999564034599]","[0.16653843330687676, -0.16653843330687676]","[0.582507837010796, 0.417492162989204]",0.0
3,2008-08-24,0.200000,0.028571,Farol da Barra - SSA FB 200,0.0,"[0.20000000298023224, 0.028571428997176036]","[0.44203100984633187, -0.44203100984633187]","[0.7076632636434069, 0.2923367363565931]",0.0
4,2009-01-18,0.000000,0.000000,Farol da Barra - SSA FB 200,0.0,"(0.0, 0.0)","[0.47773457684624593, -0.47773457684624593]","[0.7222137377697347, 0.27778626223026526]",0.0
...,...,...,...,...,...,...,...,...,...
87,2022-10-23,44.799999,6.400000,Farol da Barra - SSA FB 200,0.0,"[44.79999923706055, 6.399999891008649]","[-0.3229394554890182, 0.3229394554890182]","[0.3439188152972356, 0.6560811847027643]",1.0
88,2022-11-20,66.199997,9.457142,Farol da Barra - SSA FB 200,1.0,"[66.19999694824219, 9.457142421177455]","[0.16653843330687676, -0.16653843330687676]","[0.582507837010796, 0.417492162989204]",0.0
89,2023-01-15,25.000000,3.571429,Farol da Barra - SSA FB 200,0.0,"[25.0, 3.5714285714285716]","[0.5076503047526736, -0.5076503047526736]","[0.7340562049445227, 0.2659437950554773]",0.0
90,2023-02-19,25.799999,3.685714,Farol da Barra - SSA FB 200,1.0,"[25.799999237060547, 3.6857141767229353]","[0.5076503047526736, -0.5076503047526736]","[0.7340562049445227, 0.2659437950554773]",0.0


In [19]:
rfPredicao = rfModel.transform(testeFarol200)

rfPredicao.toPandas()

Unnamed: 0,Inicio_Semana,Total de Chuvas(mm),Média diária de chuvas(mm),ponto_codigo,Categoria_Indexada,informacoes,rawPrediction,probability,prediction
0,2007-07-08,17.000000,2.428571,Farol da Barra - SSA FB 200,0.0,"[17.0, 2.4285714285714284]","[14.647481176977825, 5.352518823022176, 0.0, 0.0]","[0.7323740588488913, 0.26762594115110877, 0.0,...",0.0
1,2008-06-15,3.200000,0.457143,Farol da Barra - SSA FB 200,0.0,"[3.200000047683716, 0.45714286395481657]","[13.499801447116269, 6.5001985528837345, 0.0, ...","[0.6749900723558133, 0.3250099276441867, 0.0, ...",0.0
2,2008-07-06,67.199997,9.600000,Farol da Barra - SSA FB 200,0.0,"[67.19999694824219, 9.599999564034599]","[12.240759397677438, 7.759240602322566, 0.0, 0.0]","[0.6120379698838718, 0.38796203011612823, 0.0,...",0.0
3,2008-08-24,0.200000,0.028571,Farol da Barra - SSA FB 200,0.0,"[0.20000000298023224, 0.028571428997176036]","[14.349050444159499, 5.650949555840502, 0.0, 0.0]","[0.717452522207975, 0.2825474777920251, 0.0, 0.0]",0.0
4,2009-01-18,0.000000,0.000000,Farol da Barra - SSA FB 200,0.0,"(0.0, 0.0)","[14.531742751851807, 5.468257248148194, 0.0, 0.0]","[0.7265871375925903, 0.2734128624074097, 0.0, ...",0.0
...,...,...,...,...,...,...,...,...,...
87,2022-10-23,44.799999,6.400000,Farol da Barra - SSA FB 200,0.0,"[44.79999923706055, 6.399999891008649]","[7.8531168308883395, 12.146883169111659, 0.0, ...","[0.39265584154441696, 0.607344158455583, 0.0, ...",1.0
88,2022-11-20,66.199997,9.457142,Farol da Barra - SSA FB 200,1.0,"[66.19999694824219, 9.457142421177455]","[12.240759397677438, 7.759240602322566, 0.0, 0.0]","[0.6120379698838718, 0.38796203011612823, 0.0,...",0.0
89,2023-01-15,25.000000,3.571429,Farol da Barra - SSA FB 200,0.0,"[25.0, 3.5714285714285716]","[14.870157392213025, 5.129842607786977, 0.0, 0.0]","[0.7435078696106512, 0.2564921303893488, 0.0, ...",0.0
90,2023-02-19,25.799999,3.685714,Farol da Barra - SSA FB 200,1.0,"[25.799999237060547, 3.6857141767229353]","[14.870157392213025, 5.129842607786977, 0.0, 0.0]","[0.7435078696106512, 0.2564921303893488, 0.0, ...",0.0


In [20]:
dtPredicao = dtModel.transform(testeFarol200)

dtPredicao.toPandas()

Unnamed: 0,Inicio_Semana,Total de Chuvas(mm),Média diária de chuvas(mm),ponto_codigo,Categoria_Indexada,informacoes,rawPrediction,probability,prediction
0,2007-07-08,17.000000,2.428571,Farol da Barra - SSA FB 200,0.0,"[17.0, 2.4285714285714284]","[234.0, 86.0, 0.0, 0.0]","[0.73125, 0.26875, 0.0, 0.0]",0.0
1,2008-06-15,3.200000,0.457143,Farol da Barra - SSA FB 200,0.0,"[3.200000047683716, 0.45714286395481657]","[234.0, 86.0, 0.0, 0.0]","[0.73125, 0.26875, 0.0, 0.0]",0.0
2,2008-07-06,67.199997,9.600000,Farol da Barra - SSA FB 200,0.0,"[67.19999694824219, 9.599999564034599]","[67.0, 35.0, 0.0, 0.0]","[0.6568627450980392, 0.3431372549019608, 0.0, ...",0.0
3,2008-08-24,0.200000,0.028571,Farol da Barra - SSA FB 200,0.0,"[0.20000000298023224, 0.028571428997176036]","[234.0, 86.0, 0.0, 0.0]","[0.73125, 0.26875, 0.0, 0.0]",0.0
4,2009-01-18,0.000000,0.000000,Farol da Barra - SSA FB 200,0.0,"(0.0, 0.0)","[234.0, 86.0, 0.0, 0.0]","[0.73125, 0.26875, 0.0, 0.0]",0.0
...,...,...,...,...,...,...,...,...,...
87,2022-10-23,44.799999,6.400000,Farol da Barra - SSA FB 200,0.0,"[44.79999923706055, 6.399999891008649]","[5.0, 10.0, 0.0, 0.0]","[0.3333333333333333, 0.6666666666666666, 0.0, ...",1.0
88,2022-11-20,66.199997,9.457142,Farol da Barra - SSA FB 200,1.0,"[66.19999694824219, 9.457142421177455]","[67.0, 35.0, 0.0, 0.0]","[0.6568627450980392, 0.3431372549019608, 0.0, ...",0.0
89,2023-01-15,25.000000,3.571429,Farol da Barra - SSA FB 200,0.0,"[25.0, 3.5714285714285716]","[234.0, 86.0, 0.0, 0.0]","[0.73125, 0.26875, 0.0, 0.0]",0.0
90,2023-02-19,25.799999,3.685714,Farol da Barra - SSA FB 200,1.0,"[25.799999237060547, 3.6857141767229353]","[234.0, 86.0, 0.0, 0.0]","[0.73125, 0.26875, 0.0, 0.0]",0.0


In [21]:
### Avaliando os modelos ### 

### Definindo os avaliadores ###

### Documentação => https://spark.apache.org/docs/latest/mllib-evaluation-metrics.html ###

### https://towardsdatascience.com/the-f1-score-bec2bbc38aa6 ###

acuracia = MulticlassClassificationEvaluator(labelCol='Categoria_Indexada',predictionCol='prediction',metricName='accuracy')
f1 = MulticlassClassificationEvaluator(labelCol='Categoria_Indexada',predictionCol='prediction',metricName='f1')
precisaoPonderada = MulticlassClassificationEvaluator(labelCol='Categoria_Indexada',predictionCol='prediction',metricName='weightedPrecision')
weightedRecall = MulticlassClassificationEvaluator(labelCol='Categoria_Indexada',predictionCol='prediction',metricName='weightedRecall')

### Resultados da Acurácia ###

gbtAcuracia = acuracia.evaluate(gbtPredicao)
rfAcuracia = acuracia.evaluate(rfPredicao)
dtAcuracia = acuracia.evaluate(dtPredicao)
print('Acurácia do teste Árvore de Decisão (Gradiente Boosting) = ', gbtAcuracia)
print('Acurácia do teste Árvore Aleatória = ', rfAcuracia)
print('Acurácia do teste Árvore de Decisão = ', dtAcuracia)

Acurácia do teste Árvore de Decisão (Gradiente Boosting) =  0.6739130434782609
Acurácia do teste Árvore Aleatória =  0.6739130434782609
Acurácia do teste Árvore de Decisão =  0.6739130434782609


In [22]:
### Resultados do F1 ###

gbtF1 = f1.evaluate(gbtPredicao)
rfF1 = f1.evaluate(rfPredicao)
dtF1 = f1.evaluate(dtPredicao)
print('F1 do teste Árvore de Decisão (Gradiente Boosting) = ', gbtF1)
print('F1 do teste Árvore Aleatória = ', rfF1)
print('F1 do teste Árvore de Decisão = ', dtF1)

F1 do teste Árvore de Decisão (Gradiente Boosting) =  0.5601355166572557
F1 do teste Árvore Aleatória =  0.5601355166572557
F1 do teste Árvore de Decisão =  0.5601355166572557


In [23]:
### Resultados do Precisão Ponderada ###

gbtPP = precisaoPonderada.evaluate(gbtPredicao)
rfPP = precisaoPonderada.evaluate(rfPredicao)
dtPP = precisaoPonderada.evaluate(dtPredicao)
print('Precisão Ponderada do teste Árvore de Decisão (Gradiente Boosting) = ', gbtPP)
print('Precisão Ponderada do teste Árvore Aleatória = ', rfPP)
print('Precisão Ponderada do teste Árvore de Decisão = ', dtPP)

Precisão Ponderada do teste Árvore de Decisão (Gradiente Boosting) =  0.4792270531400966
Precisão Ponderada do teste Árvore Aleatória =  0.4792270531400966
Precisão Ponderada do teste Árvore de Decisão =  0.4792270531400966


In [24]:
### Resultados do weightedRecall ###

### Recall é a razão entre o número de positivos verdadeiros (pv) e a soma dos positivos verdadeiros (pv) e falsos negativos (fn) => pv/(pv+fn)

gbtWR = weightedRecall.evaluate(gbtPredicao)
rfWR = weightedRecall.evaluate(rfPredicao)
dtWR = weightedRecall.evaluate(dtPredicao)
print('Recall do teste Árvore de Decisão (Gradiente Boosting) = ', gbtWR)
print('Recall do teste Árvore Aleatória = ', rfWR)
print('Recall do teste Árvore de Decisão = ', dtWR)

Recall do teste Árvore de Decisão (Gradiente Boosting) =  0.6739130434782609
Recall do teste Árvore Aleatória =  0.6739130434782609
Recall do teste Árvore de Decisão =  0.6739130434782609


In [25]:
from pyspark.ml.classification import RandomForestClassificationModel

rfModel = RandomForestClassificationModel.load("models/rf")

In [26]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, FloatType

data2 = [
    (60.50,8.0),
  ]

schema = StructType([ \
    StructField("Total de Chuvas(mm)",FloatType(),True), \
    StructField("Média diária de chuvas(mm)",FloatType(),True), \
  ])
 
df = spark.createDataFrame(data=data2,schema=schema)
df.toPandas()

                                                                                

Unnamed: 0,Total de Chuvas(mm),Média diária de chuvas(mm)
0,60.5,8.0


In [27]:
informacoes_necessarias = ['Total de Chuvas(mm)','Média diária de chuvas(mm)']
assembler = VectorAssembler(inputCols=informacoes_necessarias, outputCol='informacoes')
df = assembler.transform(df)

teste = rfModel.transform(df)

df.toPandas()

Unnamed: 0,Total de Chuvas(mm),Média diária de chuvas(mm),informacoes
0,60.5,8.0,"[60.5, 8.0]"


In [28]:
teste = rfModel.transform(df)

teste.toPandas()

Unnamed: 0,Total de Chuvas(mm),Média diária de chuvas(mm),informacoes,rawPrediction,probability,prediction
0,60.5,8.0,"[60.5, 8.0]","[13.609434295118515, 6.390565704881484, 0.0, 0.0]","[0.6804717147559257, 0.31952828524407423, 0.0,...",0.0


In [29]:
saida = (teste.first()['prediction'])

if saida == 0.0 :
    print("A previsão é", saida, ": Própria")
else:
    print("A previsão é", saida, ": Imprópria")

A previsão é 0.0 : Própria
