In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType
from pyspark.sql.functions import when, col, concat, lit

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/AreasVerdes/warehouse'

builder = SparkSession \
    .builder \
    .master("local[2]") \
    .appName("Python Spark DataFrames and SQL") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0") \
    .enableHiveSupport() \

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [3]:
hdfs_path = "hdfs://hdfs-nn:9000/AreasVerdes/bronze/arvores_csv/arvores2015.csv"

In [4]:
# Create a DataFrame from JSON data (automatically infer schema and data types)
# There are other file formats you can read from (e.g., csv, orc, parquet)
# https://spark.apache.org/docs/2.2.0/sql-programming-guide.html#data-sources

# Read Sillicon valley episodes data
customSchema = StructType([
    StructField("year", StringType(), True),        
    StructField("tree_id", IntegerType(), True),
    StructField("block_id", IntegerType(), True),
    StructField("the_geom", StringType(), True),
    StructField("tree_diameter", IntegerType(), True),
    StructField("stump_diam", IntegerType(), True),
    StructField("curb_loc", StringType(), True),
    StructField("status", StringType(), True),
    StructField("health", StringType(), True),
    StructField("latin_species_name", StringType(), True),
    StructField("comun_species_name", StringType(), True),
    StructField("steward", StringType(), True),
    StructField("guards", StringType(), True),
    StructField("sidewalk", StringType(), True),
    StructField("user_type", StringType(), True),
    StructField("problems", StringType(), True),
    StructField("problems_root_stone", StringType(), True),
    StructField("problems_root_grate", StringType(), True),
    StructField("problems_root_other", StringType(), True),
    StructField("problems_trunk_wire", StringType(), True),
    StructField("problems_trunk_light", StringType(), True),
    StructField("problems_trunk_other", StringType(), True),
    StructField("problems_branch_light", StringType(), True),
    StructField("problems_branch_shoe", StringType(), True),
    StructField("problems_branch_other", StringType(), True),
    StructField("address", StringType(), True),
    StructField("zipcode", IntegerType(), True),
    StructField("zip_city", StringType(), True),
    StructField("cb_num", IntegerType(), True),
    StructField("borocode", IntegerType(), True),
    StructField("boroname", StringType(), True),
    StructField("cncldist", IntegerType(), True),
    StructField("ct_assem", IntegerType(), True),
     StructField("ct_senate", IntegerType(), True),
     StructField("nta", StringType(), True),
     StructField("nta_name", StringType(), True),
     StructField("boro_ct", IntegerType(), True),
     StructField("state", StringType(), True),
    StructField("latitude", StringType(), True),
    StructField("longitude", StringType(), True),
    StructField("x_sp", StringType(), True),
    StructField("y_sp", StringType(), True)
])

episodes = spark \
            .read\
            .option("delimiter",",")\
            .option("header","true")\
            .schema(customSchema) \
            .csv(hdfs_path)

In [5]:
# Escolho as colunas que quero
arvores2015 = episodes.select("tree_id","year","address","tree_diameter","health","latin_species_name","comun_species_name","boroname","borocode","state","sidewalk","problems_trunk_light","problems_trunk_wire","problems_trunk_other","problems_root_stone","problems_root_grate","problems_root_other","problems_branch_light","problems_branch_shoe","problems_branch_other","user_type","status")

In [6]:
# alterei todos os dados da coluna ano para 2015
arvores2015 = arvores2015.withColumn("year", lit(2015))


In [7]:
#Cria a coluna Problems_Trunk_general
arvores2015 = arvores2015.withColumn("problems_trunk_general", lit("No"))
arvores2015 = arvores2015.withColumn("problems_trunk_general", when(arvores2015.problems_trunk_wire == "Yes","Yes") \
      .when(arvores2015.problems_trunk_light == "Yes","Yes") \
      .when(arvores2015.problems_trunk_other == "Yes","Yes") \
      .otherwise(arvores2015.problems_trunk_general))

In [8]:
#Cria a coluna Problems_wires_general
arvores2015 = arvores2015.withColumn("problems_wires_general", lit("No"))
arvores2015 = arvores2015.withColumn("problems_wires_general", when(arvores2015.problems_trunk_wire == "Yes","Yes") \
      .otherwise(arvores2015.problems_wires_general))

In [9]:
#Cria a coluna problemas na rama com as linhas todas a No
arvores2015 = arvores2015.withColumn("problems_lights_general", lit("No"))
#arvores2015.show()

In [10]:
#Cria a coluna Problems_lights_general
arvores2015 = arvores2015.withColumn("problems_lights_general", lit("No"))
arvores2015 = arvores2015.withColumn("problems_lights_general", when(arvores2015.problems_trunk_light == "Yes","Yes") \
      .when(arvores2015.problems_branch_light == "Yes","Yes") \
      .otherwise(arvores2015.problems_lights_general))

In [11]:
#Cria a coluna Problems_shoes_general
arvores2015 = arvores2015.withColumn("problems_shoes_general", lit("No"))
arvores2015 = arvores2015.withColumn("problems_shoes_general", when(arvores2015.problems_branch_shoe == "Yes","Yes") \
      .otherwise(arvores2015.problems_shoes_general))

In [12]:
#Cria a coluna Problems_guard_general
arvores2015 = arvores2015.withColumn("problems_guard_general", lit("No"))
arvores2015 = arvores2015.withColumn("problems_guard_general", when(arvores2015.problems_root_grate == "Yes","Yes") \
      .otherwise(arvores2015.problems_guard_general))

In [13]:
#Cria a coluna Problems_outlet_general
arvores2015 = arvores2015.withColumn("problems_outlet_general", lit("Unknown"))


In [14]:
#Cria a coluna Problems_paving_general
arvores2015 = arvores2015.withColumn("problems_paving_general", lit("Unknown"))

In [15]:
#Cria a coluna Problems_canopy_general
arvores2015 = arvores2015.withColumn("problems_canopy_general", lit("Unknown"))

In [16]:
#Healht null/""/None
arvores2015 = arvores2015.withColumn("health", when(arvores2015.health == "","Unknown") \
      .when(arvores2015.health == None,"Unknown") \
      .when(arvores2015.status == "Dead","Dead") \
      .otherwise(arvores2015.health))
arvores2015 = arvores2015.na.fill("Unknown",["health"])

In [17]:
#Sidewalk null/""/None
arvores2015 = arvores2015.withColumn("sidewalk", when(arvores2015.sidewalk == "","Unknown") \
      .when(arvores2015.sidewalk == None,"Unknown") \
      .otherwise(arvores2015.sidewalk))
arvores2015 = arvores2015.na.fill("Unknown",["sidewalk"])

In [18]:
#Latin_Species_Name null/""/None
arvores2015 = arvores2015.withColumn("latin_species_name", when(arvores2015.latin_species_name == "","Unknown") \
      .when(arvores2015.latin_species_name == None,"Unknown") \
      .otherwise(arvores2015.latin_species_name))
arvores2015 = arvores2015.na.fill("Unknown",["latin_species_name"])

In [19]:
#Comun_Species_Name null/""/None
arvores2015 = arvores2015.withColumn("comun_species_name", when(arvores2015.comun_species_name == "","Unknown") \
      .when(arvores2015.comun_species_name == None,"Unknown") \
      .otherwise(arvores2015.comun_species_name))
arvores2015 = arvores2015.na.fill("Unknown",["comun_species_name"])

In [20]:
arvores2015.createOrReplaceTempView("episodes")

sqlized_df = spark.sql(
    """
    SELECT Distinct health
    FROM episodes
    """
)

sqlized_df.show()

+-------+
| health|
+-------+
|   Dead|
|   Good|
|Unknown|
|   Fair|
|   Poor|
+-------+



In [21]:
#Carregar o dataset arvores 2005
hdfs_path2 = "hdfs://hdfs-nn:9000/AreasVerdes/bronze/arvores_csv/arvores2005.csv"
customSchema = StructType([
    StructField("tree_id", IntegerType(), True),        
    StructField("year", IntegerType(), True),
    StructField("tree_diameter", IntegerType(), True),
    StructField("address", StringType(), True),
    StructField("tree_loc", StringType(), True),
    StructField("pit_type", StringType(), True),
    StructField("soil_lvl", StringType(), True),
    StructField("health", StringType(), True),
     StructField("latin_species_name", StringType(), True),
    StructField("comun_species_name", StringType(), True),
    StructField("vert_other", StringType(), True),
    StructField("vert_pgrd", StringType(), True),
    StructField("vert_tgrd", StringType(), True),
    StructField("vert_wall", StringType(), True),
    StructField("Horz_blck", StringType(), True),
    StructField("Horz_grate", StringType(), True),
    StructField("Horz_plant", StringType(), True),
    StructField("Horz_other", StringType(), True),
    StructField("sidw_crack", StringType(), True),
    StructField("sidw_raise", StringType(), True),
    StructField("wire_htap", StringType(), True),
    StructField("wire_prime", StringType(), True),
    StructField("wire_2nd", StringType(), True),
    StructField("wire_other", StringType(), True),
    StructField("problems_canopy_general", StringType(), True),
    StructField("problems_guard_general", StringType(), True),
    StructField("problems_wires_general", StringType(), True),
    StructField("problems_paving_general", StringType(), True),
    StructField("problems_outlet_general", StringType(), True),
    StructField("problems_shoes_general", StringType(), True),
    StructField("problems_lights_general", StringType(), True),
     StructField("problems_other_g", StringType(), True),
    StructField("problems_trunk_general", StringType(), True),
    StructField("zipcode", IntegerType(), True),
    StructField("zip_city", StringType(), True),
    StructField("cb_num", IntegerType(), True),
    StructField("borocode", IntegerType(), True),
    StructField("boroname", StringType(), True),
    StructField("cncldist", IntegerType(), True),
    StructField("ct_assem", IntegerType(), True),
     StructField("ct_senate", IntegerType(), True),
     StructField("nta", StringType(), True),
     StructField("nta_name", StringType(), True),
     StructField("boro_ct", IntegerType(), True),
     StructField("State", StringType(), True),
    StructField("Latitude", StringType(), True),
    StructField("longitude", StringType(), True),
    StructField("x_sp", StringType(), True),
    StructField("y_sp", StringType(), True),
     StructField("obectid_1", StringType(), True),
     StructField("census_tract", StringType(), True),
     StructField("bin", StringType(), True),
     StructField("bbl", StringType(), True),
     StructField("locatiom_1", StringType(), True),
])

arvores2005 = spark \
            .read\
            .option("delimiter",",")\
            .option("header","true")\
            .schema(customSchema) \
            .csv(hdfs_path2)

In [22]:
#Escolho as colunas que quero
arvores2005 = arvores2005.select("tree_id","year","address","tree_diameter","health","latin_species_name","comun_species_name","boroname","borocode","state", "sidw_crack", "sidw_raise","problems_canopy_general","problems_guard_general","problems_wires_general","problems_paving_general","problems_outlet_general","problems_shoes_general","problems_lights_general","problems_trunk_general")

In [23]:
# Coluna Boroname 5= Staten island/ 0 = não temos informação / null = não temos informação
arvores2005 = arvores2005.withColumn("boroname", when(arvores2005.boroname == "0","Unknown") \
      .when(arvores2005.boroname == "5","Staten Island") \
      .otherwise(arvores2005.boroname))
arvores2005 = arvores2005.na.fill("Unknown",["boroname"])

In [24]:
# Borocode null = 0
arvores2005 = arvores2005.na.fill(0,["borocode"])

In [25]:
# Nesta coluna aparecia o ano 2006// como não é relevante para o estudo o ano ser 2005 ou 2006 converti tudo para 2005
arvores2005 = arvores2005.withColumn("year", lit(2005))

In [26]:
# Coluna Health no= Não temos informação/ null = não temos informação
arvores2005 = arvores2005.withColumn("health", when(arvores2005.health == "No","Unknown") \
      .when(arvores2005.health == "Excellent","Good") \
      .otherwise(arvores2005.health))
arvores2005 = arvores2005.na.fill("Unknown",["health"])

In [27]:
# Coluna Latin_Species_Name null = "Não temos informação"
arvores2005 = arvores2005.na.fill("Unknown",["latin_species_name"])

In [28]:
# Coluna Latin_Comun_Name null = "Não temos informação"
arvores2005 = arvores2005.na.fill("Unknown",["comun_species_name"])

In [29]:
# Coluna State null = "New York" / 0 = "New York"
arvores2005 = arvores2005.withColumn("state", when(arvores2005.state == "0","New York") \
      .otherwise(arvores2005.state))
arvores2005 = arvores2005.na.fill("New York",["state"])

In [30]:
#Coluna Sidw_crack + sidw_raise numa sócoluna Sidewalk // Yes = Damage / No = NoDamage
arvores2005 = arvores2005.withColumn("sidewalk", lit("NoDamage"))
arvores2005 = arvores2005.withColumn("sidewalk", when(arvores2005.sidw_crack == "Yes","Damage") \
      .when(arvores2005.sidw_raise == "Yes","Damage") \
      .otherwise(arvores2005.sidewalk))

In [31]:
# Coluna Problems_canopy_general null = "No"
arvores2005 = arvores2005.na.fill("No",["problems_canopy_general"])

In [32]:
# Coluna Problems_guard_general null = "No"
arvores2005 = arvores2005.na.fill("No",["problems_guard_general"])

In [33]:
# Coluna Problems_wires_general null = "No"
arvores2005 = arvores2005.na.fill("No",["problems_wires_general"])

In [34]:
# Coluna Problems_peving_general null = "No"
arvores2005 = arvores2005.na.fill("No",["problems_paving_general"])

In [35]:
# Coluna Problems_outlet_general null = "No" 
arvores2005 = arvores2005.na.fill("No",["problems_outlet_general"])

In [36]:
# Coluna Problems_shoes_general null = "No" // "None" = "No" 
arvores2005 = arvores2005.na.fill("No",["problems_shoes_general"])
arvores2005 = arvores2005.withColumn("problems_shoes_general", when(arvores2005.problems_shoes_general == "Trunk Wound","No") \
      .when(arvores2005.problems_shoes_general == "None","No") \
      .otherwise(arvores2005.problems_shoes_general))

In [37]:
# Coluna Problems_lights_general null = "No" / 0 = "No"
arvores2005 = arvores2005.na.fill("No",["problems_lights_general"])
arvores2005 = arvores2005.withColumn("problems_lights_general", when(arvores2005.problems_lights_general == "0","No") \
      .otherwise(arvores2005.problems_lights_general))

In [38]:
# Coluna Problems_Trunk_general null = "No" // "None" = "No" // Torn Bark == yes // cavity = yes / trunk wound = yes
arvores2005 = arvores2005.na.fill("No",["problems_trunk_general"])
arvores2005 = arvores2005.withColumn("problems_trunk_general", when(arvores2005.problems_trunk_general == "0","No") \
      .when(arvores2005.problems_trunk_general == "None","No") \
      .when(arvores2005.problems_trunk_general == "Torn Bark","Yes") \
      .when(arvores2005.problems_trunk_general == "Cavity","Yes") \
      .when(arvores2005.problems_trunk_general == "Trunk Wound","Yes") \
      .otherwise(arvores2005.problems_trunk_general))

In [39]:
#Cria as colunas que estão presentes nas outras tabelas
arvores2005 = arvores2005.withColumn("problems_trunk_light", lit("Unknown"))
arvores2005 = arvores2005.withColumn("problems_trunk_wire", lit("Unknown"))
arvores2005 = arvores2005.withColumn("problems_trunk_other", lit("Unknown"))
arvores2005 = arvores2005.withColumn("problems_root_stone", lit("Unknown"))
arvores2005 = arvores2005.withColumn("problems_root_grate", lit("Unknown"))
arvores2005 = arvores2005.withColumn("problems_root_other", lit("Unknown"))
arvores2005 = arvores2005.withColumn("problems_branch_light", lit("Unknown"))
arvores2005 = arvores2005.withColumn("problems_branch_shoe", lit("Unknown"))
arvores2005 = arvores2005.withColumn("problems_branch_other", lit("Unknown"))
arvores2005 = arvores2005.withColumn("user_type", lit("Unknown"))

In [40]:
arvores2005.printSchema()

root
 |-- tree_id: integer (nullable = true)
 |-- year: integer (nullable = false)
 |-- address: string (nullable = true)
 |-- tree_diameter: integer (nullable = true)
 |-- health: string (nullable = false)
 |-- latin_species_name: string (nullable = false)
 |-- comun_species_name: string (nullable = false)
 |-- boroname: string (nullable = false)
 |-- borocode: integer (nullable = true)
 |-- state: string (nullable = false)
 |-- sidw_crack: string (nullable = true)
 |-- sidw_raise: string (nullable = true)
 |-- problems_canopy_general: string (nullable = false)
 |-- problems_guard_general: string (nullable = false)
 |-- problems_wires_general: string (nullable = false)
 |-- problems_paving_general: string (nullable = false)
 |-- problems_outlet_general: string (nullable = false)
 |-- problems_shoes_general: string (nullable = false)
 |-- problems_lights_general: string (nullable = false)
 |-- problems_trunk_general: string (nullable = false)
 |-- sidewalk: string (nullable = false)
 |

In [41]:
#Carrega o dataset arvores1995
hdfs_path3 = "hdfs://hdfs-nn:9000/AreasVerdes/bronze/arvores_csv/arvores1995.csv"
customSchema = StructType([
    StructField("tree_id", IntegerType(), True),        
    StructField("address", StringType(), True),
    StructField("House_Number", IntegerType(), True),
    StructField("Street", StringType(), True),
    StructField("Postcode", StringType(), True),
    StructField("Community_Board", StringType(), True),
    StructField("Site", StringType(), True),
    StructField("Species", StringType(), True),
    StructField("tree_diameter", IntegerType(), True),
    StructField("health", StringType(), True),
    StructField("Wires", StringType(), True),
    StructField("sidewalk", StringType(), True),
    StructField("Support_Structure", StringType(), True),
    StructField("boroname", StringType(), True),
    StructField("X", StringType(), True),
    StructField("Y", StringType(), True),
    StructField("Longitude", StringType(), True),
    StructField("Latitude", StringType(), True),
    StructField("cb_new", StringType(), True),
    StructField("zip_new", StringType(), True),
    StructField("censusTract", StringType(), True),
    StructField("censusBlock", StringType(), True),
    StructField("Nta_2010", StringType(), True),
    StructField("SegmentID", StringType(), True),
    StructField("comun_species_name", StringType(), True),
    StructField("latin_species_name", StringType(), True),
    StructField("location", StringType(), True),
    StructField("council_district", StringType(), True),
    StructField("bin", StringType(), True),
    StructField("BBl", StringType(), True),
    StructField("Zip_codes", StringType(), True),
     StructField("Community_districts", StringType(), True),
    StructField("Borough_boundaries", StringType(), True),
    StructField("city_council", StringType(), True),
    StructField("Police_Precints", StringType(), True),
])

arvores1995 = spark \
            .read\
            .option("delimiter",",")\
            .option("header","true")\
            .schema(customSchema) \
            .csv(hdfs_path3)

In [42]:
#Apenas as colunas que quero
arvores1995 = arvores1995.select("tree_id","address","tree_diameter","health","sidewalk","boroname","comun_species_name","latin_species_name")

In [43]:
# Coluna Health Excellent = Good / Critical,Shaft,Stump = Poor / Unknown, Planting Space = Não temos informação
arvores1995 = arvores1995.withColumn("health", when(arvores1995.health == "Excellent","Good") \
      .when(arvores1995.health == "Critical","Poor") \
      .when(arvores1995.health == "Shaft","Poor") \
      .when(arvores1995.health == "Stump","Poor") \
      .when(arvores1995.health == "Unknown","Unknown") \
      .when(arvores1995.health == "Planting Space", "Unknown") \
      .otherwise(arvores1995.health))

In [44]:
# Coluna Sidewalk Na = Não temos informação / good = nodamge / raised = damage
arvores1995 = arvores1995.withColumn("sidewalk", when(arvores1995.sidewalk == "NA","Unknown") \
      .when(arvores1995.sidewalk == "Good","NoDamage") \
      .when(arvores1995.sidewalk == "Raised","Damage") \
      .otherwise(arvores1995.sidewalk))

In [45]:
# Cria Coluna Borocode esta coluna contem um numero que representa o bairro 
arvores1995 = arvores1995.withColumn("borocode", lit(1))
arvores1995 = arvores1995.withColumn("borocode", when(arvores1995.boroname == "Queens",4) \
      .when(arvores1995.boroname == "Brooklyn",3) \
      .when(arvores1995.boroname == "Manhattan",1) \
      .when(arvores1995.boroname == "Bronx",2) \
      .when(arvores1995.boroname == "Staten Island",5) \
      .otherwise(arvores1995.borocode))

In [46]:
# Cria as colunas das outras tabelas
arvores1995 = arvores1995.withColumn("year", lit(1995))
arvores1995 = arvores1995.withColumn("state", lit("New York"))
arvores1995 = arvores1995.withColumn("problems_canopy_general", lit("Unknown"))
arvores1995 = arvores1995.withColumn("problems_guard_general", lit("Unknown"))
arvores1995 = arvores1995.withColumn("problems_wires_general", lit("Unknown"))
arvores1995 = arvores1995.withColumn("problems_paving_general", lit("Unknown"))
arvores1995 = arvores1995.withColumn("problems_outlet_general", lit("Unknown"))
arvores1995 = arvores1995.withColumn("problems_shoes_general", lit("Unknown"))
arvores1995 = arvores1995.withColumn("problems_lights_general", lit("Unknown"))
arvores1995 = arvores1995.withColumn("problems_trunk_general", lit("Unknown"))
arvores1995 = arvores1995.withColumn("problems_trunk_light", lit("Unknown"))
arvores1995 = arvores1995.withColumn("problems_trunk_wire", lit("Unknown"))
arvores1995 = arvores1995.withColumn("problems_trunk_other", lit("Unknown"))
arvores1995 = arvores1995.withColumn("problems_root_stone", lit("Unknown"))
arvores1995 = arvores1995.withColumn("problems_root_grate", lit("Unknown"))
arvores1995 = arvores1995.withColumn("problems_root_other", lit("Unknown"))
arvores1995 = arvores1995.withColumn("problems_branch_light", lit("Unknown"))
arvores1995 = arvores1995.withColumn("problems_branch_shoe", lit("Unknown"))
arvores1995 = arvores1995.withColumn("problems_branch_other", lit("Unknown"))
arvores1995 = arvores1995.withColumn("user_type", lit("Unknown"))

In [47]:
#Seleciono as colunas que quero nas arvores 1995
arvores1995_Guardar = arvores1995.select("tree_id","year","state","address","boroname","borocode","latin_species_name","comun_species_name","health","tree_diameter","sidewalk","problems_canopy_general","problems_guard_general","problems_wires_general","problems_paving_general","problems_outlet_general","problems_shoes_general","problems_lights_general","problems_trunk_general","problems_trunk_light","problems_trunk_wire","problems_trunk_other","problems_root_stone","problems_root_grate","problems_root_other", "problems_branch_light", "problems_branch_shoe","problems_branch_other", "user_type")
arvores1995_Guardar.printSchema()

root
 |-- tree_id: integer (nullable = true)
 |-- year: integer (nullable = false)
 |-- state: string (nullable = false)
 |-- address: string (nullable = true)
 |-- boroname: string (nullable = true)
 |-- borocode: integer (nullable = false)
 |-- latin_species_name: string (nullable = true)
 |-- comun_species_name: string (nullable = true)
 |-- health: string (nullable = true)
 |-- tree_diameter: integer (nullable = true)
 |-- sidewalk: string (nullable = true)
 |-- problems_canopy_general: string (nullable = false)
 |-- problems_guard_general: string (nullable = false)
 |-- problems_wires_general: string (nullable = false)
 |-- problems_paving_general: string (nullable = false)
 |-- problems_outlet_general: string (nullable = false)
 |-- problems_shoes_general: string (nullable = false)
 |-- problems_lights_general: string (nullable = false)
 |-- problems_trunk_general: string (nullable = false)
 |-- problems_trunk_light: string (nullable = false)
 |-- problems_trunk_wire: string (nul

In [48]:
#Seleciono as colunas que quero nas arvores 2005
arvores2005_Guardar = arvores2005.select("tree_id","year","state","address","boroname","borocode","latin_species_name","comun_species_name","health","tree_diameter","sidewalk","problems_canopy_general","problems_guard_general","problems_wires_general","problems_paving_general","problems_outlet_general","problems_shoes_general","problems_lights_general","problems_trunk_general","problems_trunk_light","problems_trunk_wire","problems_trunk_other","problems_root_stone","problems_root_grate","problems_root_other", "problems_branch_light", "problems_branch_shoe","problems_branch_other", "user_type")
arvores2005_Guardar.printSchema()

root
 |-- tree_id: integer (nullable = true)
 |-- year: integer (nullable = false)
 |-- state: string (nullable = false)
 |-- address: string (nullable = true)
 |-- boroname: string (nullable = false)
 |-- borocode: integer (nullable = true)
 |-- latin_species_name: string (nullable = false)
 |-- comun_species_name: string (nullable = false)
 |-- health: string (nullable = false)
 |-- tree_diameter: integer (nullable = true)
 |-- sidewalk: string (nullable = false)
 |-- problems_canopy_general: string (nullable = false)
 |-- problems_guard_general: string (nullable = false)
 |-- problems_wires_general: string (nullable = false)
 |-- problems_paving_general: string (nullable = false)
 |-- problems_outlet_general: string (nullable = false)
 |-- problems_shoes_general: string (nullable = false)
 |-- problems_lights_general: string (nullable = false)
 |-- problems_trunk_general: string (nullable = false)
 |-- problems_trunk_light: string (nullable = false)
 |-- problems_trunk_wire: string 

In [49]:
#Seleciono as colunas que quero nas arvores2015
arvores2015_Guardar = arvores2015.select("tree_id","year","state","address","boroname","borocode","latin_species_name","comun_species_name","health","tree_diameter","sidewalk","problems_canopy_general","problems_guard_general","problems_wires_general","problems_paving_general","problems_outlet_general","problems_shoes_general","problems_lights_general","problems_trunk_general","problems_trunk_light","problems_trunk_wire","problems_trunk_other","problems_root_stone","problems_root_grate","problems_root_other", "problems_branch_light", "problems_branch_shoe","problems_branch_other", "user_type")
arvores2015_Guardar.printSchema()

root
 |-- tree_id: integer (nullable = true)
 |-- year: integer (nullable = false)
 |-- state: string (nullable = true)
 |-- address: string (nullable = true)
 |-- boroname: string (nullable = true)
 |-- borocode: integer (nullable = true)
 |-- latin_species_name: string (nullable = false)
 |-- comun_species_name: string (nullable = false)
 |-- health: string (nullable = false)
 |-- tree_diameter: integer (nullable = true)
 |-- sidewalk: string (nullable = false)
 |-- problems_canopy_general: string (nullable = false)
 |-- problems_guard_general: string (nullable = false)
 |-- problems_wires_general: string (nullable = false)
 |-- problems_paving_general: string (nullable = false)
 |-- problems_outlet_general: string (nullable = false)
 |-- problems_shoes_general: string (nullable = false)
 |-- problems_lights_general: string (nullable = false)
 |-- problems_trunk_general: string (nullable = false)
 |-- problems_trunk_light: string (nullable = true)
 |-- problems_trunk_wire: string (nu

In [50]:
#Junta o dataset arvores2015_Guardar com o arvores2005_Guardar
arvoresFinal = arvores2015_Guardar.union(arvores2005_Guardar)

In [51]:
#Junta o dataset arvores1995 aos outros dois datasets
arvoresFinal_Guardar = arvoresFinal.union(arvores1995_Guardar)

In [52]:
arvoresFinal_Guardar.createOrReplaceTempView("episodes")

sqlized_df = spark.sql(
    """
    SELECT Distinct health
    FROM episodes
    """
)

sqlized_df.show()

+-------+
| health|
+-------+
|   Dead|
|   Good|
|Unknown|
|   Fair|
|   Poor|
+-------+



In [53]:
#write df to hive deltalake_table
#.select("customer_name","month", "sales", "year")  reorder columns to match parquet table sequence
arvoresFinal_Guardar \
    .select("tree_id","year","state","address","boroname","borocode","latin_species_name","comun_species_name","health","tree_diameter","sidewalk","problems_canopy_general","problems_guard_general","problems_wires_general","problems_paving_general","problems_outlet_general","problems_shoes_general","problems_lights_general","problems_trunk_general","problems_trunk_light","problems_trunk_wire","problems_trunk_other","problems_root_stone","problems_root_grate","problems_root_other", "problems_branch_light", "problems_branch_shoe","problems_branch_other", "user_type") \
    .write \
    .mode("overwrite") \
    .partitionBy("year") \
    .format("delta") \
    .save("hdfs://hdfs-nn:9000/AreasVerdes/silver/arvores")

In [54]:
spark.sql(
    """
    SHOW TABLES FROM AreasVerdes
    """
).show()

+-----------+---------+-----------+
|  namespace|tableName|isTemporary|
+-----------+---------+-----------+
|areasverdes|  arvores|      false|
|           | episodes|      false|
+-----------+---------+-----------+



In [56]:
spark.sql(
    """
    SELECT Distinct year,health
    FROM AreasVerdes.arvores_table
    """
).show()

AnalysisException: Table or view not found: AreasVerdes.arvores_table; line 3 pos 9;
'Distinct
+- 'Project ['year, 'health]
   +- 'UnresolvedRelation [AreasVerdes, arvores_table], [], false


In [88]:
spark.stop()