In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *


In [0]:
# Load the four silver-layer USDA corn tables, one DataFrame per metric.
# Every later cell in this notebook derives from these four handles.
df_yield = spark.table(
    "priceriskanalysis.silver.usda_corn_yield"
)
df_production = spark.table(
    "priceriskanalysis.silver.usda_corn_production"
)
df_area = spark.table(
    "priceriskanalysis.silver.usda_corn_area_harvested"
)
df_price = spark.table(
    "priceriskanalysis.silver.usda_corn_price_received"
)

In [0]:
# Collapse each silver table to one row per year.
# NOTE: `round`, `avg` and `sum` are the Spark column functions pulled in by
# the wildcard import from pyspark.sql.functions, not the Python builtins.
yield_yearly = df_yield.groupBy("year").agg(
    round(avg("Value"), 2).alias("avg_yield")
)
production_yearly = df_production.groupBy("year").agg(
    round(sum("Value"), 2).alias("total_production")
)
# NOTE(review): the alias "toal_area_harvested" is misspelled, but it becomes
# a column of the saved gold table, so it is left as-is here.
area_yearly = df_area.groupBy("year").agg(
    round(sum("Value"), 2).alias("toal_area_harvested")
)
price_yearly = df_price.groupBy("year").agg(
    round(avg("Value"), 2).alias("avg_price_received")
)

# Default (inner) joins on "year": a year absent from any one of the four
# aggregates does not appear in the combined result.
gold_yearly = (
    yield_yearly
    .join(production_yearly, "year")
    .join(area_yearly, "year")
    .join(price_yearly, "year")
    .orderBy("year")
)
display(gold_yearly)

In [0]:
# Persist the yearly summary; "overwrite" replaces whatever the table held before.
(
    gold_yearly.write
    .mode("overwrite")
    .saveAsTable("priceriskanalysis.gold.usda_corn_gold")
)

In [0]:
# Same four aggregates as the yearly summary, but keyed by state and year.
state_year_keys = ["state_name", "year"]

# NOTE(review): `yeild_state` is a misspelling of "yield"; the name is kept
# because notebook variables are global and may be referenced from other cells.
yeild_state = df_yield.groupBy(*state_year_keys).agg(
    round(avg("Value"), 2).alias("avg_yield")
)
production_state = df_production.groupBy(*state_year_keys).agg(
    round(sum("Value"), 2).alias("total_production")
)
# The misspelled alias "toal_area_harvested" is read by later cells and is
# written to the gold table, so it is preserved exactly.
area_state = df_area.groupBy(*state_year_keys).agg(
    round(sum("Value"), 2).alias("toal_area_harvested")
)
price_state = df_price.groupBy(*state_year_keys).agg(
    round(avg("Value"), 2).alias("avg_price_received")
)

# Default (inner) joins on (state_name, year): a state/year pair missing from
# any one of the four aggregates is excluded from the result.
gold_state = (
    yeild_state
    .join(production_state, state_year_keys)
    .join(area_state, state_year_keys)
    .join(price_state, state_year_keys)
    .orderBy("year")
)
display(gold_state)

In [0]:
# Persist the per-state, per-year summary; "overwrite" replaces the existing table.
(
    gold_state.write
    .mode("overwrite")
    .saveAsTable("priceriskanalysis.gold.usda_corn_gold_state_analysis")
)

In [0]:
# Production per unit of harvested area. The `when` guard avoids dividing by
# a non-positive area: rows that fail the `> 0` test get null instead.
production_per_area = when(
    col("toal_area_harvested") > 0,
    col("total_production") / col("toal_area_harvested"),
).otherwise(None)

# NOTE(review): the column name "yield_efficieny" is misspelled; it is kept
# because it becomes a column of the saved yield-efficiency table.
yield_efficiency = (
    gold_state
    .withColumn("yield_efficieny", round(production_per_area, 2))
    # Keep only the columns relevant to efficiency; avg_price_received is dropped.
    .select(
        "state_name",
        "year",
        "avg_yield",
        "total_production",
        "toal_area_harvested",
        "yield_efficieny",
    )
)
display(yield_efficiency)

In [0]:
# Persist the efficiency table; "overwrite" replaces the existing table.
(
    yield_efficiency.write
    .mode("overwrite")
    .saveAsTable("priceriskanalysis.gold.usda_corn_yield_efficiency")
)

In [0]:
# Per-state variability of each metric across years, used as a risk measure.
# Each pair is (source column in gold_state, output column name).
risk_metrics = [
    ("avg_yield", "yield_std"),
    ("total_production", "production_std"),
    ("toal_area_harvested", "area_std"),
    ("avg_price_received", "price_std"),
]

# `stddev` is Spark's sample standard deviation (alias of stddev_samp),
# rounded to two decimals like the other gold metrics.
state_risk = gold_state.groupBy("state_name").agg(
    *[
        round(stddev(source_col), 2).alias(output_col)
        for source_col, output_col in risk_metrics
    ]
)
display(state_risk)

# Persist the risk table; "overwrite" replaces the existing table.
(
    state_risk.write
    .mode("overwrite")
    .saveAsTable("priceriskanalysis.gold.usda_corn_state_risk")
)