In [0]:
print('hello world')

hello world


In [0]:
from pyspark.sql import functions as fn, types as T
import pandas as pd



In [0]:
# lets import a couple of time series datasets

# first a births dataset
births_url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/daily-total-female-births.csv"
births_pd = pd.read_csv(births_url)
births_df = spark.createDataFrame(births_pd)

# and a sunspots dataset
sunspots_url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/monthly-sunspots.csv"
sunspots_pd = pd.read_csv(sunspots_url)
sunspots_df = spark.createDataFrame(sunspots_pd)

In [0]:
# now we need to add the two datasets together but one is datewies and one monthwise
# we could use a join or we could add compatible columns and union

births_df_compatible = births_df.withColumn(
  'Date', fn.to_date('Date', 'yyyy-MM-dd')
).withColumn(
  'Sunspots' , fn.lit(None).cast(T.LongType())
).select(
  'Date',
  'Sunspots',
  'Births',
)

sunspots_df_compatible = sunspots_df.withColumn(
  'Date', fn.to_date('Month', 'yyyy-MM')
).withColumn(
   'Births' , fn.lit(None).cast(T.DoubleType())
).select(
  'Date',
  'Sunspots',
  'Births',
).filter(
  fn.year('Date') == 1959
)

unioned_df = sunspots_df_compatible.union(births_df_compatible)

display(unioned_df)

Date,Sunspots,Births
1959-01-01,217.4,
1959-02-01,143.1,
1959-03-01,185.7,
1959-04-01,163.3,
1959-05-01,172.0,
1959-06-01,168.7,
1959-07-01,149.6,
1959-08-01,199.6,
1959-09-01,145.2,
1959-10-01,111.4,


Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

In [0]:
grouped_df = unioned_df.groupBy(
  fn.month('Date').alias('Month')
).agg(
  fn.sum('Births').alias('Births'),
  fn.sum('Sunspots').alias('Sunspots'),
).orderBy(
  'Month'
)

display(grouped_df)

Month,Births,Sunspots
1,1213.0,217.4
2,1148.0,143.1
3,1218.0,185.7
4,1195.0,163.3
5,1208.0,172.0
6,1212.0,168.7
7,1300.0,149.6
8,1351.0,199.6
9,1446.0,145.2
10,1368.0,111.4


Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

In [0]:
joined_df = sunspots_df_compatible.select('Date', 'Sunspots').join(
  births_df_compatible.select('Date', 'Births'),
  on = sunspots_df_compatible.Date == births_df_compatible.Date,
  how = 'outer'
)

display(joined_df)

Date,Sunspots,Date.1,Births
1959-01-01,217.4,1959-01-01,35
,,1959-01-02,32
,,1959-01-03,30
,,1959-01-04,31
,,1959-01-05,44
,,1959-01-06,29
,,1959-01-07,45
,,1959-01-08,43
,,1959-01-09,38
,,1959-01-10,27


Databricks visualization. Run in Databricks to view.