In [1]:
%load_ext sparksql_magic
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, col

# Reuse the active Spark session if one exists; otherwise start one
# with the application name "CreateTable".
spark = (
    SparkSession.builder
    .appName("CreateTable")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/11 18:13:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [11]:
# Read every gzipped JSON dump under /data, keep only the IssuesEvent rows,
# and project the columns used by the rest of the notebook.
df = (
    spark.read.json("/data/*.json.gz")
    .filter(col("type") == "IssuesEvent")
    .select(
        col("actor"),
        col("created_at"),
        col("id").alias("event_id"),  # top-level event id, renamed to event_id
        col("org"),
        col("repo"),
        col("payload.action"),  # nested payload fields surface as action / issue
        col("payload.issue"),
    )
)

                                                                                

In [12]:
# Register the DataFrame as a temporary view called "df" so that the
# %%sql cells can query it by name.
df.createOrReplaceTempView(name="df")

In [19]:
%%sql
-- Expand the repo struct into its individual fields.
SELECT repo.*
FROM df

id,name,url
645980442,mo9a7i/time_now,https://api.github.com/repos/mo9a7i/time_now
599907731,tangshimin/MuJing,https://api.github.com/repos/tangshimin/MuJing
675562769,f13end/metaverse-news,https://api.github.com/repos/f13end/metaverse-news
656067195,kinjalsoftnoesis/Test,https://api.github.com/repos/kinjalsoftnoesis/Test
733519447,haricotando/mastermind,https://api.github.com/repos/haricotando/mastermind
509162239,wfjsw/status-winterco-org,https://api.github.com/repos/wfjsw/status-winterco-org
737410173,yaya1986mia/mint,https://api.github.com/repos/yaya1986mia/mint
458404007,iptv-org/database,https://api.github.com/repos/iptv-org/database
458404007,iptv-org/database,https://api.github.com/repos/iptv-org/database
458404007,iptv-org/database,https://api.github.com/repos/iptv-org/database


## 🌊🐳🧊 Example of creating an Iceberg Table

In [7]:
%%sql
-- Iceberg table for issue events, partitioned by the day of created_at.
CREATE TABLE IF NOT EXISTS iceberg.github_issues (
    actor      STRING,
    created_at TIMESTAMP,
    event_id   STRING,
    org        STRING,
    action     STRING,
    issue      STRING
)
USING iceberg
PARTITIONED BY (days(created_at));

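## To insert data
Writing `df` straight into the table (the commented-out cell below) will fail because the data types are wrong: the DataFrame's schema does not match the table definition above. For instance, `df` also carries a struct-typed `repo` column that the table does not declare.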

In [18]:
# Intentionally left commented out: per the note above this cell, writing df
# as-is fails because its column types do not match the table definition.
# df.write.format("iceberg") \
#     .mode("overwrite") \
#     .saveAsTable("iceberg.github_issues")

In [10]:
%%sql
-- Show the column names, data types and partitioning of the issues table.
DESC TABLE iceberg.github_issues;

col_name,data_type,comment
actor,string,
created_at,timestamp,
event_id,string,
org,string,
action,string,
issue,string,
,,
# Partitioning,,
Part 0,days(created_at),


In [15]:
%%sql
-- Describe the issues table again, this time with the FORMATTED option.
DESCRIBE TABLE FORMATTED iceberg.github_issues;

col_name,data_type,comment
actor,string,
created_at,timestamp,
event_id,string,
org,string,
action,string,
issue,string,
,,
# Partitioning,,
Part 0,days(created_at),
,,


In [16]:
%%sql
-- Insert one hand-written row; values are listed in table column order
-- (actor, created_at, event_id, org, action, issue).
INSERT INTO iceberg.github_issues
VALUES (
    'test_actor',
    TIMESTAMP '2024-03-11 12:00:00',
    '12345',
    'test_org',
    'opened',
    'test_issue'
);


                                                                                

In [17]:
%%sql
-- Read back everything currently stored in the issues table.
SELECT *
FROM iceberg.github_issues

actor,created_at,event_id,org,action,issue
test_actor,2024-03-11 12:00:00,12345,test_org,opened,test_issue


In [28]:
%%sql
-- Repository dimension table with id, name and url columns.
CREATE TABLE IF NOT EXISTS iceberg.dim_repo (
    id   BIGINT,
    name STRING,
    url  STRING
)
USING iceberg

In [23]:
%%sql
-- Replace the contents of dim_repo with the distinct repo values found in
-- the df view; the id is cast to BIGINT to match the table column type.
INSERT OVERWRITE iceberg.dim_repo
SELECT DISTINCT
    CAST(repo.id AS BIGINT) AS id,
    repo.name AS name,
    repo.url AS url
FROM df

                                                                                

In [24]:
%%sql
-- Read back the rows stored in the repository dimension table.
SELECT *
FROM iceberg.dim_repo

id,name,url
645980442,mo9a7i/time_now,https://api.github.com/repos/mo9a7i/time_now
599907731,tangshimin/MuJing,https://api.github.com/repos/tangshimin/MuJing
675562769,f13end/metaverse-news,https://api.github.com/repos/f13end/metaverse-news
656067195,kinjalsoftnoesis/Test,https://api.github.com/repos/kinjalsoftnoesis/Test
733519447,haricotando/mastermind,https://api.github.com/repos/haricotando/mastermind
509162239,wfjsw/status-winterco-org,https://api.github.com/repos/wfjsw/status-winterco-org
737410173,yaya1986mia/mint,https://api.github.com/repos/yaya1986mia/mint
458404007,iptv-org/database,https://api.github.com/repos/iptv-org/database
458404007,iptv-org/database,https://api.github.com/repos/iptv-org/database
458404007,iptv-org/database,https://api.github.com/repos/iptv-org/database


In [27]:
%%sql
-- Clean up the demo dimension table. IF EXISTS keeps this cell re-runnable:
-- a plain DROP TABLE raises an error when the table has already been dropped
-- or was never created.
DROP TABLE IF EXISTS iceberg.dim_repo