# Silver Layer Creation

In [0]:
%run ./Transformation_Functions

## Read Files From Bronze Layer

In [0]:
# Configure OAuth (client-credentials) access to the storage account for this Spark session.
# The client secret is fetched from a Databricks secret scope instead of being written here.
service_credential = dbutils.secrets.get(scope="databricks-app-kv", key="databricks-application")

# Every setting below is keyed by this storage account's DFS endpoint.
account_host = "20230821desa.dfs.core.windows.net"

oauth_settings = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": "de4ff859-02b1-4e2f-9d16-b578fa03df4f",
    "fs.azure.account.oauth2.client.secret": service_credential,
    "fs.azure.account.oauth2.client.endpoint": "https://login.microsoftonline.com/33da9f3f-4c1a-4640-8ce1-3f63024aea1d/oauth2/token",
}

# Dicts iterate in insertion order, so the settings are applied in the order listed above.
for setting_prefix, setting_value in oauth_settings.items():
    spark.conf.set(f"{setting_prefix}.{account_host}", setting_value)

In [0]:
# Load every Parquet file located one directory level below BronzeLayer into one DataFrame.
# No reader options are set: 'header' is a CSV-reader option and has no meaning for Parquet,
# whose files carry their own schema.
df = (
    spark.read.format('parquet')
    .load('abfss://team2-project2@20230821desa.dfs.core.windows.net/BronzeLayer/*/*.parquet')
)

## Create Actors Table

In [0]:
# Build the actors table: one row per non-null actor_id.
# actor_gravatar_id is deliberately not selected; per the original author's note, every
# record holds an empty string in that column.
actor_df = (
    df.select("actor_id", "actor_login", "actor_url", "actor_avatar", "actor_display_login")
    # Discard rows without an id first so they never take part in deduplication.
    .na.drop(subset=["actor_id"])
    # Keep a single row per actor_id; which of the duplicate rows survives is not specified.
    .dropDuplicates(subset=["actor_id"])
)

## Create Repo Table

In [0]:
# Build the repos table: deduplicate on repo_id, then discard rows whose repo_id is null.
repo_df = (
    df.select('repo_id', 'repo_name', 'repo_url')
    .dropDuplicates(['repo_id'])
    .dropna(subset=['repo_id'])
)

## Create Org Table

In [0]:
# Build the orgs table: one row per non-null org_id.
# org_gravatar_id is deliberately not selected; per the original author's note, every
# record holds an empty string in that column.
org_df = (
    df.select('org_id', 'org_login', 'org_url', 'org_avatar_url')
    # Discard rows without an id first so they never take part in deduplication.
    .na.drop(subset=['org_id'])
    # Keep a single row per org_id; which of the duplicate rows survives is not specified.
    .dropDuplicates(subset=['org_id'])
)

## Create Event Tables

In [0]:
# create_event_tables is not defined in this notebook; presumably it comes from the
# Transformation_Functions notebook pulled in by %run at the top — confirm there.
# The result is indexed by event-type name (e.g. "PushEvent") when the tables are saved.
event_tables = create_event_tables(df)

## Save tables to data lake

In [0]:
# Write the three dimension tables to the Silver layer as Parquet, replacing earlier output.
# The number paired with each table is its repartition count, i.e. the number of partitions
# (and so at most that many part files) the table is written with.
silver_root = "abfss://team2-project2@20230821desa.dfs.core.windows.net/SilverLayer"

dimension_tables = [
    ("actors", actor_df, 3),
    ("repos", repo_df, 5),
    ("orgs", org_df, 1),
]

for table_name, table_df, num_partitions in dimension_tables:
    (
        table_df.repartition(num_partitions)
        .write.format("parquet")
        .mode("overwrite")
        .save(f"{silver_root}/{table_name}")
    )

In [0]:
# Write one Parquet dataset per event type to the Silver layer, replacing earlier output.
# Each event type maps to the repartition count used for its write; the folder name under
# SilverLayer is the event-type name itself.
silver_root = "abfss://team2-project2@20230821desa.dfs.core.windows.net/SilverLayer"

event_partition_counts = {
    "PullRequestEvent": 150,
    "PushEvent": 90,
    "IssueCommentEvent": 50,
    "CreateEvent": 3,
    "DeleteEvent": 1,
    "WatchEvent": 1,
    "ReleaseEvent": 2,
    "ForkEvent": 13,
    "IssuesEvent": 9,
    "PullRequestReviewEvent": 20,
    "PullRequestReviewCommentEvent": 10,
    "CommitCommentEvent": 2,
    "MemberEvent": 1,
    "PublicEvent": 1,
    "GollumEvent": 1,
}

# Dicts iterate in insertion order, so tables are written in the order listed above.
for event_type, num_partitions in event_partition_counts.items():
    (
        event_tables[event_type]
        .repartition(num_partitions)
        .write.format("parquet")
        .mode("overwrite")
        .save(f"{silver_root}/{event_type}")
    )