In [1]:
import argparse

from pyspark import SparkConf
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark import SparkContext
import pyspark.sql.functions as F
import pyspark.sql.types as T

ModuleNotFoundError: No module named 'pyspark'

In [None]:
# Spark session configuration: enables the Iceberg SQL extensions and registers
# a Hadoop-type Iceberg catalog named "demo" (also the default catalog) whose
# warehouse lives in the MinIO-backed "gh-archive-data-curated" bucket.
conf = (
    SparkConf()
    .set("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")  # Use Iceberg with Spark
    .set("spark.sql.defaultCatalog", "demo")  # Name of the Iceberg catalog
    .set("spark.sql.catalog.demo", "org.apache.iceberg.spark.SparkCatalog")
    .set("spark.sql.catalog.demo.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    .set("spark.sql.catalog.demo.warehouse", "s3a://gh-archive-data-curated/iceberg")
    .set("spark.sql.catalog.demo.s3.endpoint", "http://minio:9000")
    .set("spark.sql.catalogImplementation", "in-memory")
    .set("spark.sql.catalog.demo.type", "hadoop")  # Iceberg catalog type
    # The notebook parses `created_at` with a literal 'Z' in the pattern, which
    # drops the UTC designator and interprets the value in the session time
    # zone. Pinning the session to UTC keeps the parsed instants, the derived
    # year/month/day/hour columns and the epoch `ts` correct on any host.
    .set("spark.sql.session.timeZone", "UTC")
    # Time settings carry explicit units: a bare number is read as milliseconds
    # for heartbeatInterval but as *seconds* for network.timeout, so the former
    # unit-less "400000" meant ~4.6 days instead of the intended 400 seconds.
    # network.timeout must stay larger than heartbeatInterval.
    .set("spark.executor.heartbeatInterval", "300s")
    .set("spark.network.timeout", "400s")
)

spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Disable below line to see INFO logs
spark.sparkContext.setLogLevel("ERROR")


def load_config(spark_context: SparkContext):
    """Configure the S3A filesystem of ``spark_context`` for the MinIO endpoint.

    Sets the ``fs.s3a.*`` keys on the context's Hadoop configuration: access
    key, secret key, endpoint, SSL disabled and path-style access enabled.

    Credentials are taken from the ``MINIO_ACCESS_KEY`` / ``MINIO_SECRET_KEY``
    environment variables when they are set, so real secrets do not have to be
    committed with the notebook; otherwise the previous hard-coded development
    values are used, which keeps existing setups working unchanged.

    Args:
        spark_context: Active SparkContext whose Hadoop configuration is
            mutated in place.

    Returns:
        None.
    """
    import os  # local import keeps this cell self-contained

    # Resolve the JVM Hadoop configuration once instead of once per key.
    hadoop_conf = spark_context._jsc.hadoopConfiguration()

    s3a_settings = {
        "fs.s3a.access.key": os.environ.get("MINIO_ACCESS_KEY", "minio_access_key"),
        "fs.s3a.secret.key": os.environ.get("MINIO_SECRET_KEY", "minio_secret_key"),
        "fs.s3a.endpoint": "http://minio:9000",
        "fs.s3a.connection.ssl.enabled": "false",
        "fs.s3a.path.style.access": "true",
        # Optional tuning knobs, left disabled as in the original cell:
        # "fs.s3a.attempts.maximum": "1",
        # "fs.s3a.connection.establish.timeout": "5000",
        # "fs.s3a.connection.timeout": "10000",
    }
    for key, value in s3a_settings.items():
        hadoop_conf.set(key, value)


# Apply the fs.s3a.* settings to the Hadoop configuration of the session's SparkContext.
load_config(spark.sparkContext)


SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/home/glue_user/spark/jars/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/glue_user/spark/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/glue_user/aws-glue-libs/jars/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/glue_user/aws-glue-libs/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Reload4jLoggerFactory]
24/02/10 23:15:29 INFO SparkContext: Running Spark version 3.3.0-amzn-1
24/02/10 23:15:29 INFO ResourceUtils: No custom resources configured for spark.driver.
24/02/10 23:15:29 INFO SparkContext: Submitted application: pyspark-shell
24/02/10 23:15:29 INFO Res

In [None]:
# Input location passed to spark.read.json in the transform cell: a single
# year/month/day/hour prefix of the raw GH Archive bucket.
read_filepath = "s3a://gh-archive-data-raw/gh-archives/year=2023/month=01/day=01/hour=4/"

In [None]:
if __name__ == "__main__":
    # Transform cell: read the raw GH Archive JSON at `read_filepath`, keep the
    # event types of interest, flatten the nested repo/actor/org/payload fields
    # and add calendar parts plus an epoch-seconds column. The result is left
    # in `main_df` for the cells that follow.

    # Disabled CLI entry point, kept for when this notebook is turned back into
    # a script:
    # parser = argparse.ArgumentParser()
    # parser.add_argument("--date", help="Date in format YYYY-MM-DD", required=True)
    # parser.add_argument("--source_files_pattern", help="Source files pattern for the GH archive to process.", required=True)
    # parser.add_argument("--destination_files_pattern", help="Destination files pattern for the GH archive to process.", required=True)

    # args = parser.parse_args()
    # date = args.date

    # read_filepath = args.source_files_pattern
    # write_filepath = args.destination_files_pattern
    # read_filepath=read_filepath.format(date)
    # print(f"date received: {date}")

    print(f"read_filepath: {read_filepath}")
    df = spark.read.json(read_filepath)

    allowed_events = [
        "PushEvent",
        "ForkEvent",
        "PublicEvent",
        "WatchEvent",
        "PullRequestEvent",
    ]

    # Filter on the source column *before* projecting. The previous version
    # filtered on "type" after it had been aliased to "event_type", which only
    # worked because Spark resolves missing references through the projection.
    events_df = df.filter(F.col("type").isin(allowed_events))

    # NOTE(review): the schema is inferred from the JSON, so nested fields such
    # as `org.*` or `payload.pull_request.*` presumably fail to resolve if the
    # input contains no event carrying them — confirm for small inputs.
    main_df = events_df.select(
        F.col("id").alias("event_id"),
        F.col("type").alias("event_type"),
        # The quoted 'Z' is matched as a literal; the value is interpreted in
        # the Spark session time zone.
        F.to_timestamp(F.col("created_at"), "yyyy-MM-dd'T'HH:mm:ss'Z'").alias("created_at"),
        F.col("repo.id").alias("repository_id"),
        F.col("repo.name").alias("repository_name"),
        F.col("repo.url").alias("repository_url"),
        F.col("actor.id").alias("user_id"),
        F.col("actor.login").alias("user_name"),
        F.col("actor.url").alias("user_url"),
        F.col("actor.avatar_url").alias("user_avatar_url"),
        F.col("org.id").alias("org_id"),
        F.col("org.login").alias("org_name"),
        F.col("org.url").alias("org_url"),
        F.col("org.avatar_url").alias("org_avatar_url"),
        F.col("payload.push_id").alias("push_id"),
        F.col("payload.distinct_size").alias("number_of_commits"),
        F.col("payload.pull_request.base.repo.language").alias("language"),
    )

    # Parenthesised chain instead of backslash continuations: the old chain
    # ended with a dangling "\" that only parsed because a whitespace-only
    # line happened to follow it.
    main_df = (
        main_df
        .withColumn("year", F.year("created_at"))
        .withColumn("month", F.month("created_at"))
        .withColumn("day", F.dayofmonth("created_at"))
        .withColumn("hour", F.hour("created_at"))
        .withColumn("minute", F.minute("created_at"))
        .withColumn("second", F.second("created_at"))
        # Epoch seconds. `created_at` is already a timestamp column here, so a
        # parse format is not needed (it is ignored for timestamp input).
        .withColumn("ts", F.unix_timestamp("created_at"))
    )

    # Disabled draft of a partitioned/bucketed write (the original note says
    # the target is GCS): partition by year, month and day, bucket by hour and
    # sort by hour and minute.
    # date = date.replace("-", "")

    # main_df.write \
    # .partitionBy("year", "month", "day") \
    # .bucketBy(24, "hour") \
    # .sortBy("hour", "minute") \
    # .option("path", write_filepath) \
    # .option("header", True) \
    # .mode("append") \
    # .saveAsTable(f"table{date}")

read_filepath: s3a://gh-archive-data-raw/gh-archives/year=2023/month=01/day=01/hour=4/


                                                                                

In [None]:
# Preview the transformed events; show() prints only the first rows (20 by default) with long values truncated.
main_df.show()

                                                                                

+-----------+----------------+-------------------+-------------+--------------------+--------------------+---------+-------------------+--------------------+--------------------+--------+-------------------+--------------------+--------------------+-----------+-----------------+----------+----+-----+---+----+------+------+----------+
|   event_id|      event_type|         created_at|repository_id|     repository_name|      repository_url|  user_id|          user_name|            user_url|     user_avatar_url|  org_id|           org_name|             org_url|      org_avatar_url|    push_id|number_of_commits|  language|year|month|day|hour|minute|second|        ts|
+-----------+----------------+-------------------+-------------+--------------------+--------------------+---------+-------------------+--------------------+--------------------+--------+-------------------+--------------------+--------------------+-----------+-----------------+----------+----+-----+---+----+------+------+----

In [None]:
# Persist the transformed events as table "gh.gh_archive_data" in overwrite mode.
# The name carries no catalog prefix, so it resolves against the session's
# default catalog ("demo", per spark.sql.defaultCatalog in the config cell).
main_df.write.mode("overwrite").saveAsTable("gh.gh_archive_data")



SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
                                                                                