In [1]:
import gzip
import json
import shutil

import polars as pl

In [2]:
# Decompress the gzip archive into a plain JSON file alongside it.
# shutil.copyfileobj copies in fixed-size chunks, so the decompressed payload
# is streamed to disk instead of being loaded into memory in one read() call.
with gzip.open('../data/2022-01-01-0.json.gz', 'rb') as gzip_file, open('../data/2022-01-01-0.json', 'wb') as decompressed_file:
    shutil.copyfileobj(gzip_file, decompressed_file)

In [3]:
# Specify the path to the input file and output file
input_file_path = '../data/2022-01-01-0.json'
output_file_path = '../data/combined/2022-01-01-0.json'

# Parse the newline-delimited JSON file into a list with one dict per line.
# - The encoding is pinned to UTF-8 so decoding does not depend on the
#   platform's default locale encoding.
# - The file is iterated line by line rather than read and split in one go,
#   so only the parsed objects (not extra copies of the raw text) stay in memory.
# - Blank / whitespace-only lines are skipped, so an empty file yields an
#   empty list instead of a JSONDecodeError from json.loads('').
with open(input_file_path, 'r', encoding='utf-8') as f:
    data_list = [json.loads(line) for line in f if line.strip()]


In [22]:
# Struct dtypes for the nested objects, declared one at a time and then
# assembled into the top-level `schema` mapping passed to polars.

# Fields kept from the "actor" object.
actor_dtype = pl.Struct({
    "id": pl.Int32,
    "login": pl.Utf8,
    "display_login": pl.Utf8,
    "gravatar_id": pl.Utf8,
    "url": pl.Utf8,
    "avatar_url": pl.Utf8,
})

# Fields kept from the "repo" object.
repo_dtype = pl.Struct({
    "id": pl.Int32,
    "name": pl.Utf8,
    "url": pl.Utf8,
})

# Author sub-object of a single commit entry.
commit_author_dtype = pl.Struct({
    "email": pl.Utf8,
    "name": pl.Utf8,
})

# One element of the payload's "commits" list.
commit_dtype = pl.Struct({
    "sha": pl.Utf8,
    "author": commit_author_dtype,
    "message": pl.Utf8,
    "distinct": pl.Boolean,
    "url": pl.Utf8,
})

# Fields kept from the "payload" object.
payload_dtype = pl.Struct({
    "push_id": pl.Utf8,
    "size": pl.Int32,
    "distinct_size": pl.Int32,
    "ref": pl.Utf8,
    "head": pl.Utf8,
    "before": pl.Utf8,
    "commits": pl.List(commit_dtype),
})

# Fields kept from the "org" object.
org_dtype = pl.Struct({
    "id": pl.Int32,
    "login": pl.Utf8,
    "gravatar_id": pl.Utf8,
    "url": pl.Utf8,
    "avatar_url": pl.Utf8,
})

# Top-level column name -> dtype mapping; key order defines column order.
schema = {
    "id": pl.Utf8,
    "type": pl.Utf8,
    "actor": actor_dtype,
    "repo": repo_dtype,
    "payload": payload_dtype,
    "public": pl.Boolean,
    "created_at": pl.Utf8,
    "org": org_dtype,
    "other": pl.Utf8,
}


In [35]:
# Build the DataFrame from the parsed records, using the explicit schema
# rather than letting polars infer column dtypes.
df = pl.from_dicts(
    data_list,
    schema=schema,
)

In [36]:
# Raw timestamps look like: 2022-01-01T00:42:04Z
# Parse `created_at` from string to Datetime (the expression keeps the column
# name, so the parsed values replace the original column), then drop `public`.
df = (
    df
    .with_columns(
        pl.col('created_at').str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%SZ")
    )
    .drop('public')
)
df

id,type,actor,repo,payload,created_at,org,other
str,str,struct[6],struct[3],struct[7],datetime[μs],struct[5],str
"""19541174222""","""PushEvent""","{85853927,""Mengrio"",""Mengrio"","""",""https://api.github.com/users/Mengrio"",""https://avatars.githubusercontent.com/u/85853927?""}","{443354520,""Mengrio/Hisoka-Morou"",""https://api.github.com/repos/Mengrio/Hisoka-Morou""}","{""8732292002"",1,1,""refs/heads/master"",""51608b1de914eeab7fc101d73d3c0a8756d40d1f"",""6f59f9b4e92c202318165eecd53e82076dc0c008"",[{""51608b1de914eeab7fc101d73d3c0a8756d40d1f"",{""85853927+Mengrio@users.noreply.github.com"",""Mengrio""},""update rio.json"",true,""https://api.github.com/repos/Mengrio/Hisoka-Morou/commits/51608b1de914eeab7fc101d73d3c0a8756d40d1f""}]}",2022-01-01 00:00:00,"{null,null,null,null,null}",
"""19541174225""","""CreateEvent""","{35642244,""denis-onder"",""denis-onder"","""",""https://api.github.com/users/denis-onder"",""https://avatars.githubusercontent.com/u/35642244?""}","{443443039,""denis-onder/design-patterns-and-solid-principles"",""https://api.github.com/repos/denis-onder/design-patterns-and-solid-principles""}","{null,null,null,""master"",null,null,null}",2022-01-01 00:00:00,"{null,null,null,null,null}",
"""19541174226""","""IssueCommentEv...","{1145762,""sffc"",""sffc"","""",""https://api.github.com/users/sffc"",""https://avatars.githubusercontent.com/u/1145762?""}","{377296713,""rust-diplomat/diplomat"",""https://api.github.com/repos/rust-diplomat/diplomat""}","{null,null,null,null,null,null,null}",2022-01-01 00:00:00,"{85967389,""rust-diplomat"","""",""https://api.github.com/orgs/rust-diplomat"",""https://avatars.githubusercontent.com/u/85967389?""}",
"""19541174233""","""CreateEvent""","{41898282,""github-actions[bot]"",""github-actions"","""",""https://api.github.com/users/github-actions[bot]"",""https://avatars.githubusercontent.com/u/41898282?""}","{439105823,""secbyd/log4shell"",""https://api.github.com/repos/secbyd/log4shell""}","{null,null,null,""log4shell_info_20211231"",null,null,null}",2022-01-01 00:00:00,"{null,null,null,null,null}",
"""19541174237""","""PullRequestEve...","{39919020,""brunosana"",""brunosana"","""",""https://api.github.com/users/brunosana"",""https://avatars.githubusercontent.com/u/39919020?""}","{405684714,""brunosana/bet-management-backend"",""https://api.github.com/repos/brunosana/bet-management-backend""}","{null,null,null,null,null,null,null}",2022-01-01 00:00:00,"{null,null,null,null,null}",
"""19541174243""","""IssueCommentEv...","{37936606,""github-learning-lab[bot]"",""github-learning-lab"","""",""https://api.github.com/users/github-learning-lab[bot]"",""https://avatars.githubusercontent.com/u/37936606?""}","{443443143,""desfolio/github-upload"",""https://api.github.com/repos/desfolio/github-upload""}","{null,null,null,null,null,null,null}",2022-01-01 00:00:00,"{null,null,null,null,null}",
"""19541174242""","""PushEvent""","{41898282,""github-actions[bot]"",""github-actions"","""",""https://api.github.com/users/github-actions[bot]"",""https://avatars.githubusercontent.com/u/41898282?""}","{369713083,""civictechsweden/JagVillHaVaccin"",""https://api.github.com/repos/civictechsweden/JagVillHaVaccin""}","{""8732292011"",1,1,""refs/heads/master"",""7b2c42c22c984ed5eea9315e6cacc968727ed587"",""109c6b6744a523988a3fe4d68b5a22905fe22c61"",[{""7b2c42c22c984ed5eea9315e6cacc968727ed587"",{""pierre@mesu.re"",""Pierre Mesure (Github Actions)""},""Updating the times for Region 01"",true,""https://api.github.com/repos/civictechsweden/JagVillHaVaccin/commits/7b2c42c22c984ed5eea9315e6cacc968727ed587""}]}",2022-01-01 00:00:00,"{44578964,""civictechsweden"","""",""https://api.github.com/orgs/civictechsweden"",""https://avatars.githubusercontent.com/u/44578964?""}",
"""19541174244""","""PushEvent""","{83132190,""SamaaKhalifa"",""SamaaKhalifa"","""",""https://api.github.com/users/SamaaKhalifa"",""https://avatars.githubusercontent.com/u/83132190?""}","{430153049,""yomnamahmoud100/Transportation_Project"",""https://api.github.com/repos/yomnamahmoud100/Transportation_Project""}","{""8732292014"",1,1,""refs/heads/master"",""a5cbb89c7adfd4a1601151ad17aad6db23eebfa3"",""6f05fa0ce05a210985e531ea50673bb93c4665d5"",[{""a5cbb89c7adfd4a1601151ad17aad6db23eebfa3"",{""83132190+SamaaKhalifa@users.noreply.github.com"",""SamaaKhalifa""},""DONE"",true,""https://api.github.com/repos/yomnamahmoud100/Transportation_Project/commits/a5cbb89c7adfd4a1601151ad17aad6db23eebfa3""}]}",2022-01-01 00:00:00,"{null,null,null,null,null}",
"""19541174246""","""WatchEvent""","{7829787,""killergeek"",""killergeek"","""",""https://api.github.com/users/killergeek"",""https://avatars.githubusercontent.com/u/7829787?""}","{268578645,""c3d2/C3-PR"",""https://api.github.com/repos/c3d2/C3-PR""}","{null,null,null,null,null,null,null}",2022-01-01 00:00:00,"{729895,""c3d2"","""",""https://api.github.com/orgs/c3d2"",""https://avatars.githubusercontent.com/u/729895?""}",
"""19541174256""","""DeleteEvent""","{8078968,""jbrockmendel"",""jbrockmendel"","""",""https://api.github.com/users/jbrockmendel"",""https://avatars.githubusercontent.com/u/8078968?""}","{96820953,""jbrockmendel/pandas"",""https://api.github.com/repos/jbrockmendel/pandas""}","{null,null,null,""depr-now"",null,null,null}",2022-01-01 00:00:01,"{null,null,null,null,null}",


In [37]:
# Persist the frame as a snappy-compressed Parquet file in the working directory.
parquet_path = './2022-01-01-0.parquet'
df.write_parquet(parquet_path, compression='snappy')