In [12]:
import urllib.request
import gzip
import json
import polars as pl

In [16]:
# GH Archive file to load: a gzip-compressed text file that this cell parses
# as one JSON document per line.
url = 'https://data.gharchive.org/2023-01-01-0.json.gz'

# Headers sent with the request; only an explicit User-Agent is set.
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# Bundle the URL and headers into a single request object.
request = urllib.request.Request(url, headers=headers)

# Download the compressed payload. The context manager closes the HTTP
# connection as soon as the body has been read (it was previously left open),
# and the timeout keeps a stalled connection from hanging the kernel forever.
with urllib.request.urlopen(request, timeout=60) as response:
    compressed = response.read()

# Decompress the gzip payload and decode the bytes as UTF-8 text.
data = gzip.decompress(compressed).decode('utf-8')

# One JSON document per line. Split on '\n' only: str.splitlines() would also
# break on characters such as U+2028, which may legally appear unescaped
# inside a JSON string. Blank lines are skipped so json.loads never sees an
# empty string.
dicts = [line for line in data.split('\n') if line.strip()]

# Parse every line into a Python dict.
data_list = [json.loads(d) for d in dicts]

# Display the first event to inspect the record structure.
data_list[0]

{'id': '26163418658',
 'type': 'PushEvent',
 'actor': {'id': 119809980,
  'login': 'ehwu106',
  'display_login': 'ehwu106',
  'gravatar_id': '',
  'url': 'https://api.github.com/users/ehwu106',
  'avatar_url': 'https://avatars.githubusercontent.com/u/119809980?'},
 'repo': {'id': 582174284,
  'name': 'ehwu106/Gmail-Filter-Solution',
  'url': 'https://api.github.com/repos/ehwu106/Gmail-Filter-Solution'},
 'payload': {'push_id': 12147229638,
  'size': 2,
  'distinct_size': 2,
  'ref': 'refs/heads/main',
  'head': '8fbcb0a5be7f1ae98c620ffc445f8212da279c4b',
  'before': '27e76fd2920c98cf825daefa9469cb202944d96d',
  'commits': [{'sha': '01882b15808c6cc63f4075eea105de4f608e23aa',
    'author': {'email': 'howard.wu@travasecurity.com', 'name': 'Howard Wu'},
    'message': 'pushing',
    'distinct': True,
    'url': 'https://api.github.com/repos/ehwu106/Gmail-Filter-Solution/commits/01882b15808c6cc63f4075eea105de4f608e23aa'},
   {'sha': '8fbcb0a5be7f1ae98c620ffc445f8212da279c4b',
    'author': 

In [17]:
# Explicit Polars schema for the event records loaded above.
# The nested dtypes are built first and then assembled into `schema`, whose
# key order determines the column order of the resulting DataFrame.

# Account that triggered the event.
actor_dtype = pl.Struct(
    {
        "id": pl.Int32,
        "login": pl.Utf8,
        "display_login": pl.Utf8,
        "gravatar_id": pl.Utf8,
        "url": pl.Utf8,
        "avatar_url": pl.Utf8,
    }
)

# Repository the event belongs to.
repo_dtype = pl.Struct(
    {
        "id": pl.Int32,
        "name": pl.Utf8,
        "url": pl.Utf8,
    }
)

# Author of a single commit.
commit_author_dtype = pl.Struct(
    {
        "email": pl.Utf8,
        "name": pl.Utf8,
    }
)

# One entry of the payload's `commits` list.
commit_dtype = pl.Struct(
    {
        "sha": pl.Utf8,
        "author": commit_author_dtype,
        "message": pl.Utf8,
        "distinct": pl.Boolean,
        "url": pl.Utf8,
    }
)

# Payload fields as they appear in the PushEvent sample record; only these
# fields are declared here.
# NOTE(review): `push_id` is an integer in the sample record but is declared
# as Utf8 here -- confirm that reading it as a string is intended.
payload_dtype = pl.Struct(
    {
        "push_id": pl.Utf8,
        "size": pl.Int32,
        "distinct_size": pl.Int32,
        "ref": pl.Utf8,
        "head": pl.Utf8,
        "before": pl.Utf8,
        "commits": pl.List(commit_dtype),
    }
)

# Organisation details.
org_dtype = pl.Struct(
    {
        "id": pl.Int32,
        "login": pl.Utf8,
        "gravatar_id": pl.Utf8,
        "url": pl.Utf8,
        "avatar_url": pl.Utf8,
    }
)

schema = {
    "id": pl.Utf8,
    "type": pl.Utf8,
    "actor": actor_dtype,
    "repo": repo_dtype,
    "payload": payload_dtype,
    "public": pl.Boolean,
    "created_at": pl.Utf8,  # kept as text at load time
    "org": org_dtype,
    # NOTE(review): no `other` key is visible in the sample record -- confirm
    # that this column is needed.
    "other": pl.Utf8,
}


In [18]:
# Build the DataFrame from the parsed event dicts, applying the explicit
# schema instead of letting Polars infer the column types.
df = pl.from_dicts(
    data_list,
    schema=schema,
)

In [19]:
# Parse the `created_at` strings (format "%Y-%m-%dT%H:%M:%SZ") into Datetime
# values, replacing the column in place, then remove the `public` column.
df = (
    df.with_columns(
        pl.col("created_at").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%SZ")
    )
    .drop("public")
)

# Display the transformed frame.
df

id,type,actor,repo,payload,created_at,org,other
str,str,struct[6],struct[3],struct[7],datetime[μs],struct[5],str
"""26163418658""","""PushEvent""","{119809980,""ehwu106"",""ehwu106"","""",""https://api.github.com/users/ehwu106"",""https://avatars.githubusercontent.com/u/119809980?""}","{582174284,""ehwu106/Gmail-Filter-Solution"",""https://api.github.com/repos/ehwu106/Gmail-Filter-Solution""}","{""12147229638"",2,2,""refs/heads/main"",""8fbcb0a5be7f1ae98c620ffc445f8212da279c4b"",""27e76fd2920c98cf825daefa9469cb202944d96d"",[{""01882b15808c6cc63f4075eea105de4f608e23aa"",{""howard.wu@travasecurity.com"",""Howard Wu""},""pushing"",true,""https://api.github.com/repos/ehwu106/Gmail-Filter-Solution/commits/01882b15808c6cc63f4075eea105de4f608e23aa""}, {""8fbcb0a5be7f1ae98c620ffc445f8212da279c4b"",{""hwu106@ucsc.edu"",""hwu106""},""push"",true,""https://api.github.com/repos/ehwu106/Gmail-Filter-Solution/commits/8fbcb0a5be7f1ae98c620ffc445f8212da279c4b""}]}",2023-01-01 00:00:00,"{null,null,null,null,null}",
"""26163418659""","""PushEvent""","{77187908,""Cefqrn"",""Cefqrn"","""",""https://api.github.com/users/Cefqrn"",""https://avatars.githubusercontent.com/u/77187908?""}","{583140987,""Cefqrn/pyxpr"",""https://api.github.com/repos/Cefqrn/pyxpr""}","{""12147229637"",2,2,""refs/heads/main"",""f64e5c366a20276fd1499cc485e131c08aeba5ee"",""367a5d47552f98e7acffe2f20b9a8c82f34f71a9"",[{""61696611ba5a9edd83c997d8a6cc477fa483c67b"",{""cefqrn@gmail.com"",""Cefqrn""},""made operator functions take pointers to `expression`s instead of `int`s"",true,""https://api.github.com/repos/Cefqrn/pyxpr/commits/61696611ba5a9edd83c997d8a6cc477fa483c67b""}, {""f64e5c366a20276fd1499cc485e131c08aeba5ee"",{""cefqrn@gmail.com"",""Cefqrn""},""moved `operator` back to its own files and made `expression`s generate their text on demand"",true,""https://api.github.com/repos/Cefqrn/pyxpr/commits/f64e5c366a20276fd1499cc485e131c08aeba5ee""}]}",2023-01-01 00:00:00,"{null,null,null,null,null}",
"""26163418660""","""IssuesEvent""","{121737278,""LaymooDR"",""LaymooDR"","""",""https://api.github.com/users/LaymooDR"",""https://avatars.githubusercontent.com/u/121737278?""}","{383940088,""ShadowMario/FNF-PsychEngine"",""https://api.github.com/repos/ShadowMario/FNF-PsychEngine""}","{null,null,null,null,null,null,null}",2023-01-01 00:00:00,"{null,null,null,null,null}",
"""26163418664""","""WatchEvent""","{89544871,""Aziz403"",""Aziz403"","""",""https://api.github.com/users/Aziz403"",""https://avatars.githubusercontent.com/u/89544871?""}","{2663796,""lexik/LexikTranslationBundle"",""https://api.github.com/repos/lexik/LexikTranslationBundle""}","{null,null,null,null,null,null,null}",2023-01-01 00:00:00,"{568486,""lexik"","""",""https://api.github.com/orgs/lexik"",""https://avatars.githubusercontent.com/u/568486?""}",
"""26163418665""","""PushEvent""","{8517910,""LombiqBot"",""LombiqBot"","""",""https://api.github.com/users/LombiqBot"",""https://avatars.githubusercontent.com/u/8517910?""}","{410004154,""Lombiq/TheBootstrapTheme"",""https://api.github.com/repos/Lombiq/TheBootstrapTheme""}","{""12147229641"",0,0,""refs/heads/master"",""1c0d357e00552ca5a53e2a94573fd9d6f73fcdf5"",""1c0d357e00552ca5a53e2a94573fd9d6f73fcdf5"",[]}",2023-01-01 00:00:00,"{8158177,""Lombiq"","""",""https://api.github.com/orgs/Lombiq"",""https://avatars.githubusercontent.com/u/8158177?""}",
"""26163418667""","""PushEvent""","{41898282,""github-actions[bot]"",""github-actions"","""",""https://api.github.com/users/github-actions[bot]"",""https://avatars.githubusercontent.com/u/41898282?""}","{250035045,""ZamulaK/COVID-19"",""https://api.github.com/repos/ZamulaK/COVID-19""}","{""12147229642"",1,1,""refs/heads/web-data"",""95bf957f5188641598151af8c35872686f18bd85"",""db49e743d0ea2ce8aa87ebfd2de9936853a1ba95"",[{""95bf957f5188641598151af8c35872686f18bd85"",{""jhusystems@gmail.com"",""CSSEGISandData""},""Automated hourly update"",true,""https://api.github.com/repos/ZamulaK/COVID-19/commits/95bf957f5188641598151af8c35872686f18bd85""}]}",2023-01-01 00:00:00,"{null,null,null,null,null}",
"""26163418668""","""PushEvent""","{118964436,""SS7SS"",""SS7SS"","""",""https://api.github.com/users/SS7SS"",""https://avatars.githubusercontent.com/u/118964436?""}","{583726411,""SS7SS/Quran_Linux"",""https://api.github.com/repos/SS7SS/Quran_Linux""}","{""12147229640"",1,1,""refs/heads/tepthon"",""2585e306dd64c4b529956550bd2e48467d211fd7"",""39b84f330971e62a6b4d7931f4cbbe045e55c3af"",[{""2585e306dd64c4b529956550bd2e48467d211fd7"",{""qppqmzmz7@gmail.com"",""Txn""},""Workflow : Loop 12/31/22-23:59:58pm"",true,""https://api.github.com/repos/SS7SS/Quran_Linux/commits/2585e306dd64c4b529956550bd2e48467d211fd7""}]}",2023-01-01 00:00:00,"{null,null,null,null,null}",
"""26163418676""","""CreateEvent""","{49699333,""dependabot[bot]"",""dependabot"","""",""https://api.github.com/users/dependabot[bot]"",""https://avatars.githubusercontent.com/u/49699333?""}","{441938516,""sonusathyadas/musician-app"",""https://api.github.com/repos/sonusathyadas/musician-app""}","{null,null,null,""dependabot/npm_and_yarn/client/json5-and-react-scripts-2.2.3"",null,null,null}",2023-01-01 00:00:00,"{null,null,null,null,null}",
"""26163418680""","""PushEvent""","{41898282,""github-actions[bot]"",""github-actions"","""",""https://api.github.com/users/github-actions[bot]"",""https://avatars.githubusercontent.com/u/41898282?""}","{376650499,""harperreed/crisis-proposal-bot"",""https://api.github.com/repos/harperreed/crisis-proposal-bot""}","{""12147229648"",1,1,""refs/heads/main"",""70762380033ed854f5ef924c6b0e5715c422fa15"",""0cdf9c77a14a605275637e31742cd9d1ddac5a29"",[{""70762380033ed854f5ef924c6b0e5715c422fa15"",{""readme-bot@example.com"",""README-bot""},""Updated state"",true,""https://api.github.com/repos/harperreed/crisis-proposal-bot/commits/70762380033ed854f5ef924c6b0e5715c422fa15""}]}",2023-01-01 00:00:00,"{null,null,null,null,null}",
"""26163418682""","""PushEvent""","{21151734,""drphil3d"",""drphil3d"","""",""https://api.github.com/users/drphil3d"",""https://avatars.githubusercontent.com/u/21151734?""}","{480103899,""drphil3d/uptime"",""https://api.github.com/repos/drphil3d/uptime""}","{""12147229646"",1,1,""refs/heads/master"",""5a451a425e984f59a8070490b8c335921c61a2a4"",""da65d89ba5ac5fd8879ae8e82c19affcecacc5b0"",[{""5a451a425e984f59a8070490b8c335921c61a2a4"",{""73812536+upptime-bot@users.noreply.github.com"",""Upptime Bot""},""🟥 NorCal Machine Works is down (502 in 1408 ms) [skip ci] [upptime]"",true,""https://api.github.com/repos/drphil3d/uptime/commits/5a451a425e984f59a8070490b8c335921c61a2a4""}]}",2023-01-01 00:00:00,"{null,null,null,null,null}",
