# GitHub to CSV converter
This notebook converts the commit history of the GitHub repository bike-sharing-history into CSV files, one per station.

In [60]:
import git
import os
import pandas as pd
from datetime import datetime, timezone
import json

In [61]:
# Configuration

# Where the snapshots come from and where the local clone is kept.
REPO_URL = "https://github.com/MaxHalford/bike-sharing-history"
CLONE_DIR = "./bike_data/bike-sharing-history"

# Snapshot read from each commit: data/stations/<TARGET_CITY>/<FILE_NAME>.
# TARGET_CITY is also the folder the per-station CSV files are written to.
TARGET_CITY = "dublin"
FILE_NAME = "jcdecaux.geojson"

# Run limits.
NUMBER_OF_DATAPOINTS = 20_000  # stop once this many commits have been sampled
SAVE_INTERVAL = 20  # flush the buffered rows to disk every N sampled commits

# The walk stops at the first commit older than this instant (UTC).
END_DATE = datetime(2023, 8, 7, tzinfo=timezone.utc)
END_DATE

datetime.datetime(2023, 8, 7, 0, 0, tzinfo=datetime.timezone.utc)

In [62]:
# Ref that every walk through the history starts from.
start_commit = "main"

# Clone only when no local copy exists yet; later runs reuse the clone.
clone_missing = not os.path.exists(CLONE_DIR)
if clone_missing:
    print("Cloning repository...")
    git.Repo.clone_from(REPO_URL, CLONE_DIR)

# Handle on the local clone, used by the cells below.
repo = git.Repo(CLONE_DIR)

# Force-checkout the starting ref. Pulling the latest changes is currently
# disabled (left commented out below).
repo.git.checkout(start_commit, force=True)
#repo.remotes.origin.pull()

"Your branch is up to date with 'origin/main'."

In [63]:
# Prepare the output buffers.
# `results` maps station name -> list of [num_bikes_available, datetime] rows.
results = {}
id_to_station = {}  # not populated in this cell

# Register one empty buffer per station listed in the snapshot that is
# currently checked out in the clone.
data_file = os.path.join(CLONE_DIR, "data/stations/" + TARGET_CITY + "/" + FILE_NAME)
if os.path.exists(data_file):
    # Read as UTF-8 explicitly: open() otherwise falls back to the platform's
    # locale encoding, which can mis-decode non-ASCII station names.
    with open(data_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    for feature in data["features"]:
        name = feature["properties"]["name"]
        results[name] = []
else:
    # Without any registered station the collection loop would record nothing,
    # so make the situation visible instead of failing silently.
    print(f"Station file not found: {data_file} - no stations registered")

In [64]:
# Check if the data is already there
def get_existing_data(filename="AVONDALE ROAD.csv"):
    """Inspect one station CSV from a previous run to decide where to resume.

    The file is looked up at ``TARGET_CITY/<filename>`` after replacing
    slashes and spaces with underscores, the same naming scheme ``save_data``
    uses when writing.

    Args:
        filename: CSV name of the station used as the probe, e.g.
            ``"AVONDALE ROAD.csv"``. Pass another station's file name when
            working with a different city.

    Returns:
        A tuple ``(last_day_and_hour, number_of_rows, continuing)``:

        * ``last_day_and_hour`` - smallest timestamp found in the file, or a
          naive ``datetime(1970, 1, 1)`` placeholder when there is no usable
          data. Commits are processed newest first, so the smallest timestamp
          is the last one that was written.
        * ``number_of_rows`` - number of data rows in the file (0 if none).
        * ``continuing`` - True when rows exist and the run should append.
    """
    data_file = TARGET_CITY + "/" + filename.replace("/", "_").replace(" ", "_")

    df = pd.read_csv(data_file) if os.path.exists(data_file) else None

    # A header-only file holds no timestamp to resume from; treat it like a
    # missing file rather than passing NaN to strptime.
    if df is not None and len(df) > 0:
        print("Data already exists")
        # check the last date
        last_day_and_hour = df["datetime"].min()
        last_day_and_hour = datetime.strptime(last_day_and_hour, "%Y-%m-%d %H:%M:%S%z")
        number_of_rows = len(df)
        continuing = True
        print(f"Last date read: {last_day_and_hour}")
    else:
        # Naive placeholder: it never compares equal to the timezone-aware
        # commit timestamps, so no commit is mistaken for "already seen".
        last_day_and_hour = datetime(1970, 1, 1, 0, 0, 0)
        number_of_rows = 0
        continuing = False
        print("Data does not exist")
    return last_day_and_hour, number_of_rows, continuing

In [65]:
def save_data(results, continuing):
    """Write every station's buffered rows to its CSV file, then empty the buffers.

    Each station is stored in ``TARGET_CITY/<station name>.csv`` with slashes
    and spaces in the name replaced by underscores.

    Args:
        results: Dict mapping station name to a list of
            ``[num_bikes_available, datetime]`` rows.
        continuing: False to (over)write the files including the header row;
            True to append the rows to the existing files.

    Returns:
        The same ``results`` dict with every station's list reset to ``[]``.
    """
    # to_csv does not create missing folders, so make sure the target exists.
    os.makedirs(TARGET_CITY, exist_ok=True)

    for station_name, data in results.items():
        df = pd.DataFrame(data, columns=["num_bikes_available", "datetime"])

        # station names may have slashes, which are not allowed in filenames
        filename = f"{station_name.replace('/', '_')}.csv"
        filename = TARGET_CITY + "/" + filename.replace(" ", "_")

        if continuing:
            # Append to the existing file. A station without a file yet still
            # needs its header row, otherwise the CSV cannot be read back.
            write_header = not os.path.exists(filename)
            df.to_csv(filename, mode='a', header=write_header, index=False)
        else:
            df.to_csv(filename, index=False)

    # clear the results
    for station_name in results:
        results[station_name] = []

    return results


## Iterate through the repo
Walk through the commit history and collect the data in a dict with the following structure:
```
{
    "station_name": [
        [num_bikes_available, datetime],
        ...
    ],
    ...
}
```

Processing one year of history takes about 2 hours.

In [66]:
start_time = datetime.now()
loop_outer = True

# Each pass of the outer loop walks the history from `start_commit` (newest
# commit first), keeping one snapshot per clock hour. A further pass only
# happens after a failed checkout: the buffered rows are saved and the walk
# resumes from what is already on disk.
while loop_outer:
    loop_inner = True  # True while still skipping ahead to the resume commit
    i = 0

    repo.git.checkout(start_commit, force=True)
    last_day_and_hour, number_of_rows, continuing = get_existing_data()

    for commit in repo.iter_commits():
        if commit.committed_datetime < END_DATE:
            print(commit.committed_datetime, END_DATE, commit.committed_datetime < END_DATE)
            print("Reached end date")
            results = save_data(results, continuing)
            loop_outer = False
            break

        if continuing and loop_inner:
            # Skip commits that were already exported; the commit whose
            # timestamp equals the last saved one marks the resume point.
            if commit.committed_datetime == last_day_and_hour:
                loop_inner = False
                print(f"Continuing from last commit: {number_of_rows} - {commit.hexsha}, {commit.committed_datetime}")
                last_day_and_hour = commit.committed_datetime.replace(minute=0, second=0, microsecond=0)
                i = number_of_rows
            continue

        # check if there has been a commit this hour
        day_and_hour = commit.committed_datetime.replace(minute=0, second=0, microsecond=0)
        if day_and_hour == last_day_and_hour:
            continue
        last_day_and_hour = day_and_hour
        i += 1

        print(f"{i} - Checking commit: {commit.hexsha}, {commit.committed_datetime}, time elapsed: {datetime.now() - start_time}")
        try:
            repo.git.checkout(commit.hexsha, force=True)
        except Exception as e:
            print("Error checking out commit: ", e)
            # Keep what was collected; the outer loop then starts a new pass.
            results = save_data(results, continuing)
            break

        data_file = os.path.join(CLONE_DIR, "data/stations/" + TARGET_CITY + "/" + FILE_NAME)
        if os.path.exists(data_file):
            # Read as UTF-8 explicitly so decoding does not depend on the
            # platform's locale encoding.
            with open(data_file, "r", encoding="utf-8") as f:
                data = json.load(f)

            for feature in data["features"]:
                try:
                    results[feature["properties"]["name"]].append([feature["properties"]["available_bikes"], commit.committed_datetime])
                except KeyError:
                    # Station not registered in `results`, or the snapshot
                    # lacks one of the expected properties: skip it.
                    continue

        if i % SAVE_INTERVAL == 0:
            results = save_data(results, continuing)
            continuing = True
            print(f"Saved data at {i} commits")

        if i >= NUMBER_OF_DATAPOINTS:
            results = save_data(results, continuing)
            loop_outer = False
            break
    else:
        # The history was exhausted without hitting END_DATE or the datapoint
        # limit. Save what is buffered and stop; otherwise the outer loop
        # would walk the same commits again forever.
        if continuing and loop_inner:
            print("Resume commit not found in the history; nothing new was read")
        else:
            print("Reached the oldest commit")
        results = save_data(results, continuing)
        loop_outer = False

Data already exists
Last date read: 2025-01-07 19:49:06+00:00
Continuing from last commit: 20 - 8e3ca6582fbe92b31705d047bd4c970e08def936, 2025-01-07 19:49:06+00:00
21 - Checking commit: 45efef53d58decb7603aff6dfbed2809b99c765d, 2025-01-07 18:49:35+00:00, time elapsed: 0:00:00.689095
22 - Checking commit: ee2cd20c0654ad505f6381413fc6129cedb2b56d, 2025-01-07 17:49:03+00:00, time elapsed: 0:00:01.045292
23 - Checking commit: 14f47fa993400d64943ba26771053b30112444f4, 2025-01-07 16:56:13+00:00, time elapsed: 0:00:01.379827
24 - Checking commit: 5a748e85fe82ef6bd73594b6016243a16a39bc0b, 2025-01-07 15:38:37+00:00, time elapsed: 0:00:01.806963
25 - Checking commit: dc4bd316c0fcde09a6118d5a56795f88a31b5eb0, 2025-01-07 14:15:43+00:00, time elapsed: 0:00:02.133343
26 - Checking commit: ff4c2edb75445720e19ec9926c6d20bac286f260, 2025-01-07 13:25:27+00:00, time elapsed: 0:00:02.526688
27 - Checking commit: 0766e6d35df274c6ac143c13e8a990bf33afcb01, 2025-01-07 12:46:54+00:00, time elapsed: 0:00:02.854