# GitHub to CSV converter
This notebook walks the commit history of the GitHub repository `bike-sharing-history` and converts the station snapshots of one city into CSV files (one file per station).

In [1]:
import git
import os
import pandas as pd
from datetime import datetime
import json

In [2]:
# Configuration
# Remote repository that gets cloned; its commit history is the data source
REPO_URL = "https://github.com/MaxHalford/bike-sharing-history"
# Local directory the repository is cloned into
CLONE_DIR = "./bike_data/bike-sharing-history"
# City folder read under data/stations/ inside the clone; the CSV files are
# also written to a directory with this name
TARGET_CITY = "dublin"
# Name of the station snapshot file inside the city folder
FILE_NAME = "jcdecaux.geojson"

# Processing stops once this many samples (at most one commit per clock hour)
# have been counted
NUMBER_OF_DATAPOINTS = 20000
# Collected rows are flushed to the CSV files every SAVE_INTERVAL samples
SAVE_INTERVAL = 20

In [3]:
# Commit the history walk starts from; it is checked out here and again at the
# start of every pass of the main loop
start_commit = "247c23add0f238d4d289bf276faf70806b353ae4"
# Clone the repository (skipped when the clone directory already exists)
if not os.path.exists(CLONE_DIR):
    print("Cloning repository...")
    git.Repo.clone_from(REPO_URL, CLONE_DIR)
repo = git.Repo(CLONE_DIR)

# Force-checkout the pinned start commit (this does NOT move to main or pull)
repo.git.checkout(start_commit, force=True)
# NOTE(review): pulling is disabled, presumably so the run stays pinned to
# start_commit - confirm before re-enabling
#repo.remotes.origin.pull()

''

In [4]:
# Prepare the output containers.
# `results` maps a station name to a list of [num_bikes_available, datetime] rows.
results = {}
# Not used by any other visible cell; kept so existing references keep working.
id_to_station = {}

# Register one empty row list per station found in the snapshot that is
# currently checked out. Only stations registered here receive rows later on.
data_file = os.path.join(CLONE_DIR, "data/stations/" + TARGET_CITY + "/" + FILE_NAME)
if os.path.exists(data_file):
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding, so non-ASCII station names decode the same on every OS.
    with open(data_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    for feature in data["features"]:
        name = feature["properties"]["name"]
        results[name] = []
else:
    # Without this message an empty `results` would go unnoticed and every
    # sample collected later would be dropped.
    print(f"Station file not found: {data_file} - no stations registered")

In [5]:
# Check if the data is already there
def get_existing_data(station_name="AVONDALE ROAD", data_dir=None):
    """Inspect a previously written station CSV to decide where to resume.

    Parameters
    ----------
    station_name : str, optional
        Station whose CSV file serves as the progress marker.
    data_dir : str, optional
        Directory containing the station CSV files. Defaults to ``TARGET_CITY``.

    Returns
    -------
    tuple
        ``(last_day_and_hour, number_of_rows, continuing)`` where
        ``last_day_and_hour`` is the smallest timestamp stored in the file,
        ``number_of_rows`` is the number of stored rows and ``continuing`` is
        True when stored rows were found. When there is nothing to resume
        from, the tuple is ``(datetime(1970, 1, 1), 0, False)``.
    """
    if data_dir is None:
        data_dir = TARGET_CITY

    # Same file-name mangling as save_data: no slashes, no spaces
    filename = f"{station_name.replace('/', '_')}.csv".replace(" ", "_")
    data_file = os.path.join(data_dir, filename)

    df = None
    if os.path.exists(data_file):
        try:
            df = pd.read_csv(data_file)
        except pd.errors.EmptyDataError:
            # Zero-byte file: nothing was ever written to it
            df = None

    if df is not None and len(df) > 0:
        print("Data already exists")
        # The history is walked newest-first, so the smallest stored timestamp
        # belongs to the commit that was processed last.
        last_day_and_hour = df["datetime"].min()
        last_day_and_hour = datetime.strptime(last_day_and_hour, "%Y-%m-%d %H:%M:%S%z")
        number_of_rows = len(df)
        continuing = True
        print(f"Last date read: {last_day_and_hour}")
    else:
        if df is not None:
            # A header-only file has no timestamp to resume from
            print("Data file exists but contains no rows")
        # Naive sentinel: it never compares equal to a timezone-aware datetime
        last_day_and_hour = datetime(1970, 1, 1, 0, 0, 0)
        number_of_rows = 0
        continuing = False
        print("Data does not exist")
    return last_day_and_hour, number_of_rows, continuing

In [6]:
def save_data(results, continuing, output_dir=None):
    """Write the collected rows of every station to its CSV file.

    Parameters
    ----------
    results : dict
        Maps a station name to a list of ``[num_bikes_available, datetime]``
        rows.
    continuing : bool
        False overwrites each station file (header included); True appends
        the rows to the existing file.
    output_dir : str, optional
        Directory the CSV files are written to. Defaults to ``TARGET_CITY``.

    Returns
    -------
    dict
        The same ``results`` object, with every row list replaced by an empty
        list.
    """
    if output_dir is None:
        output_dir = TARGET_CITY
    # to_csv does not create missing directories, so make sure it exists
    os.makedirs(output_dir, exist_ok=True)

    for station_name, data in results.items():
        df = pd.DataFrame(data, columns=["num_bikes_available", "datetime"])

        # station names may have slashes, which are not allowed in filenames
        filename = f"{station_name.replace('/', '_')}.csv".replace(" ", "_")
        path = os.path.join(output_dir, filename)

        if continuing:
            # Write the header when appending to a missing or empty file,
            # otherwise the resulting CSV would have no column names.
            needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
            df.to_csv(path, mode="a", header=needs_header, index=False)
        else:
            df.to_csv(path, index=False)

    # clear the results
    for station_name in results:
        results[station_name] = []

    return results


## Iterate through the repo
Iterate through the commits and save the data to a dict with the following structure:
```
{
    "station_name": [
        [num_bikes_available, datetime],
        ...
    ],
    ...
}
```

Processing one year of history takes about 2 hours.

In [7]:
# Walk the history backwards from start_commit, keep at most one commit per
# clock hour, and flush the collected rows to the station CSV files.
#
# The outer loop allows a retry after a failed checkout: the rows gathered so
# far are saved and the next pass resumes from what get_existing_data() reports.
start_time = datetime.now()
loop_outer = True
last_failed_commit = None  # hexsha of the most recent commit whose checkout failed


while loop_outer:
    i = 0

    repo.git.checkout(start_commit, force=True)
    last_day_and_hour, number_of_rows, continuing = get_existing_data()

    # Only search for a resume point when stored data was found at the start
    # of this pass. `continuing` also becomes True after the first save of a
    # fresh run, and must not re-trigger the search at that point.
    loop_inner = continuing

    for commit in repo.iter_commits():
        if loop_inner:
            # Skip commits until the one whose timestamp was stored last
            if commit.committed_datetime == last_day_and_hour:
                loop_inner = False
                print(f"Continuing from last commit: {number_of_rows} - {commit.hexsha}, {commit.committed_datetime}")
                last_day_and_hour = commit.committed_datetime.replace(minute=0, second=0, microsecond=0)
                i = number_of_rows
            continue

        # check if there has been a commit this hour
        day_and_hour = commit.committed_datetime.replace(minute=0, second=0, microsecond=0)
        if day_and_hour == last_day_and_hour:
            continue
        last_day_and_hour = day_and_hour
        i += 1

        print(f"{i} - Checking commit: {commit.hexsha}, {commit.committed_datetime}, time elapsed: {datetime.now() - start_time}")
        try:
            repo.git.checkout(commit.hexsha, force=True)
        except Exception as e:
            print("Error checking out commit: ", e)
            # Keep what was collected so far, then restart from the saved state
            results = save_data(results, continuing)
            if commit.hexsha == last_failed_commit:
                # Same commit failed on two consecutive attempts: stop instead
                # of retrying forever
                print(f"Checkout of {commit.hexsha} failed again, giving up")
                loop_outer = False
            last_failed_commit = commit.hexsha
            break

        data_file = os.path.join(CLONE_DIR, "data/stations/" + TARGET_CITY + "/" + FILE_NAME)
        if os.path.exists(data_file):
            # Read as UTF-8 explicitly rather than the platform default encoding
            with open(data_file, "r", encoding="utf-8") as f:
                data = json.load(f)

            for feature in data["features"]:
                try:
                    results[feature["properties"]["name"]].append([feature["properties"]["available_bikes"], commit.committed_datetime])
                except KeyError:
                    # Station not registered in `results`, or the snapshot
                    # lacks one of the expected keys: skip this feature
                    continue

        if i % SAVE_INTERVAL == 0:
            results = save_data(results, continuing)
            continuing = True  # later saves must append instead of overwrite
            print(f"Saved data at {i} commits")

        if i >= NUMBER_OF_DATAPOINTS:
            results = save_data(results, continuing)
            loop_outer = False
            break
    else:
        # The history was exhausted without hitting a break. Flush the rows
        # not yet written and stop, otherwise the outer loop never ends.
        if loop_inner:
            print("Could not find the commit to resume from in the history")
        else:
            print(f"Reached the end of the history after {i} commits")
        results = save_data(results, continuing)
        loop_outer = False

Data does not exist
1 - Checking commit: 247c23add0f238d4d289bf276faf70806b353ae4, 2025-01-06 15:06:11+00:00, time elapsed: 0:00:00.284699
2 - Checking commit: 8a67406adc70ce4a19cf7c18b85f7543a9b78607, 2025-01-06 14:48:56+00:00, time elapsed: 0:00:00.369440
3 - Checking commit: 8f5f882f0bd6d9ca1384cf2e025f6da7a3a98e9d, 2025-01-06 13:49:04+00:00, time elapsed: 0:00:00.629430
4 - Checking commit: 5603e0c16ea588da39e7a213c174ce6e6dae8337, 2025-01-06 12:46:44+00:00, time elapsed: 0:00:00.955922
5 - Checking commit: 1f62bd71c271004268c3a158c4e6c9d8de04e4bd, 2025-01-06 11:49:15+00:00, time elapsed: 0:00:01.368806
6 - Checking commit: 32df33ebc776d7d417289de3562f8cdf117223b7, 2025-01-06 10:49:17+00:00, time elapsed: 0:00:01.738643
7 - Checking commit: 1f47d20a5cb5371083475732ebefcecd6b56a51f, 2025-01-06 09:49:04+00:00, time elapsed: 0:00:02.061768
8 - Checking commit: b9164118793880850da4aa62c56e36790b873cf0, 2025-01-06 08:49:58+00:00, time elapsed: 0:00:02.454781
9 - Checking commit: a2293b2

KeyboardInterrupt: 