# GitHub to CSV converter
This notebook converts the GitHub dataset bike-sharing-history into CSV files, one per station.

In [97]:
import git
import os
import pandas as pd
from datetime import datetime
import json

In [98]:

# Configuration
# Upstream repository and the local directory it is cloned into.
REPO_URL = "https://github.com/MaxHalford/bike-sharing-history"
CLONE_DIR = "./bike_data/bike-sharing-history"

# City sub-folder and snapshot file read from each commit
# (data/stations/<TARGET_CITY>/<FILE_NAME> inside the clone).
TARGET_CITY = "dublin"
FILE_NAME = "jcdecaux.geojson"

# Stop once this many commits have been exported; flush the buffered rows
# to disk every SAVE_INTERVAL exported commits.
NUMBER_OF_DATAPOINTS = 1_000
SAVE_INTERVAL = 20

In [99]:
# Commit the working tree is checked out at before the history is walked.
start_commit = "94099be29e4f35a08b91ecee3d8a2f09081ef076"
# Clone the repository on first use; later runs reuse the existing directory.
if not os.path.exists(CLONE_DIR):
    print("Cloning repository...")
    git.Repo.clone_from(REPO_URL, CLONE_DIR)
repo = git.Repo(CLONE_DIR)

# Check out the pinned start commit; force=True throws away local modifications.
repo.git.checkout(start_commit, force=True)
# NOTE(review): pulling is disabled, so an existing clone is never refreshed — confirm this is intended.
#repo.remotes.origin.pull()

''

In [100]:
# Prepare the in-memory buffers: one (initially empty) list of rows per station.
results = {}
id_to_station = {}

# Seed the buffers with every station present in the currently checked-out snapshot.
# Stations that are not seeded here are skipped later when rows are appended by name.
data_file = os.path.join(CLONE_DIR, "data/stations/" + TARGET_CITY + "/" + FILE_NAME)
if os.path.exists(data_file):
    # GeoJSON is UTF-8; being explicit avoids mangled station names on platforms
    # whose default text encoding is something else.
    with open(data_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    for feature in data["features"]:
        name = feature["properties"]["name"]
        results[name] = []
else:
    # Without the snapshot there are no stations, so nothing would ever be exported.
    print(f"Warning: station file not found: {data_file}")

In [101]:
def get_existing_data():
    """Check whether a previous run already exported data and where it stopped.

    The CSV of one reference station (``AVONDALE ROAD``) inside the
    ``TARGET_CITY`` directory is inspected on behalf of the whole export.

    Returns:
        tuple: ``(last_day_and_hour, number_of_rows, continuing)`` where

        * ``last_day_and_hour`` is the smallest timestamp in the file parsed as
          a timezone-aware ``datetime``, or the naive ``datetime(1970, 1, 1)``
          when there is nothing to resume from;
        * ``number_of_rows`` is the number of rows already stored (0 if none);
        * ``continuing`` is ``True`` only when stored rows were found.
    """
    # NOTE(review): the reference station is specific to the Dublin data set;
    # a different TARGET_CITY will always look like a fresh export.
    filename = "AVONDALE ROAD.csv"
    data_file = TARGET_CITY + "/" + filename.replace(" ", "_")

    if os.path.exists(data_file):
        try:
            df = pd.read_csv(data_file)
        except pd.errors.EmptyDataError:
            # A zero-byte file holds no rows; treat it like a missing file.
            df = pd.DataFrame()

        # A header-only file has no timestamp to resume from; min() would
        # yield NaN and strptime() would raise, so fall through to "no data".
        if not df.empty:
            print("Data already exists")
            # The history is walked from newest to oldest commit, so the
            # smallest stored timestamp is the last one that was exported.
            last_day_and_hour = df["datetime"].min()
            last_day_and_hour = datetime.strptime(last_day_and_hour, "%Y-%m-%d %H:%M:%S%z")
            number_of_rows = len(df)
            print(f"Last date read: {last_day_and_hour}")
            return last_day_and_hour, number_of_rows, True

    print("Data does not exist")
    return datetime(1970, 1, 1, 0, 0, 0), 0, False

In [102]:
def save_data(results, continuing):
    """Write the buffered rows of every station to its CSV file and empty the buffers.

    Each station is stored in ``<TARGET_CITY>/<station name>.csv``; slashes and
    spaces in the station name are replaced by underscores.

    Args:
        results: dict mapping a station name to the list of
            ``[num_bikes_available, datetime]`` rows collected since the last save.
        continuing: ``False`` to start the files from scratch (overwrite, with a
            header row); ``True`` to append the rows to the existing files.

    Returns:
        The same ``results`` dict, with every station's row list reset to ``[]``.
    """
    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(TARGET_CITY, exist_ok=True)

    for station_name, data in results.items():
        df = pd.DataFrame(data, columns=["num_bikes_available", "datetime"])

        # station names may have slashes, which are not allowed in filenames
        filename = f"{station_name.replace('/', '_')}.csv"
        filename = TARGET_CITY + "/" + filename.replace(" ", "_")

        if continuing:
            # Appending normally skips the header, but a file that does not
            # exist yet still needs one to stay readable by column name.
            write_header = not os.path.exists(filename)
            df.to_csv(filename, mode='a', header=write_header, index=False)
        else:
            df.to_csv(filename, index=False)

    # clear the results
    for station_name in results:
        results[station_name] = []

    return results


## Iterate through the repo
Walk the commit history and save the data to a dict with the following structure:
```
{
    "station_name": [
        [num_bikes_available, datetime],
        ...
    ],
    ...
}
```

Doing one year takes about 2h

In [None]:
# Walk the commit history and export one snapshot per hour for every station.
# The outer loop restarts the walk (resuming from the CSV files on disk) after a
# failed checkout; it ends once NUMBER_OF_DATAPOINTS commits were exported or
# the history is exhausted.
start_time = datetime.now()
loop_outer = True

# The snapshot file sits at the same path in every commit.
data_file = os.path.join(CLONE_DIR, "data/stations/" + TARGET_CITY + "/" + FILE_NAME)

while loop_outer:
    i = 0

    repo.git.checkout(start_commit, force=True)
    last_day_and_hour, number_of_rows, continuing = get_existing_data()

    # True while fast-forwarding past commits that are already in the CSV files.
    # Seeded once from `continuing`: that flag later flips to True after the first
    # save (to switch save_data to append mode) and must not restart the skipping.
    loop_inner = continuing

    for commit in repo.iter_commits():
        if loop_inner:
            if commit.committed_datetime == last_day_and_hour:
                loop_inner = False
                print(f"Continuing from last commit: {number_of_rows} - {commit.hexsha}, {commit.committed_datetime}")
                last_day_and_hour = commit.committed_datetime.replace(minute=0, second=0, microsecond=0)
                i = number_of_rows
            # Both the resume commit itself and everything before it are already exported.
            continue

        # check if there has been a commit this hour
        day_and_hour = commit.committed_datetime.replace(minute=0, second=0, microsecond=0)
        if day_and_hour == last_day_and_hour:
            continue
        last_day_and_hour = day_and_hour
        i += 1

        print(f"{i} - Checking commit: {commit.hexsha}, {commit.committed_datetime}, time elapsed: {datetime.now() - start_time}")
        try:
            repo.git.checkout(commit.hexsha, force=True)
        except Exception as e:
            # Keep what was collected so far, then restart the walk from the files on disk.
            print("Error checking out commit: ", e)
            results = save_data(results, continuing)
            break

        if os.path.exists(data_file):
            # GeoJSON is UTF-8; do not depend on the platform default encoding.
            with open(data_file, "r", encoding="utf-8") as f:
                data = json.load(f)

            for feature in data["features"]:
                try:
                    results[feature["properties"]["name"]].append([feature["properties"]["available_bikes"], commit.committed_datetime])
                except KeyError:
                    # Best effort: skip stations that were not seeded into `results`
                    # or whose properties lack the expected keys.
                    continue

        if i % SAVE_INTERVAL == 0:
            results = save_data(results, continuing)
            continuing = True
            print(f"Saved data at {i} commits")

        if i >= NUMBER_OF_DATAPOINTS:
            results = save_data(results, continuing)
            loop_outer = False
            break
    else:
        # The history ran out without a break: flush what is buffered and stop,
        # otherwise the outer loop would re-walk the same commits forever.
        if loop_inner:
            print("Resume point not found in the commit history; stopping")
        else:
            print(f"Reached the end of the commit history after {i} commits")
        results = save_data(results, continuing)
        loop_outer = False

Data already exists
Last date read: 2024-12-01 00:31:45+00:00
Continuing from last commit: 780 - f81fd9c89c2e09129372f4e478b7423fd9d03a0f, 2024-12-01 00:31:45+00:00
781 - Checking commit: 9e30d7b18e67dfa56bd1074851edcfdb6a136437, 2024-11-30 23:48:58+00:00, time elapsed: 0:00:01.308267
782 - Checking commit: 313241a73b3c38d39a04c2a04fe8b56b2aa09d07, 2024-11-30 22:49:13+00:00, time elapsed: 0:00:01.678511
783 - Checking commit: 6f2e34c1ed896674e3af7caf8176b1bfcd6a0a4f, 2024-11-30 21:49:14+00:00, time elapsed: 0:00:02.146517
784 - Checking commit: bc49d2b75a952adcfbbed833b71b668208e91940, 2024-11-30 20:49:09+00:00, time elapsed: 0:00:02.596180
785 - Checking commit: 7bb95c9ec4d1c1768c93558386cddc17d79c9d92, 2024-11-30 19:48:52+00:00, time elapsed: 0:00:03.010051
786 - Checking commit: 525fabcff5c2203e4727b26fe5bc8ce87a61cec3, 2024-11-30 18:50:32+00:00, time elapsed: 0:00:03.417214
787 - Checking commit: aa39d534d40edaeb8dab9a5af412bcfbec906b4b, 2024-11-30 17:49:08+00:00, time elapsed: 0:0