# GitHub to CSV converter
This notebook converts the GitHub dataset bike-sharing-history into CSV files, one file per station.

In [68]:
import git
import os
import pandas as pd
from datetime import datetime
import json

In [69]:

# Configuration

# Where the data comes from and where the local clone lives.
REPO_URL = "https://github.com/MaxHalford/bike-sharing-history"
CLONE_DIR = "./bike_data/bike-sharing-history"

# NOTE(review): not referenced by the cells visible here, which hard-code the
# "gothenburg" path segment -- confirm whether it is still needed.
TARGET_CITY = "Gothenburg"

# The extraction loop stops once its datapoint counter reaches this value.
NUMBER_OF_DATAPOINTS = 6_000
# The collected rows are written to disk every SAVE_INTERVAL datapoints.
SAVE_INTERVAL = 20

In [70]:
# Commit the working tree is pinned to; the extraction loop checks it out
# again before every pass over the history.
start_commit = "94099be29e4f35a08b91ecee3d8a2f09081ef076"

# Clone only when no local copy exists yet.
if not os.path.exists(CLONE_DIR):
    print("Cloning repository...")
    git.Repo.clone_from(REPO_URL, CLONE_DIR)

repo = git.Repo(CLONE_DIR)

# Pin the working tree to the starting commit (forced checkout). Nothing is
# pulled from the remote here, so the snapshot stays fixed.
repo.git.checkout(start_commit, force=True)

''

In [71]:
# Prepare the per-station accumulators:
#   results:       station_id -> list of [num_bikes_available, datetime] rows
#   id_to_station: station_id -> station name (used to build the CSV filenames)
results = {}
id_to_station = {}

# Populate both dictionaries with the stations present in the checked-out commit.
data_file = os.path.join(CLONE_DIR, "data/stations/gothenburg/styr--stall.geojson")
if os.path.exists(data_file):
    # Read explicitly as UTF-8: station names contain non-ASCII characters and
    # the platform default encoding is not UTF-8 everywhere.
    with open(data_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    for feature in data["features"]:
        properties = feature["properties"]
        station_id = properties["station_id"]
        results[station_id] = []
        id_to_station[station_id] = properties["name"]
else:
    # Make the otherwise silent "no stations registered" case visible.
    print(f"Station file not found: {data_file}")

In [72]:
# Check if the data is already there
def get_existing_data(data_file="clean_data/28031717_Dalgångsgatan.csv"):
    """Report how far a previous extraction run got.

    One station CSV is used as the reference for the whole data set.

    Args:
        data_file: Path of the reference CSV. It must have a ``datetime``
            column formatted as ``%Y-%m-%d %H:%M:%S%z``.

    Returns:
        A tuple ``(last_day_and_hour, number_of_rows, continuing)``:

        * ``last_day_and_hour`` -- the smallest ``datetime`` value in the file
          (the history is walked from newest to oldest, so the smallest value
          is the one written last), or the naive epoch ``1970-01-01`` when
          there is nothing to resume from.
        * ``number_of_rows`` -- number of rows in the file, 0 when there is
          nothing to resume from.
        * ``continuing`` -- True only when the file holds at least one usable
          timestamp.
    """
    nothing_to_resume = (datetime(1970, 1, 1, 0, 0, 0), 0, False)

    if not os.path.exists(data_file):
        print("Data does not exist")
        return nothing_to_resume

    try:
        df = pd.read_csv(data_file)
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header for pandas to parse.
        df = pd.DataFrame(columns=["num_bikes_available", "datetime"])

    # A header-only file would make min() return NaN and strptime() raise a
    # TypeError; treat it as "no data yet" so the caller starts from scratch.
    timestamps = df["datetime"].dropna()
    if timestamps.empty:
        print("Data file exists but contains no rows")
        return nothing_to_resume

    print("Data already exists")
    # check the last date
    last_day_and_hour = datetime.strptime(timestamps.min(), "%Y-%m-%d %H:%M:%S%z")
    print(f"Last date read: {last_day_and_hour}")
    return last_day_and_hour, len(df), True

In [73]:
def save_data(results, id_to_station, continuing):
    """Write the collected rows to one CSV per station and empty the buffers.

    Args:
        results: Dict mapping station_id to a list of
            ``[num_bikes_available, datetime]`` rows.
        id_to_station: Dict mapping station_id to the station name, which is
            used in the filename.
        continuing: When False, each station file is (re)written with a header
            row. When True, rows are appended to the existing file.

    Returns:
        The same ``results`` dict, with every station's list reset to ``[]``.

    Raises:
        KeyError: If a station_id in ``results`` is missing from
            ``id_to_station``.
    """
    output_dir = "clean_data"
    # to_csv does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)

    for station_id, data in results.items():
        df = pd.DataFrame(data, columns=["num_bikes_available", "datetime"])
        station = id_to_station[station_id]

        # station names may have slashes, which are not allowed in filenames
        filename = f"{station_id}_{station.replace('/', '_')}.csv"
        filename = output_dir + "/" + filename

        # Append only when there is a file to append to; a station that shows
        # up for the first time still gets a header row.
        appending = continuing and os.path.exists(filename)
        df.to_csv(
            filename,
            mode="a" if appending else "w",
            header=not appending,
            index=False,
        )

    # clear the results
    for station_id in results:
        results[station_id] = []

    return results


## Iterate through the repo
Walk the commit history and save the data to a dict with the following structure:
```
{
    "station_id": [
        [num_bikes_available, datetime],
        ...
    ],
    ...
}
```

Doing one year takes about 2h

In [None]:
# Walk the history from `start_commit`, keep the first commit encountered in
# every clock hour, and record each station's `num_bikes_available` at that
# commit. Rows are flushed to disk every SAVE_INTERVAL datapoints. A pass that
# is interrupted by a checkout error is retried from the last saved row.
start_time = datetime.now()

# Give up after this many checkout failures in a row instead of retrying forever.
MAX_CHECKOUT_RETRIES = 5

# Loop-invariant: the station file lives at the same path in every commit.
data_file = os.path.join(CLONE_DIR, "data/stations/gothenburg/styr--stall.geojson")

failed_checkouts = 0
finished = False

while not finished:
    seeking_resume_point = True
    checkout_failed = False
    i = 0

    repo.git.checkout(start_commit, force=True)
    last_day_and_hour, number_of_rows, continuing = get_existing_data()
    # `continuing` only says whether files existed when this pass started.
    # Once this pass has written them, later saves must append; otherwise every
    # save of a fresh run would overwrite the previous batch.
    append_to_files = continuing

    for commit in repo.iter_commits():
        if continuing and seeking_resume_point:
            # Skip everything up to and including the last commit already on disk.
            if commit.committed_datetime == last_day_and_hour:
                seeking_resume_point = False
                print(f"Continuing from last commit: {number_of_rows} - {commit.hexsha}, {commit.committed_datetime}")
                last_day_and_hour = commit.committed_datetime.replace(minute=0, second=0, microsecond=0)
                i = number_of_rows
                if i >= NUMBER_OF_DATAPOINTS:
                    # The target was already reached by an earlier run.
                    break
            continue

        # check if there has been a commit this hour
        day_and_hour = commit.committed_datetime.replace(minute=0, second=0, microsecond=0)
        if day_and_hour == last_day_and_hour:
            continue
        last_day_and_hour = day_and_hour
        i += 1

        print(f"{i} - Checking commit: {commit.hexsha}, {commit.committed_datetime}, time elapsed: {datetime.now() - start_time}")
        try:
            repo.git.checkout(commit.hexsha, force=True)
        except Exception as e:
            print("Error checking out commit: ", e)
            checkout_failed = True
            break
        failed_checkouts = 0

        if os.path.exists(data_file):
            # Station names are non-ASCII; do not depend on the platform default encoding.
            with open(data_file, "r", encoding="utf-8") as f:
                data = json.load(f)
            for feature in data["features"]:
                properties = feature["properties"]
                station_id = properties["station_id"]
                # A station may exist in this commit but not in `start_commit`;
                # register it on the fly instead of failing with a KeyError.
                id_to_station.setdefault(station_id, properties.get("name", str(station_id)))
                results.setdefault(station_id, []).append(
                    [properties["num_bikes_available"], commit.committed_datetime]
                )

        if i % SAVE_INTERVAL == 0 and any(results.values()):
            results = save_data(results, id_to_station, append_to_files)
            append_to_files = True
            print(f"Saved {i} datapoints")

        if i >= NUMBER_OF_DATAPOINTS:
            break

    # Flush whatever is still buffered, whichever way the pass ended. Skipped
    # when nothing is pending so existing files are never replaced by empty ones.
    if any(results.values()):
        results = save_data(results, id_to_station, append_to_files)

    if checkout_failed:
        failed_checkouts += 1
        if failed_checkouts >= MAX_CHECKOUT_RETRIES:
            print(f"Giving up after {failed_checkouts} consecutive checkout errors")
            finished = True
        # otherwise: start another pass, resuming from the rows saved above
    else:
        if continuing and seeking_resume_point:
            print(f"Could not find a commit dated {last_day_and_hour} to resume from; nothing was extracted")
        # Target reached or history exhausted: stop instead of looping forever.
        finished = True

Data already exists
Last date read: 2024-10-01 04:49:34+00:00
Continuing from last commit: 2240 - 61acaf944483d97a3041f3503d9057515508b7fa, 2024-10-01 04:49:34+00:00
2241 - Checking commit: 707d4404b46e48dac1e43fc30ec648c974a217a7, 2024-10-01 03:51:26+00:00, time elapsed: 0:00:01.043603
2242 - Checking commit: 5957af5f07e6b9752350beef720369da2fa0f645, 2024-10-01 02:52:45+00:00, time elapsed: 0:00:01.391947
2243 - Checking commit: cfa07682ef0838b1e0c3c4cd11369a2853cbbff6, 2024-10-01 01:33:52+00:00, time elapsed: 0:00:01.861725
2244 - Checking commit: 0f51c5c38654567cbd6315b2493ccf157bb84584, 2024-10-01 00:50:36+00:00, time elapsed: 0:00:02.288833
2245 - Checking commit: 27a543d16a63ac7e8e378e56e7bb7f079b1870b6, 2024-09-30 23:49:10+00:00, time elapsed: 0:00:02.716977
2246 - Checking commit: ad5c0fe1b5660fdab28b463f35c60ee2e24b59a5, 2024-09-30 22:49:17+00:00, time elapsed: 0:00:03.634121
2247 - Checking commit: b6a73b3916c1742771be813822129b4243754fc9, 2024-09-30 21:48:56+00:00, time elap

## Store the data to a csv file
One file per station, where each row is a snapshot of the station's status at a given time.

In [17]:
# Flush any rows still held in memory. save_data needs the station-name map
# and the append/overwrite flag as well; append only if data is already on disk.
results = save_data(results, id_to_station, get_existing_data()[2])