# GitHub to CSV converter
This notebook walks the commit history of the GitHub repository bike-sharing-history and converts the station data to CSV files, one file per station.

In [1]:
import git
import os
import pandas as pd
from datetime import datetime
import json

In [2]:

# Configuration

# Repository holding the snapshot history, and the local directory it is cloned into
REPO_URL = "https://github.com/MaxHalford/bike-sharing-history"
CLONE_DIR = "./bike_data/bike-sharing-history"

# City folder and snapshot file to read inside the repository;
# TARGET_CITY is also the directory the CSV files are written to
TARGET_CITY = "gothenburg"
FILE_NAME = "styr--stall.geojson"

# The history walk stops once this many datapoints have been counted
NUMBER_OF_DATAPOINTS = 6_000
# Buffered rows are written to disk every SAVE_INTERVAL datapoints
SAVE_INTERVAL = 20

In [3]:
# Commit the history walk starts from (the walk proceeds from here through its ancestors)
start_commit = "94099be29e4f35a08b91ecee3d8a2f09081ef076"

# Clone only when there is no local copy yet
already_cloned = os.path.exists(CLONE_DIR)
if not already_cloned:
    print("Cloning repository...")
    git.Repo.clone_from(REPO_URL, CLONE_DIR)

repo = git.Repo(CLONE_DIR)

# Put the working tree on the pinned start commit; force=True discards local modifications.
# Pulling the latest changes is currently disabled:
#repo.remotes.origin.pull()
repo.git.checkout(start_commit, force=True)

''

In [4]:
# Buffers that are filled while walking the history:
#   results:       station_id -> list of [num_bikes_available, datetime] rows
#   id_to_station: station_id -> station name (used to build the CSV filenames)
results = {}
id_to_station = {}

# Seed both dictionaries with the stations found in the currently checked-out snapshot
data_file = os.path.join(CLONE_DIR, "data", "stations", TARGET_CITY, FILE_NAME)
if os.path.exists(data_file):
    # Station names contain non-ASCII characters, so decode explicitly as UTF-8
    # instead of relying on the platform's default encoding.
    with open(data_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    for feature in data["features"]:
        properties = feature["properties"]
        station_id = properties["station_id"]
        results[station_id] = []
        id_to_station[station_id] = properties["name"]
else:
    # Without any registered station, every later row would be dropped silently.
    print(f"Station file not found: {data_file} - no stations registered")

In [5]:
# Check if the data is already there
def get_existing_data(data_file=None):
    """Inspect one station CSV to find out where a previous run stopped.

    Args:
        data_file: Path of the CSV to inspect. When omitted, the
            "28031717_Dalgångsgatan.csv" file inside the TARGET_CITY
            directory is used.

    Returns:
        A tuple ``(last_day_and_hour, number_of_rows, continuing)``:
        - last_day_and_hour: the smallest "datetime" value in the file parsed
          as a timezone-aware datetime, or the naive 1970-01-01 00:00:00
          when there is nothing to resume from.
        - number_of_rows: number of data rows in the file (0 if none).
        - continuing: True when the file exists and holds at least one row.
    """
    if data_file is None:
        data_file = TARGET_CITY + "/" + "28031717_Dalgångsgatan.csv"

    df = pd.read_csv(data_file) if os.path.exists(data_file) else None

    if df is not None and len(df) > 0:
        print("Data already exists")
        # The smallest timestamp is taken as the resume point.
        # NOTE(review): min() compares the raw strings, so this assumes every
        # row uses the same format and UTC offset - confirm.
        last_day_and_hour = df["datetime"].min()
        last_day_and_hour = datetime.strptime(last_day_and_hour, "%Y-%m-%d %H:%M:%S%z")
        number_of_rows = len(df)
        continuing = True
        print(f"Last date read: {last_day_and_hour}")
    else:
        if df is None:
            print("Data does not exist")
        else:
            # A header-only file has no timestamp to resume from; treat it
            # like a missing file instead of failing in strptime on NaN.
            print("Data file exists but has no rows")
        last_day_and_hour = datetime(1970, 1, 1, 0, 0, 0)
        number_of_rows = 0
        continuing = False
    return last_day_and_hour, number_of_rows, continuing

In [6]:
# Paths this kernel session has already written, so that repeated saves of a
# fresh (non-continuing) run extend the files instead of overwriting them.
_FILES_STARTED = set()


def save_data(results, id_to_station, continuing):
    """Write the buffered rows of every station to its CSV file and clear the buffers.

    Each station gets one file, "<TARGET_CITY>/<station_id>_<station name>.csv",
    with the columns "num_bikes_available" and "datetime".

    Args:
        results: dict mapping station_id to a list of
            [num_bikes_available, datetime] rows. Emptied in place.
        id_to_station: dict mapping station_id to the station name.
        continuing: True to append to files left by an earlier run. False
            starts each file from scratch on the first save of this session;
            later saves of the same session append, so no batch is lost.

    Returns:
        The same `results` dict, with every station's row list reset to [].
    """
    # to_csv does not create missing directories
    os.makedirs(TARGET_CITY, exist_ok=True)

    for station_id, data in results.items():
        df = pd.DataFrame(data, columns=["num_bikes_available", "datetime"])
        station = id_to_station[station_id]

        # station names may have slashes, which are not allowed in filenames
        filename = f"{station_id}_{station.replace('/', '_')}.csv"
        filename = TARGET_CITY + "/" + filename

        # Append only to a file that really exists; a file being created
        # always gets a header row, also when continuing.
        append = os.path.exists(filename) and (continuing or filename in _FILES_STARTED)
        df.to_csv(filename, mode="a" if append else "w", header=not append, index=False)
        _FILES_STARTED.add(filename)

    # clear the results
    for station_id in results:
        results[station_id] = []

    return results


## Iterate through the repo

Walk the commit history and save the data to a dict with the following structure:
```
{
    "station_id": [
        [num_bikes_available, datetime],
        ...
    ],
    ...
}
```

Processing one year of history takes about 2 hours.

In [7]:
start_time = datetime.now()

# Snapshot file to read at every visited commit, built from the configuration
data_file = os.path.join(CLONE_DIR, "data/stations/" + TARGET_CITY + "/" + FILE_NAME)

# Each pass of the outer loop walks the history once. A new pass is started
# only after a failed checkout; it resumes from what is already on disk.
finished = False
while not finished:
    seeking = True            # still skipping commits handled by an earlier run
    checkout_failed = False
    collected = 0             # snapshots read during this pass
    i = 0

    repo.git.checkout(start_commit, force=True)
    last_day_and_hour, number_of_rows, continuing = get_existing_data()
    # False only until the first save of a fresh run; afterwards rows must be
    # appended, otherwise every save would overwrite the previous batch.
    append_to_files = continuing

    for commit in repo.iter_commits():
        if continuing and seeking:
            # Skip forward to the commit whose timestamp was stored last
            if commit.committed_datetime == last_day_and_hour:
                seeking = False
                print(f"Continuing from last commit: {number_of_rows} - {commit.hexsha}, {commit.committed_datetime}")
                last_day_and_hour = commit.committed_datetime.replace(minute=0, second=0, microsecond=0)
                i = number_of_rows
            continue

        if i >= NUMBER_OF_DATAPOINTS:
            break

        # check if there has been a commit this hour
        day_and_hour = commit.committed_datetime.replace(minute=0, second=0, microsecond=0)
        if day_and_hour == last_day_and_hour:
            continue
        last_day_and_hour = day_and_hour
        i += 1

        print(f"{i} - Checking commit: {commit.hexsha}, {commit.committed_datetime}, time elapsed: {datetime.now() - start_time}")
        try:
            repo.git.checkout(commit.hexsha, force=True)
        except Exception as e:
            print("Error checking out commit: ", e)
            checkout_failed = True
            break

        if os.path.exists(data_file):
            # Station names are non-ASCII; decode explicitly as UTF-8
            with open(data_file, "r", encoding="utf-8") as f:
                data = json.load(f)
            for feature in data["features"]:
                properties = feature["properties"]
                try:
                    results[properties["station_id"]].append([properties["num_bikes_available"], commit.committed_datetime])
                except KeyError:
                    # station not registered in `results`, or a property is missing
                    continue
        collected += 1

        if i % SAVE_INTERVAL == 0:
            results = save_data(results, id_to_station, append_to_files)
            append_to_files = True
            print(f"Saved {i} datapoints")

    # Flush rows still buffered, whether the walk hit the datapoint limit,
    # ran out of commits, or stopped on a checkout error.
    if any(results.values()):
        results = save_data(results, id_to_station, append_to_files)

    if continuing and seeking:
        print("Resume point not found in the commit history; nothing was added")

    if checkout_failed and collected > 0:
        print("Restarting from the data saved so far")
    else:
        if checkout_failed:
            # A pass that made no progress would fail the same way again
            print("Checkout failed before any new datapoint was read; stopping")
        finished = True

Data already exists
Last date read: 2024-08-30 12:58:24+00:00
Continuing from last commit: 3000 - df068e82ec46cac52694459ecd99f5d1d072c51b, 2024-08-30 12:58:24+00:00
3001 - Checking commit: c5734e174cf95ce6fa58d20e73943259c8d9ea23, 2024-08-30 11:48:54+00:00, time elapsed: 0:00:04.245427
3002 - Checking commit: f34ae4c2f136d9e9b9de85e55cba09ff06a3d824, 2024-08-30 10:49:09+00:00, time elapsed: 0:00:06.126552
3003 - Checking commit: 05a97f1a58563282cb5fc5b96555f81158936339, 2024-08-30 09:49:09+00:00, time elapsed: 0:00:06.577740
3004 - Checking commit: 5c8d96db035c801ee3e0b8927e70df40a6415fd2, 2024-08-30 08:49:07+00:00, time elapsed: 0:00:07.024364
3005 - Checking commit: 4efdbdd35e0a59e595c2c659afa479374defdf33, 2024-08-30 07:49:03+00:00, time elapsed: 0:00:07.492788
3006 - Checking commit: 3ad053d2d40f8689b03bf3626a7e50c5bf32bde8, 2024-08-30 06:49:16+00:00, time elapsed: 0:00:07.954264
3007 - Checking commit: faaf42df47498352df9074d76dbf9296da51d9af, 2024-08-30 05:48:43+00:00, time elap

KeyboardInterrupt: 

## Store the data to a csv file
One file per station, where each row is a snapshot of the station's status at a given time.

In [17]:
# Flush any rows still buffered in `results`. save_data needs the station names
# and an append/overwrite flag: append when station files are already on disk.
_, _, files_exist = get_existing_data()
if any(results.values()):
    results = save_data(results, id_to_station, files_exist)