In [124]:
from math import ceil
from pathlib import Path
import pandas as pd
import numpy as np
from random import randint
import csv

In [None]:
dir_main = Path('/Users/neelagarwal/Desktop/CITIBike Data/')
dir_nyc = dir_main / 'NYC'
dir_jersey = dir_main / 'Jersey'

def iter_subfiles(parent: Path, year: int):
    dir_year = parent / f'{year}'

    if parent.stem == 'NYC':
        for dir_month in dir_year.iterdir():
            dir_month: Path
            if dir_month.is_dir():
                for csv in dir_month.iterdir():
                    csv: Path
                    if csv.is_file() and csv.suffix == '.csv':
                        yield csv.resolve()
                        
    elif parent.stem == 'Jersey':
        for csv in dir_year.iterdir():
            csv: Path
            if csv.is_file() and csv.suffix == '.csv':
                yield csv.resolve()


def csv_to_df(parent: Path, year: int, selected_file_num: int | None = None, just_headers: bool = False) -> pd.DataFrame:
    selected_file_num = 0 if selected_file_num is None else selected_file_num
    for num, file in enumerate(iter_subfiles(parent, year)):
        if num == selected_file_num:
            if just_headers:
                with open(file) as f:
                    csv_reader = csv.reader(f)
                    return next(csv_reader)
            elif not just_headers:
                return pd.read_csv(file, low_memory = False)

def generate_randint(start: int, stop: int):
    while True:
        yield randint(start, stop)

random_int = generate_randint(0, 2)

def get_all_columns(parent: Path, years: list[int]):
    all_cols = [csv_to_df(parent, year, next(random_int), just_headers = True) for year in years]
    return [list(cols) if cols else None for cols in all_cols]

In [139]:
years = list(range(2013, 2017)) + list(range(2023, 2026))
for col_set, year in zip(get_all_columns(dir_nyc, years), years):
    print(f'{year} ({len(col_set)}): {col_set}')

2013 (15): ['tripduration', 'starttime', 'stoptime', 'start station id', 'start station name', 'start station latitude', 'start station longitude', 'end station id', 'end station name', 'end station latitude', 'end station longitude', 'bikeid', 'usertype', 'birth year', 'gender']
2014 (15): ['tripduration', 'starttime', 'stoptime', 'start station id', 'start station name', 'start station latitude', 'start station longitude', 'end station id', 'end station name', 'end station latitude', 'end station longitude', 'bikeid', 'usertype', 'birth year', 'gender']
2015 (15): ['tripduration', 'starttime', 'stoptime', 'start station id', 'start station name', 'start station latitude', 'start station longitude', 'end station id', 'end station name', 'end station latitude', 'end station longitude', 'bikeid', 'usertype', 'birth year', 'gender']
2016 (15): ['tripduration', 'starttime', 'stoptime', 'start station id', 'start station name', 'start station latitude', 'start station longitude', 'end stat

In [140]:
years = [2016, 2023, 2024, 2025]
for i in zip(get_all_columns(dir_jersey, years), years):
    print(f'{year} ({len(col_set)}): {col_set}')

2025 (13): ['ride_id', 'rideable_type', 'started_at', 'ended_at', 'start_station_name', 'start_station_id', 'end_station_name', 'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng', 'member_casual']
2025 (13): ['ride_id', 'rideable_type', 'started_at', 'ended_at', 'start_station_name', 'start_station_id', 'end_station_name', 'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng', 'member_casual']
2025 (13): ['ride_id', 'rideable_type', 'started_at', 'ended_at', 'start_station_name', 'start_station_id', 'end_station_name', 'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng', 'member_casual']
2025 (13): ['ride_id', 'rideable_type', 'started_at', 'ended_at', 'start_station_name', 'start_station_id', 'end_station_name', 'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng', 'member_casual']


In [141]:
csv_to_df(dir_nyc, 2023, 1)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,6C49104FDFC0CF09,classic_bike,2023-07-19 17:59:05.597,2023-07-19 18:05:37.629,11 Ave & W 41 St,6726.01,W 34 St & Hudson Blvd E,6535.04,40.760301,-73.998842,40.755167,-74.000599,member
1,CFBF8CF54A038A38,classic_bike,2023-07-20 19:12:47.568,2023-07-20 19:15:34.685,11 Ave & W 41 St,6726.01,Hudson Blvd W & W 36 St,6611.07,40.760301,-73.998842,40.756765,-73.999714,member
2,065FB5CE9AAE5ECD,classic_bike,2023-07-02 04:27:50.142,2023-07-02 04:44:01.609,Bond St & Fulton St,4479.06,Butler St & Court St,4339.01,40.689622,-73.983043,40.684989,-73.994403,member
3,C7244641EDCCA867,classic_bike,2023-07-01 10:58:11.347,2023-07-01 11:30:48.091,Lafayette Ave & Classon Ave,4452.01,Washington Ave & Empire Blvd,3704.01,40.689004,-73.960239,40.663140,-73.960570,member
4,6F1250E24813C876,classic_bike,2023-07-26 12:00:09.008,2023-07-26 12:14:26.371,Sterling St & Bedford Ave,3665.06,Rugby Rd & Albemarle Rd,3188.04,40.662706,-73.956912,40.646720,-73.966370,member
...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,61E52C8D7AC91795,electric_bike,2023-07-05 07:47:58.682,2023-07-05 07:59:25.958,E 77 St & 1 Ave,7020.02,E 53 St & Madison Ave,6659.03,40.770772,-73.953415,40.759711,-73.974023,member
999996,9CF5F272AD48A15A,classic_bike,2023-07-13 23:15:51.553,2023-07-13 23:17:52.819,Henry St & Remsen St,4645.04,Schermerhorn St & Court St,4565.07,40.694010,-73.994651,40.691036,-73.992011,casual
999997,F2A6F8725084D60F,electric_bike,2023-07-03 20:38:21.987,2023-07-03 20:42:42.366,Henry St & Remsen St,4645.04,Henry St & Degraw St,4380.08,40.693833,-73.994775,40.684751,-73.999173,casual
999998,6447546AB31A7524,electric_bike,2023-07-23 23:53:41.357,2023-07-23 23:55:34.633,Henry St & Remsen St,4645.04,Schermerhorn St & Court St,4565.07,40.694010,-73.994651,40.691036,-73.992011,casual


In [None]:
df = pd.DataFrame()

chunk_size = int(1e5)
size = df.shape[0]
num_chunks = ceil(size / chunk_size)

df_chunks = []
for i in range(num_chunks):
    chunk = df.loc[i * size : (i+1) * size]
    df_chunks.append(chunk)

In [41]:
test_df = pd.DataFrame()

for df in df_chunks:
    test_df = pd.concat([test_df, df])

In [42]:
test_df

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,5511719F60A614AA,classic_bike,2023-07-10 20:07:52.959,2023-07-10 20:20:16.918,E 27 St & Park Ave S,6131.13,1 Ave & E 39 St,6303.01,40.742752,-73.984716,40.747140,-73.971130,casual
1,0747E080C54F7AAE,classic_bike,2023-07-17 15:30:30.216,2023-07-17 15:33:10.123,South St & Gouverneur Ln,4953.04,Maiden Ln & Pearl St,5065.10,40.703554,-74.006702,40.707065,-74.007319,member
2,1243D8C21B002936,classic_bike,2023-07-30 16:21:26.583,2023-07-30 16:38:17.095,South St & Gouverneur Ln,4953.04,St Marks Pl & 2 Ave,5669.10,40.703554,-74.006702,40.728419,-73.987140,member
3,82B9DB6B37D6104F,classic_bike,2023-07-31 17:22:49.480,2023-07-31 17:33:48.668,Forsyth St & Canal St,5270.07,Maiden Ln & Pearl St,5065.10,40.715815,-73.994224,40.707065,-74.007319,member
4,23E978F5180198F1,classic_bike,2023-07-11 18:25:52.841,2023-07-11 18:36:15.167,34 Ave & 45 St,6596.10,31 St & 23 Ave,7144.01,40.754795,-73.917014,40.774788,-73.912555,member
...,...,...,...,...,...,...,...,...,...,...,...,...,...
659576,832CA1B4855C70A5,electric_bike,2023-07-13 05:27:33.831,2023-07-13 05:54:28.704,Irving Ave & Harman St,4856.05,48 St & Broadway,6629.06,40.700916,-73.918112,40.755763,-73.912715,member
659577,8557BC1F7CCA1C6A,electric_bike,2023-07-11 19:06:35.962,2023-07-11 19:12:58.083,Madison St & Clinton St,5190.07,Canal St & Rutgers St,5303.08,40.712663,-73.987653,40.714275,-73.989900,member
659578,BE271830E3562BE2,electric_bike,2023-07-17 17:09:37.585,2023-07-17 17:15:17.400,N Moore St & Hudson St,5470.02,Liberty St & Broadway,5105.01,40.719961,-74.008443,40.709056,-74.010434,member
659579,90C7E9A95C19F0EA,electric_bike,2023-07-25 15:17:57.952,2023-07-25 15:21:48.394,N Moore St & Hudson St,5470.02,Liberty St & Broadway,5105.01,40.719931,-74.008355,40.709056,-74.010434,member
