# This script draws a random sample from the main dataset. It contains the following:
## 1. Import libraries
## 2. Load and sample data

### 1. Import libraries

In [1]:
# Import libraries
import pandas as pd, numpy as np
from pandas.errors import DtypeWarning
import warnings


In [2]:
# Draw a reproducible random sample of 'citi_bike_clean_2022.csv' and save it
# as 'citibike_sample.csv'. The source file is read 200,000 rows at a time and
# each row is kept independently with probability `keep_fraction`.
import os  # used only for the size check at the end of this cell

# Hide dtype warnings
warnings.filterwarnings('ignore', category=DtypeWarning)

# Columns to keep for sample
usecols = [
    'ride_id','rideable_type','started_at','ended_at',
    'start_station_id','start_station_name','start_lat','start_lng',
    'end_station_id','end_station_name','end_lat','end_lng',
    'member_casual','date','avg_temp'
]

# Explicit dtypes (timestamps and dates are deliberately left as strings so
# they are written back out exactly as they were read)
dtype_map = {
    'ride_id': 'string',
    'rideable_type': 'category',
    'started_at': 'string',
    'ended_at': 'string',
    'start_station_id': 'string',
    'start_station_name': 'string',
    'end_station_id': 'string',
    'end_station_name': 'string',
    'start_lat': 'float64',
    'start_lng': 'float64',
    'end_lat': 'float64',
    'end_lng': 'float64',
    'member_casual': 'category',
    'date': 'string',
    'avg_temp': 'float64'
}

rng = np.random.default_rng(32)   # fixed seed -> reproducible sample
keep_fraction = 0.004   # about 0.4% of rows -> target ~24 MB
parts = []

# With `chunksize`, read_csv returns a TextFileReader. Using it as a context
# manager closes the underlying file handle even if a chunk raises midway.
with pd.read_csv(
        'citi_bike_clean_2022.csv',
        usecols=usecols,
        dtype=dtype_map,
        chunksize=200_000,
        low_memory=True) as reader:
    for chunk in reader:
        # One uniform draw in [0, 1) per row; keep rows whose draw is below
        # the threshold.
        mask = rng.random(len(chunk)) < keep_fraction
        parts.append(chunk[mask])

# Stitch the kept rows together with a fresh 0..n-1 index and write them out.
small = pd.concat(parts, ignore_index=True)
small.to_csv('citibike_sample.csv', index=False)

# quick size check (bytes -> MB)
print("Sample size:", round(os.path.getsize('citibike_sample.csv')/1024/1024, 2), "MB")


Sample size: 23.78 MB


In [3]:
# Display the sample's dimensions as (rows, columns)
small.shape

(117822, 15)

In [4]:
# Display the column labels of the sample
small.columns

Index(['ride_id', 'rideable_type', 'started_at', 'ended_at',
       'start_station_name', 'start_station_id', 'end_station_name',
       'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng',
       'member_casual', 'date', 'avg_temp'],
      dtype='object')