# Download Chicago Taxi Trips

## Install Google CLI

```bash
# install the client
curl -O https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-linux-x86_64.tar.gz
tar -xf google-cloud-cli-linux-x86_64.tar.gz
./google-cloud-sdk/install.sh

# authenticate
./google-cloud-sdk/bin/gcloud init
./google-cloud-sdk/bin/gcloud auth application-default login
```

In [1]:
pip install --upgrade google-cloud-bigquery

Note: you may need to restart the kernel to use updated packages.


In [2]:
import time
from datetime import datetime

import pandas as pd
from google.cloud import bigquery

## 0. BigQuery Helper

In [3]:
class BigQueryHelper(object):
    """
    Helper class to simplify common BigQuery tasks like executing queries,
    showing table schemas, etc without worrying about table or dataset pointers.

    See the BigQuery docs for details of the steps this class lets you skip:
    https://googlecloudplatform.github.io/google-cloud-python/latest/bigquery/reference.html
    """

    def __init__(self, active_project, dataset_name, max_wait_seconds=180):
        self.project_name = active_project
        self.dataset_name = dataset_name
        self.max_wait_seconds = max_wait_seconds
        self.client = bigquery.Client(active_project)
        self.__dataset_ref = self.client.dataset(self.dataset_name, project=self.project_name)
        self.dataset = None
        self.tables = dict()  # {table name (str): table object}
        self.__table_refs = dict()  # {table name (str): table reference}
        self.total_gb_used_net_cache = 0
        self.BYTES_PER_GB = 2**30

    def __fetch_dataset(self):
        """
        Lazy loading of dataset. For example,
        if the user only calls `self.query_to_pandas` then the
        dataset never has to be fetched.
        """
        if self.dataset is None:
            self.dataset = self.client.get_dataset(self.__dataset_ref)

    def __fetch_table(self, table_name):
        """
        Lazy loading of table
        """
        self.__fetch_dataset()
        if table_name not in self.__table_refs:
            self.__table_refs[table_name] = self.dataset.table(table_name)
        if table_name not in self.tables:
            self.tables[table_name] = self.client.get_table(self.__table_refs[table_name])

    def __handle_record_field(self, row, schema_details, top_level_name=''):
        """
        Unpack a single row, including any nested fields.
        """
        name = row['name']
        if top_level_name != '':
            name = top_level_name + '.' + name
        schema_details.append([{
            'name': name,
            'type': row['type'],
            'mode': row['mode'],
            'fields': pd.np.nan,
            'description': row['description']
                               }])
        # float check is to dodge row['fields'] == np.nan
        if type(row.get('fields', 0.0)) == float:
            return None
        for entry in row['fields']:
            self.__handle_record_field(entry, schema_details, name)

    def __unpack_all_schema_fields(self, schema):
        """
        Unrolls nested schemas. Returns dataframe with one row per field,
        and the field names in the format accepted by the API.
        Results will look similar to the website schema, such as:
            https://bigquery.cloud.google.com/table/bigquery-public-data:github_repos.commits?pli=1

        Args:
            schema: DataFrame derived from api repr of raw table.schema
        Returns:
            Dataframe of the unrolled schema.
        """
        schema_details = []
        schema.apply(lambda row:
            self.__handle_record_field(row, schema_details), axis=1)
        result = pd.concat([pd.DataFrame.from_dict(x) for x in schema_details])
        result.reset_index(drop=True, inplace=True)
        del result['fields']
        return result

    def table_schema(self, table_name):
        """
        Get the schema for a specific table from a dataset.
        Unrolls nested field names into the format that can be copied
        directly into queries. For example, for the `github.commits` table,
        the this will return `committer.name`.

        This is a very different return signature than BigQuery's table.schema.
        """
        self.__fetch_table(table_name)
        raw_schema = self.tables[table_name].schema
        schema = pd.DataFrame.from_dict([x.to_api_repr() for x in raw_schema])
        # the api_repr only has the fields column for tables with nested data
        if 'fields' in schema.columns:
            schema = self.__unpack_all_schema_fields(schema)
        # Set the column order
        schema = schema[['name', 'type', 'mode', 'description']]
        return schema

    def list_tables(self):
        """
        List the names of the tables in a dataset
        """
        self.__fetch_dataset()
        return([x.table_id for x in self.client.list_tables(self.dataset)])

    def estimate_query_size(self, query):
        """
        Estimate gigabytes scanned by query.
        Does not consider if there is a cached query table.
        See https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.dryRun
        """
        my_job_config = bigquery.job.QueryJobConfig()
        my_job_config.dry_run = True
        my_job = self.client.query(query, job_config=my_job_config)
        return my_job.total_bytes_processed / self.BYTES_PER_GB

    def query_to_pandas(self, query):
        """
        Execute a SQL query & return a pandas dataframe
        """
        my_job = self.client.query(query)
        start_time = time.time()
        while not my_job.done():
            if (time.time() - start_time) > self.max_wait_seconds:
                print("Max wait time elapsed, query cancelled.")
                self.client.cancel_job(my_job.job_id)
                return None
            time.sleep(0.1)
        # Queries that hit errors will return an exception type.
        # Those exceptions don't get raised until we call my_job.to_dataframe()
        # In that case, my_job.total_bytes_billed can be called but is None
        if my_job.total_bytes_billed:
            self.total_gb_used_net_cache += my_job.total_bytes_billed / self.BYTES_PER_GB
        return my_job.to_dataframe()

    def query_to_pandas_safe(self, query, max_gb_scanned=1):
        """
        Execute a query, but only if the query would scan less than `max_gb_scanned` of data.
        """
        query_size = self.estimate_query_size(query)
        if query_size <= max_gb_scanned:
            return self.query_to_pandas(query)
        msg = "Query cancelled; estimated size of {0} exceeds limit of {1} GB"
        print(msg.format(query_size, max_gb_scanned))

    def head(self, table_name, num_rows=5, start_index=None, selected_columns=None):
        """
        Get the first n rows of a table as a DataFrame.
        Does not perform a full table scan; should use a trivial amount of data as long as n is small.
        """
        self.__fetch_table(table_name)
        active_table = self.tables[table_name]
        schema_subset = None
        if selected_columns:
            schema_subset = [col for col in active_table.schema if col.name in selected_columns]
        results = self.client.list_rows(active_table, selected_fields=schema_subset,
            max_results=num_rows, start_index=start_index)
        results = [x for x in results]
        return pd.DataFrame(
            data=[list(x.values()) for x in results], columns=list(results[0].keys()))

## 1. Configuration

In [9]:
output_path = "/data/data/chicago_taxi"
project_id = "mlrun-trino"  # replace with your GCP project ID

## 2. BigQuery Connection

In [10]:
client = bigquery.Client(project=project_id)

In [12]:
query = """
SELECT  min(trip_start_timestamp) FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips` LIMIT 1000
"""

In [13]:
query_job = client.query(query)

In [14]:
results = query_job.result()

In [15]:
results

<google.cloud.bigquery.table.RowIterator at 0x726e53103b20>

In [19]:
bq_assistant = BigQueryHelper(active_project="mlrun-trino",
                              dataset_name="chicago_taxi_trips")

In [20]:
# confirm the connection is up by listing the tables
bq_assistant.list_tables()

NotFound: 404 GET https://bigquery.googleapis.com/bigquery/v2/projects/mlrun-trino/datasets/chicago_taxi_trips?prettyPrint=false: Not found: Dataset mlrun-trino:chicago_taxi_trips

## 3. Preview Data

In [None]:
bq_assistant.head("taxi_trips", num_rows=3)

Unnamed: 0,unique_key,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_census_tract,dropoff_census_tract,pickup_community_area,dropoff_community_area,...,extras,trip_total,payment_type,company,pickup_latitude,pickup_longitude,pickup_location,dropoff_latitude,dropoff_longitude,dropoff_location
0,9f4d762848799be329e007ae695297ddf2ec9e68,53b9f89457dbe76abd2c5d0d7f00e0c2bf48a1e1035122...,2019-09-09 15:00:00+00:00,2019-09-09 15:15:00+00:00,1279,14.39,,,,,...,24.0,60.75,Credit Card,Flash Cab,,,,,,
1,9f28593d41d91fcf46c81ab9395c2dc8a6a26c9a,a7108907c04f278905344870b79e80244259d14f1335b5...,2019-09-04 07:15:00+00:00,2019-09-04 07:45:00+00:00,1920,14.4,,,,,...,0.0,44.2,Credit Card,Star North Management LLC,,,,,,
2,9dd67852ff43243a3121d24ea21ac5f84dd798a1,81d652b4d1a83430b2ecbdd8f126b135f9f37b6115b7a5...,2019-09-05 07:15:00+00:00,2019-09-05 07:30:00+00:00,459,1.0,,,,,...,0.0,6.5,Cash,Taxi Affiliation Service Yellow,,,,,,


In [None]:
bq_assistant.table_schema("taxi_trips")

Unnamed: 0,name,type,mode,description
0,unique_key,STRING,REQUIRED,Unique identifier for the trip.
1,taxi_id,STRING,REQUIRED,A unique identifier for the taxi.
2,trip_start_timestamp,TIMESTAMP,NULLABLE,"When the trip started, rounded to the nearest ..."
3,trip_end_timestamp,TIMESTAMP,NULLABLE,"When the trip ended, rounded to the nearest 15..."
4,trip_seconds,INTEGER,NULLABLE,Time of the trip in seconds.
5,trip_miles,FLOAT,NULLABLE,Distance of the trip in miles.
6,pickup_census_tract,INTEGER,NULLABLE,The Census Tract where the trip began. For pri...
7,dropoff_census_tract,INTEGER,NULLABLE,The Census Tract where the trip ended. For pri...
8,pickup_community_area,INTEGER,NULLABLE,The Community Area where the trip began.
9,dropoff_community_area,INTEGER,NULLABLE,The Community Area where the trip ended.


## 4. Download Chicago Taxi Trips

In [None]:
# ---- parameters you can tweak ----
TABLE_FQN = "`bigquery-public-data.chicago_taxi_trips.taxi_trips`"
START = pd.Timestamp("2023-01-01")
END   = pd.Timestamp.today().normalize()  # up to today; change if you want a fixed cutoff
OUTDIR = output_path
# BigQuery billed bytes guard per chunk; raise/lower if needed
MAX_GB_SCANNED = 100   # per month-chunk limit for query_to_pandas_safe
# ---------------------------------

In [None]:
months = pd.period_range(START, END, freq="M")
saved = []

for p in months:
    month_start = pd.Timestamp(p.start_time)
    month_end   = (month_start + pd.offsets.MonthBegin(1))  # exclusive upper bound

    query = f"""
    SELECT *
    FROM {TABLE_FQN}
    WHERE trip_start_timestamp >= TIMESTAMP('{month_start.strftime("%Y-%m-%d")} 00:00:00')
      AND trip_start_timestamp <  TIMESTAMP('{month_end.strftime("%Y-%m-%d")} 00:00:00')
    """

    # Pull this month's rows into a DataFrame (safeguarded by MAX_GB_SCANNED)
    df = bq_assistant.query_to_pandas_safe(query, max_gb_scanned=MAX_GB_SCANNED)

    # Skip empty months gracefully
    if df is None or len(df) == 0:
        print(f"[skip] {p} -> no rows returned")
        continue

    # Write compressed parquet (gzip) into /kaggle/working
    # File name like: chicago_taxi_2023_01.parquet.gzip
    out_path = f"{OUTDIR}/chicago_taxi_{p.year:04d}_{p.month:02d}.parquet"
    df.to_parquet(out_path,compression="snappy")
    print(f"[ok] saved {out_path} (rows={len(df)})")
    saved.append(out_path)

Forbidden: 403 POST https://bigquery.googleapis.com/bigquery/v2/projects/bigquery-public-data/jobs?prettyPrint=false: Access Denied: Project bigquery-public-data: User does not have bigquery.jobs.create permission in project bigquery-public-data.

Location: None
Job ID: bce471f1-d3d4-4ba2-b8dd-f45231480c1d


In [None]:
# Optional: manifest of all part files
pd.Series(saved).to_csv(f"{OUTDIR}/chicago_taxi_2023_onward_manifest.txt", index=False)
print(f"\nWrote manifest with {len(saved)} files to {OUTDIR}/chicago_taxi_2023_onward_manifest.txt")