diff --git a/datasets/new_york_taxi_trips/_images/run_csv_transform_kub/Dockerfile b/datasets/new_york_taxi_trips/_images/run_csv_transform_kub/Dockerfile
new file mode 100644
index 000000000..7265a1b71
--- /dev/null
+++ b/datasets/new_york_taxi_trips/_images/run_csv_transform_kub/Dockerfile
@@ -0,0 +1,37 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# The base image for this build
+FROM python:3.8
+
+# Allow statements and log messages to appear in Cloud logs
+ENV PYTHONUNBUFFERED True
+
+# Copy the requirements file into the image
+COPY requirements.txt ./
+
+# Install the packages specified in the requirements file
+RUN python3 -m pip install --no-cache-dir -r requirements.txt
+
+# The WORKDIR instruction sets the working directory for any RUN, CMD,
+# ENTRYPOINT, COPY and ADD instructions that follow it in the Dockerfile.
+# If the WORKDIR doesn’t exist, it will be created even if it’s not used in
+# any subsequent Dockerfile instruction
+WORKDIR /custom
+
+# Copy the specific data processing script/s in the image under /custom/*
+COPY ./csv_transform.py .
+
+# Command to run the data processing script when the container is run
+CMD ["python3", "csv_transform.py"]
diff --git a/datasets/new_york_taxi_trips/_images/run_csv_transform_kub/csv_transform.py b/datasets/new_york_taxi_trips/_images/run_csv_transform_kub/csv_transform.py
new file mode 100644
index 000000000..775e691fb
--- /dev/null
+++ b/datasets/new_york_taxi_trips/_images/run_csv_transform_kub/csv_transform.py
@@ -0,0 +1,423 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import logging
+import os
+import pathlib
+import typing
+from datetime import datetime
+
+import pandas as pd
+import requests
+from google.cloud import bigquery, storage
+from google.cloud.exceptions import NotFound
+
+
+def main(
+ source_url: str,
+ source_file: pathlib.Path,
+ target_file: pathlib.Path,
+ project_id: str,
+ dataset_id: str,
+ table_id: str,
+ schema_path: str,
+ chunksize: str,
+ target_gcs_bucket: str,
+ target_gcs_path: str,
+ pipeline_name: str,
+ input_headers: typing.List[str],
+ data_dtypes: dict,
+ output_headers: typing.List[str],
+) -> None:
+ logging.info(f"New York taxi trips - {pipeline_name} process started")
+ pathlib.Path("./files").mkdir(parents=True, exist_ok=True)
+ execute_pipeline(
+ source_url,
+ str(source_file),
+ str(target_file),
+ project_id,
+ dataset_id,
+ table_id,
+ schema_path,
+ chunksize,
+ target_gcs_bucket,
+ target_gcs_path,
+ pipeline_name,
+ input_headers,
+ data_dtypes,
+ output_headers,
+ )
+ logging.info(f"New York taxi trips - {pipeline_name} process completed")
+
+
+def execute_pipeline(
+ source_url: str,
+ source_file: str,
+ target_file: str,
+ project_id: str,
+ dataset_id: str,
+ table_id: str,
+ schema_path: str,
+ chunksize: str,
+ target_gcs_bucket: str,
+ target_gcs_path: str,
+ pipeline_name: str,
+ input_headers: typing.List[str],
+ data_dtypes: dict,
+ output_headers: typing.List[str],
+) -> None:
+ for year_number in range(datetime.now().year, (datetime.now().year - 6), -1):
+ target_file_name = str.replace(target_file, ".csv", f"_{year_number}.csv")
+ process_year_data(
+ source_url,
+ int(year_number),
+ source_file,
+ target_file,
+ target_file_name,
+ project_id,
+ dataset_id,
+ table_id,
+ schema_path,
+ chunksize,
+ target_gcs_bucket,
+ target_gcs_path,
+ pipeline_name,
+ input_headers,
+ data_dtypes,
+ output_headers,
+ )
+
+
+def process_year_data(
+ source_url: str,
+ year_number: int,
+ source_file: str,
+ target_file: str,
+ target_file_name: str,
+ project_id: str,
+ dataset_id: str,
+ table_id: str,
+ schema_path: str,
+ chunksize: str,
+ target_gcs_bucket: str,
+ target_gcs_path: str,
+ pipeline_name: str,
+ input_headers: typing.List[str],
+ data_dtypes: dict,
+ output_headers: typing.List[str],
+) -> None:
+ logging.info(f"Processing year {year_number}")
+ destination_table = f"{table_id}_{year_number}"
+ year_data_available = False
+ for month_number in range(1, 13):
+ month_data_available = process_month(
+ source_url,
+ year_number,
+ month_number,
+ source_file,
+ target_file,
+ target_file_name,
+ chunksize,
+ input_headers,
+ data_dtypes,
+ output_headers,
+ pipeline_name,
+ )
+ if month_data_available:
+ year_data_available = True
+ else:
+ pass
+ if os.path.exists(target_file_name) and year_data_available:
+ upload_file_to_gcs(
+ target_file_name,
+ target_gcs_bucket,
+ str(target_gcs_path).replace(".csv", f"_{year_number}.csv"),
+ )
+ create_dest_table(
+ project_id, dataset_id, destination_table, schema_path, target_gcs_bucket
+ )
+ load_data_to_bq(project_id, dataset_id, destination_table, target_file_name)
+ else:
+ logging.info(
+ f"Informational: The data file {target_file_name} was not generated because no data was available for year {year_number}. Continuing."
+ )
+ logging.info(f"Processing year {year_number} completed")
+
+
+def load_data_to_bq(
+ project_id: str, dataset_id: str, table_id: str, file_path: str
+) -> None:
+ logging.info(
+ f"Loading data from {file_path} into {project_id}.{dataset_id}.{table_id} started"
+ )
+ client = bigquery.Client(project=project_id)
+ table_ref = client.dataset(dataset_id).table(table_id)
+ job_config = bigquery.LoadJobConfig()
+ job_config.source_format = bigquery.SourceFormat.CSV
+ job_config.skip_leading_rows = 1 # ignore the header
+ job_config.autodetect = False
+ with open(file_path, "rb") as source_file:
+ job = client.load_table_from_file(source_file, table_ref, job_config=job_config)
+ job.result()
+ logging.info(
+ f"Loading data from {file_path} into {project_id}.{dataset_id}.{table_id} completed"
+ )
+
+
+def create_dest_table(
+ project_id: str,
+ dataset_id: str,
+ table_id: str,
+ schema_filepath: list,
+ bucket_name: str,
+) -> bool:
+ table_ref = f"{project_id}.{dataset_id}.{table_id}"
+ logging.info(f"Attempting to create table {table_ref} if it doesn't already exist")
+ client = bigquery.Client()
+ success = False
+ try:
+ table_exists_id = client.get_table(table_ref).table_id
+ logging.info(f"Table {table_exists_id} currently exists.")
+ success = True
+ except NotFound:
+ logging.info(
+ (
+ f"Table {table_ref} currently does not exist. Attempting to create table."
+ )
+ )
+ schema = create_table_schema([], bucket_name, schema_filepath)
+ table = bigquery.Table(table_ref, schema=schema)
+ client.create_table(table)
+ print(f"Table {table_ref} was created".format(table_id))
+ success = True
+ return success
+
+
+def create_table_schema(
+ schema_structure: list, bucket_name: str = "", schema_filepath: str = ""
+) -> list:
+ logging.info(f"Defining table schema... {bucket_name} ... {schema_filepath}")
+ schema = []
+ if not (schema_filepath):
+ schema_struct = schema_structure
+ else:
+ storage_client = storage.Client()
+ bucket = storage_client.get_bucket(bucket_name)
+ blob = bucket.blob(schema_filepath)
+ schema_struct = json.loads(blob.download_as_string(client=None))
+ for schema_field in schema_struct:
+ fld_name = schema_field["name"]
+ fld_type = schema_field["type"]
+ try:
+ fld_descr = schema_field["description"]
+ except KeyError:
+ fld_descr = ""
+ fld_mode = schema_field["mode"]
+ schema.append(
+ bigquery.SchemaField(
+ name=fld_name, field_type=fld_type, mode=fld_mode, description=fld_descr
+ )
+ )
+ return schema
+
+
+def process_month(
+ source_url: str,
+ year_number: int,
+ month_number: int,
+ source_file: str,
+ target_file: str,
+ target_file_name: str,
+ chunksize: str,
+ input_headers: typing.List[str],
+ data_dtypes: dict,
+ output_headers: typing.List[str],
+ pipeline_name: str,
+) -> None:
+ process_month = str(year_number) + "-" + str(month_number).zfill(2)
+ logging.info(f"Processing {process_month} started")
+ source_url_to_process = f"{source_url}{process_month}.csv"
+ source_file_to_process = str(source_file).replace(".csv", f"_{process_month}.csv")
+ successful_download = download_file(source_url_to_process, source_file_to_process)
+ if successful_download:
+ with pd.read_csv(
+ source_file_to_process,
+ engine="python",
+ encoding="utf-8",
+ quotechar='"',
+ chunksize=int(chunksize),
+ sep=",",
+ names=input_headers,
+ skiprows=1,
+ dtype=data_dtypes,
+ ) as reader:
+ for chunk_number, chunk in enumerate(reader):
+ logging.info(
+ f"Processing chunk #{chunk_number} of file {process_month} started"
+ )
+ target_file_batch = str(target_file).replace(
+ ".csv", f"-{process_month}-{chunk_number}.csv"
+ )
+ df = pd.DataFrame()
+ df = pd.concat([df, chunk])
+ process_chunk(
+ df,
+ target_file_batch,
+ target_file_name,
+ month_number == 1 and chunk_number == 0,
+ month_number == 1 and chunk_number == 0,
+ output_headers,
+ pipeline_name,
+ )
+ logging.info(
+ f"Processing chunk #{chunk_number} of file {process_month} completed"
+ )
+ logging.info(f"Processing {process_month} completed")
+ return successful_download
+
+
+def download_file(source_url: str, source_file: pathlib.Path) -> bool:
+ logging.info(f"Downloading {source_url} into {source_file}")
+ success = True
+ r = requests.get(source_url, stream=True)
+ with open(source_file, "wb") as f:
+ for chunk in r:
+ f.write(chunk)
+    # if the file contains the string "NoSuchKey" then the url returned
+    # that it could not locate the respective file
+    if open(source_file, "rb").read().find(b"NoSuchKey") > -1:
+ success = False
+ if success:
+ logging.info(f"Download {source_url} to {source_file} complete.")
+ else:
+ logging.info(
+ f"Unable to download {source_url} to {source_file} at this time. The URL may not exist."
+ )
+ return success
+
+
+def process_chunk(
+ df: pd.DataFrame,
+ target_file_batch: str,
+ target_file: str,
+ include_header: bool,
+ truncate_file: bool,
+ output_headers: typing.List[str],
+ pipeline_name: str,
+) -> None:
+ if pipeline_name == "tlc_green_trips":
+ df["distance_between_service"] = ""
+ df["time_between_service"] = ""
+ df = format_date_time(df, "pickup_datetime", "strftime", "%Y-%m-%d %H:%M:%S")
+ df = format_date_time(df, "dropoff_datetime", "strftime", "%Y-%m-%d %H:%M:%S")
+ df = remove_null_rows(df)
+ df = df[output_headers]
+ save_to_new_file(df, file_path=str(target_file_batch))
+ append_batch_file(target_file_batch, target_file, include_header, truncate_file)
+ logging.info(f"Processing Batch {target_file_batch} completed")
+
+
+def remove_null_rows(df: pd.DataFrame) -> pd.DataFrame:
+ logging.info("Removing Null rows... ")
+ df = df.dropna(axis=0, subset=["vendor_id"])
+ return df
+
+
+def format_date_time(
+ df: pd.DataFrame, field_name: str, str_pf_time: str, dt_format: str
+) -> pd.DataFrame:
+ if str_pf_time == "strptime":
+ logging.info(
+ f"Transform: Formatting datetime for field {field_name} from datetime to {dt_format} "
+ )
+ df[field_name] = df[field_name].apply(lambda x: datetime.strptime(x, dt_format))
+ else:
+ logging.info(
+ f"Transform: Formatting datetime for field {field_name} from {dt_format} to datetime "
+ )
+ df[field_name] = df[field_name].dt.strftime(dt_format)
+ return df
+
+
+def save_to_new_file(df, file_path, sep="|") -> None:
+ logging.info(f"Saving to file {file_path} separator='{sep}'")
+ df.to_csv(file_path, sep=sep, index=False)
+
+
+def append_batch_file(
+ batch_file_path: str,
+ target_file_path: str,
+ include_header: bool,
+ truncate_target_file: bool,
+) -> None:
+ logging.info(
+ f"Appending file {batch_file_path} to file {target_file_path} with include_header={include_header} and truncate_target_file={truncate_target_file}"
+ )
+ data_file = open(batch_file_path, "r")
+ if truncate_target_file:
+ target_file = open(target_file_path, "w+").close()
+ target_file = open(target_file_path, "a+")
+ if not include_header:
+ logging.info(
+ f"Appending batch file {batch_file_path} to {target_file_path} without header"
+ )
+ next(data_file)
+ else:
+ logging.info(
+ f"Appending batch file {batch_file_path} to {target_file_path} with header"
+ )
+ target_file.write(data_file.read())
+ data_file.close()
+ target_file.close()
+ if os.path.exists(batch_file_path):
+ os.remove(batch_file_path)
+
+
+def upload_file_to_gcs(
+ file_path: pathlib.Path, target_gcs_bucket: str, target_gcs_path: str
+) -> None:
+ if os.path.exists(file_path):
+ logging.info(
+ f"Uploading output file to gs://{target_gcs_bucket}/{target_gcs_path}"
+ )
+ storage_client = storage.Client()
+ bucket = storage_client.bucket(target_gcs_bucket)
+ blob = bucket.blob(target_gcs_path)
+ blob.upload_from_filename(file_path)
+ else:
+ logging.info(
+ f"Cannot upload file to gs://{target_gcs_bucket}/{target_gcs_path} as it does not exist."
+ )
+
+
+if __name__ == "__main__":
+ logging.getLogger().setLevel(logging.INFO)
+
+ main(
+ source_url=os.environ["SOURCE_URL"],
+ source_file=pathlib.Path(os.environ["SOURCE_FILE"]).expanduser(),
+ target_file=pathlib.Path(os.environ["TARGET_FILE"]).expanduser(),
+ project_id=os.environ["PROJECT_ID"],
+ dataset_id=os.environ["DATASET_ID"],
+ table_id=os.environ["TABLE_ID"],
+ schema_path=os.environ["SCHEMA_PATH"],
+ chunksize=os.environ["CHUNKSIZE"],
+ target_gcs_bucket=os.environ["TARGET_GCS_BUCKET"],
+ target_gcs_path=os.environ["TARGET_GCS_PATH"],
+ pipeline_name=os.environ["PIPELINE_NAME"],
+ input_headers=json.loads(os.environ["INPUT_CSV_HEADERS"]),
+ data_dtypes=json.loads(os.environ["DATA_DTYPES"]),
+ output_headers=json.loads(os.environ["OUTPUT_CSV_HEADERS"]),
+ )
diff --git a/datasets/new_york_taxi_trips/_images/run_csv_transform_kub/green_trips_schema.json b/datasets/new_york_taxi_trips/_images/run_csv_transform_kub/green_trips_schema.json
new file mode 100644
index 000000000..a2fa3775f
--- /dev/null
+++ b/datasets/new_york_taxi_trips/_images/run_csv_transform_kub/green_trips_schema.json
@@ -0,0 +1,128 @@
+[
+ {
+ "name": "vendor_id",
+ "type": "STRING",
+ "description": "A code indicating the LPEP provider that provided the record. 1= Creative Mobile Technologies, LLC; 2= VeriFone Inc.",
+ "mode": "REQUIRED"
+ },
+ {
+ "name": "pickup_datetime",
+ "type": "TIMESTAMP",
+ "description": "The date and time when the meter was engaged",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "dropoff_datetime",
+ "type": "TIMESTAMP",
+ "description": "The date and time when the meter was disengaged",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "store_and_fwd_flag",
+ "type": "STRING",
+ "description": "This flag indicates whether the trip record was held in vehicle memory before sending to the vendor, aka 'store and forward,' because the vehicle did not have a connection to the server. Y= store and forward trip N= not a store and forward trip",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "rate_code",
+ "type": "STRING",
+ "description": "The final rate code in effect at the end of the trip. 1= Standard rate 2=JFK 3=Newark 4=Nassau or Westchester 5=Negotiated fare 6=Group ride",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "passenger_count",
+ "type": "INTEGER",
+ "description": "The number of passengers in the vehicle. This is a driver-entered value.",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "trip_distance",
+ "type": "NUMERIC",
+ "description": "The elapsed trip distance in miles reported by the taximeter.",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "fare_amount",
+ "type": "NUMERIC",
+ "description": "The time-and-distance fare calculated by the meter",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "extra",
+ "type": "NUMERIC",
+ "description": "Miscellaneous extras and surcharges. Currently, this only includes the $0.50 and $1 rush hour and overnight charges",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "mta_tax",
+ "type": "NUMERIC",
+ "description": "$0.50 MTA tax that is automatically triggered based on the metered rate in use",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "tip_amount",
+ "type": "NUMERIC",
+ "description": "Tip amount. This field is automatically populated for credit card tips. Cash tips are not included.",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "tolls_amount",
+ "type": "NUMERIC",
+ "description": "Total amount of all tolls paid in trip.",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "ehail_fee",
+ "type": "NUMERIC",
+ "description": "",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "total_amount",
+ "type": "NUMERIC",
+ "description": "The total amount charged to passengers. Does not include cash tips.",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "payment_type",
+        "type": "STRING",
+ "description": "A numeric code signifying how the passenger paid for the trip. 1= Credit card 2= Cash 3= No charge 4= Dispute 5= Unknown 6= Voided trip",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "distance_between_service",
+ "type": "NUMERIC",
+ "description": "",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "time_between_service",
+ "type": "INTEGER",
+ "description": "",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "trip_type",
+ "type": "STRING",
+ "description": "A code indicating whether the trip was a street-hail or a dispatch that is automatically assigned based on the metered rate in use but can be altered by the driver. 1= Street-hail 2= Dispatch",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "imp_surcharge",
+ "type": "NUMERIC",
+ "description": "$0.30 improvement surcharge assessed on hailed trips at the flag drop. The improvement surcharge began being levied in 2015.",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "pickup_location_id",
+ "type": "STRING",
+ "description": "TLC Taxi Zone in which the taximeter was engaged",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "dropoff_location_id",
+ "type": "STRING",
+ "description": "TLC Taxi Zone in which the taximeter was disengaged",
+ "mode": "NULLABLE"
+ }
+]
diff --git a/datasets/new_york_taxi_trips/_images/run_csv_transform_kub/requirements.txt b/datasets/new_york_taxi_trips/_images/run_csv_transform_kub/requirements.txt
new file mode 100644
index 000000000..f87f393f3
--- /dev/null
+++ b/datasets/new_york_taxi_trips/_images/run_csv_transform_kub/requirements.txt
@@ -0,0 +1,4 @@
+google-cloud-storage
+google-cloud-bigquery
+pandas
+requests
diff --git a/datasets/new_york_taxi_trips/_images/run_csv_transform_kub/yellow_trips_schema.json b/datasets/new_york_taxi_trips/_images/run_csv_transform_kub/yellow_trips_schema.json
new file mode 100644
index 000000000..aea390c5f
--- /dev/null
+++ b/datasets/new_york_taxi_trips/_images/run_csv_transform_kub/yellow_trips_schema.json
@@ -0,0 +1,104 @@
+[
+ {
+ "name": "vendor_id",
+ "type": "STRING",
+        "description": "A code indicating the TPEP provider that provided the record. 1= Creative Mobile Technologies, LLC; 2= VeriFone Inc.",
+ "mode": "REQUIRED"
+ },
+ {
+ "name": "pickup_datetime",
+ "type": "TIMESTAMP",
+ "description": "The date and time when the meter was engaged",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "dropoff_datetime",
+ "type": "TIMESTAMP",
+ "description": "The date and time when the meter was disengaged",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "passenger_count",
+ "type": "INTEGER",
+ "description": "The number of passengers in the vehicle. This is a driver-entered value.",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "trip_distance",
+ "type": "NUMERIC",
+ "description": "The elapsed trip distance in miles reported by the taximeter.",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "rate_code",
+ "type": "STRING",
+ "description": "The final rate code in effect at the end of the trip. 1= Standard rate 2=JFK 3=Newark 4=Nassau or Westchester 5=Negotiated fare 6=Group ride",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "store_and_fwd_flag",
+ "type": "STRING",
+ "description": "This flag indicates whether the trip record was held in vehicle memory before sending to the vendor, aka 'store and forward,' because the vehicle did not have a connection to the server. Y= store and forward trip N= not a store and forward trip",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "payment_type",
+        "type": "STRING",
+ "description": "A numeric code signifying how the passenger paid for the trip. 1= Credit card 2= Cash 3= No charge 4= Dispute 5= Unknown 6= Voided trip",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "fare_amount",
+ "type": "NUMERIC",
+ "description": "The time-and-distance fare calculated by the meter",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "extra",
+ "type": "NUMERIC",
+ "description": "Miscellaneous extras and surcharges. Currently, this only includes the $0.50 and $1 rush hour and overnight charges",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "mta_tax",
+ "type": "NUMERIC",
+ "description": "$0.50 MTA tax that is automatically triggered based on the metered rate in use",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "tip_amount",
+ "type": "NUMERIC",
+ "description": "Tip amount. This field is automatically populated for credit card tips. Cash tips are not included.",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "tolls_amount",
+ "type": "NUMERIC",
+ "description": "Total amount of all tolls paid in trip.",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "imp_surcharge",
+ "type": "NUMERIC",
+ "description": "$0.30 improvement surcharge assessed on hailed trips at the flag drop. The improvement surcharge began being levied in 2015.",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "total_amount",
+ "type": "NUMERIC",
+ "description": "The total amount charged to passengers. Does not include cash tips.",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "pickup_location_id",
+ "type": "STRING",
+ "description": "TLC Taxi Zone in which the taximeter was engaged",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "dropoff_location_id",
+ "type": "STRING",
+ "description": "TLC Taxi Zone in which the taximeter was disengaged",
+ "mode": "NULLABLE"
+ }
+]
diff --git a/datasets/new_york_taxi_trips/infra/new_york_taxi_trips_dataset.tf b/datasets/new_york_taxi_trips/infra/new_york_taxi_trips_dataset.tf
new file mode 100644
index 000000000..890312890
--- /dev/null
+++ b/datasets/new_york_taxi_trips/infra/new_york_taxi_trips_dataset.tf
@@ -0,0 +1,25 @@
+/**
+ * Copyright 2021 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+resource "google_bigquery_dataset" "new_york_taxi_trips" {
+ dataset_id = "new_york_taxi_trips"
+ project = var.project_id
+}
+
+output "bigquery_dataset-new_york_taxi_trips-dataset_id" {
+ value = google_bigquery_dataset.new_york_taxi_trips.dataset_id
+}
diff --git a/datasets/new_york_taxi_trips/infra/new_york_taxi_trips_pipeline.tf b/datasets/new_york_taxi_trips/infra/new_york_taxi_trips_pipeline.tf
new file mode 100644
index 000000000..cf18c92d0
--- /dev/null
+++ b/datasets/new_york_taxi_trips/infra/new_york_taxi_trips_pipeline.tf
@@ -0,0 +1,52 @@
+/**
+ * Copyright 2021 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+resource "google_bigquery_table" "new_york_taxi_trips_tlc_green_trips" {
+ project = var.project_id
+ dataset_id = "new_york_taxi_trips"
+ table_id = "tlc_green_trips"
+ description = "New York green taxi trips table"
+ depends_on = [
+ google_bigquery_dataset.new_york_taxi_trips
+ ]
+}
+
+output "bigquery_table-new_york_taxi_trips_tlc_green_trips-table_id" {
+ value = google_bigquery_table.new_york_taxi_trips_tlc_green_trips.table_id
+}
+
+output "bigquery_table-new_york_taxi_trips_tlc_green_trips-id" {
+ value = google_bigquery_table.new_york_taxi_trips_tlc_green_trips.id
+}
+
+resource "google_bigquery_table" "new_york_taxi_trips_tlc_yellow_trips" {
+ project = var.project_id
+ dataset_id = "new_york_taxi_trips"
+ table_id = "tlc_yellow_trips"
+ description = "New York yellow taxi trips table"
+ depends_on = [
+ google_bigquery_dataset.new_york_taxi_trips
+ ]
+}
+
+output "bigquery_table-new_york_taxi_trips_tlc_yellow_trips-table_id" {
+ value = google_bigquery_table.new_york_taxi_trips_tlc_yellow_trips.table_id
+}
+
+output "bigquery_table-new_york_taxi_trips_tlc_yellow_trips-id" {
+ value = google_bigquery_table.new_york_taxi_trips_tlc_yellow_trips.id
+}
diff --git a/datasets/new_york_taxi_trips/infra/provider.tf b/datasets/new_york_taxi_trips/infra/provider.tf
new file mode 100644
index 000000000..23ab87dcd
--- /dev/null
+++ b/datasets/new_york_taxi_trips/infra/provider.tf
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2021 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+provider "google" {
+ project = var.project_id
+ impersonate_service_account = var.impersonating_acct
+ region = var.region
+}
+
+data "google_client_openid_userinfo" "me" {}
+
+output "impersonating-account" {
+ value = data.google_client_openid_userinfo.me.email
+}
diff --git a/datasets/new_york_taxi_trips/infra/variables.tf b/datasets/new_york_taxi_trips/infra/variables.tf
new file mode 100644
index 000000000..53f483735
--- /dev/null
+++ b/datasets/new_york_taxi_trips/infra/variables.tf
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2021 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+variable "project_id" {}
+variable "bucket_name_prefix" {}
+variable "impersonating_acct" {}
+variable "region" {}
+variable "env" {}
+variable "iam_policies" {
+ default = {}
+}
+
diff --git a/datasets/new_york_taxi_trips/pipelines/_images/run_csv_transform_kub/Dockerfile b/datasets/new_york_taxi_trips/pipelines/_images/run_csv_transform_kub/Dockerfile
new file mode 100644
index 000000000..7265a1b71
--- /dev/null
+++ b/datasets/new_york_taxi_trips/pipelines/_images/run_csv_transform_kub/Dockerfile
@@ -0,0 +1,37 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# The base image for this build
+FROM python:3.8
+
+# Allow statements and log messages to appear in Cloud logs
+ENV PYTHONUNBUFFERED True
+
+# Copy the requirements file into the image
+COPY requirements.txt ./
+
+# Install the packages specified in the requirements file
+RUN python3 -m pip install --no-cache-dir -r requirements.txt
+
+# The WORKDIR instruction sets the working directory for any RUN, CMD,
+# ENTRYPOINT, COPY and ADD instructions that follow it in the Dockerfile.
+# If the WORKDIR doesn’t exist, it will be created even if it’s not used in
+# any subsequent Dockerfile instruction
+WORKDIR /custom
+
+# Copy the specific data processing script/s in the image under /custom/*
+COPY ./csv_transform.py .
+
+# Command to run the data processing script when the container is run
+CMD ["python3", "csv_transform.py"]
diff --git a/datasets/new_york_taxi_trips/pipelines/_images/run_csv_transform_kub/csv_transform.py b/datasets/new_york_taxi_trips/pipelines/_images/run_csv_transform_kub/csv_transform.py
new file mode 100644
index 000000000..d4e1358c4
--- /dev/null
+++ b/datasets/new_york_taxi_trips/pipelines/_images/run_csv_transform_kub/csv_transform.py
@@ -0,0 +1,536 @@
+# Copyright 2022 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import logging
+import os
+import pathlib
+import typing
+from datetime import datetime
+
+import pandas as pd
+import requests
+from google.cloud import bigquery, storage
+from google.cloud.exceptions import NotFound
+
+
+def main(
+ source_url: str,
+ source_file: pathlib.Path,
+ target_file: pathlib.Path,
+ project_id: str,
+ dataset_id: str,
+ table_id: str,
+ data_file_year_field: str,
+ data_file_month_field: str,
+ schema_path: str,
+ chunksize: str,
+ target_gcs_bucket: str,
+ target_gcs_path: str,
+ pipeline_name: str,
+ input_headers: typing.List[str],
+ data_dtypes: dict,
+ output_headers: typing.List[str],
+) -> None:
+ logging.info(f"New York taxi trips - {pipeline_name} process started")
+ pathlib.Path("./files").mkdir(parents=True, exist_ok=True)
+ execute_pipeline(
+ source_url,
+ str(source_file),
+ str(target_file),
+ project_id,
+ dataset_id,
+ table_id,
+ data_file_year_field,
+ data_file_month_field,
+ schema_path,
+ chunksize,
+ target_gcs_bucket,
+ target_gcs_path,
+ pipeline_name,
+ input_headers,
+ data_dtypes,
+ output_headers,
+ )
+ logging.info(f"New York taxi trips - {pipeline_name} process completed")
+
+
+def execute_pipeline(
+ source_url: str,
+ source_file: str,
+ target_file: str,
+ project_id: str,
+ dataset_id: str,
+ table_id: str,
+ data_file_year_field: str,
+ data_file_month_field: str,
+ schema_path: str,
+ chunksize: str,
+ target_gcs_bucket: str,
+ target_gcs_path: str,
+ pipeline_name: str,
+ input_headers: typing.List[str],
+ data_dtypes: dict,
+ output_headers: typing.List[str],
+) -> None:
+ for year_number in range(datetime.now().year, (datetime.now().year - 6), -1):
+ process_year_data(
+ source_url=source_url,
+ year_number=int(year_number),
+ source_file=source_file,
+ target_file=target_file,
+ project_id=project_id,
+ dataset_id=dataset_id,
+ table_id=table_id,
+ data_file_year_field=data_file_year_field,
+ data_file_month_field=data_file_month_field,
+ schema_path=schema_path,
+ chunksize=chunksize,
+ target_gcs_bucket=target_gcs_bucket,
+ target_gcs_path=target_gcs_path,
+ pipeline_name=pipeline_name,
+ input_headers=input_headers,
+ data_dtypes=data_dtypes,
+ output_headers=output_headers,
+ )
+
+
+def process_year_data(
+ source_url: str,
+ year_number: int,
+ source_file: str,
+ target_file: str,
+ project_id: str,
+ dataset_id: str,
+ table_id: str,
+ data_file_year_field: str,
+ data_file_month_field: str,
+ schema_path: str,
+ chunksize: str,
+ target_gcs_bucket: str,
+ target_gcs_path: str,
+ pipeline_name: str,
+ input_headers: typing.List[str],
+ data_dtypes: dict,
+ output_headers: typing.List[str],
+) -> None:
+ logging.info(f"Processing year {year_number}")
+ destination_table = f"{table_id}_{year_number}"
+ for month_number in range(1, 13):
+ padded_month = str(month_number).zfill(2)
+ process_year_month = f"{year_number}-{padded_month}"
+ logging.info(f"Processing month {process_year_month}")
+ month_data_already_loaded = table_has_month_data(
+ project_id=project_id,
+ dataset_id=dataset_id,
+ table_name=destination_table,
+ data_file_year_field=data_file_year_field,
+ year_number=year_number,
+ data_file_month_field=data_file_month_field,
+ month_number=month_number,
+ )
+ if month_data_already_loaded:
+ logging.info(f"{process_year_month} data is already loaded. Skipping.")
+ else:
+ target_file_name = str.replace(
+ target_file, ".csv", f"_{year_number}-{month_number}.csv"
+ )
+ process_month(
+ source_url=source_url,
+ year_number=year_number,
+ month_number=month_number,
+ source_file=source_file,
+ target_file=target_file,
+ project_id=project_id,
+ dataset_id=dataset_id,
+ table_id=destination_table,
+ target_file_name=target_file_name,
+ schema_path=schema_path,
+ chunksize=chunksize,
+ target_gcs_bucket=target_gcs_bucket,
+ target_gcs_path=target_gcs_path,
+ input_headers=input_headers,
+ data_dtypes=data_dtypes,
+ output_headers=output_headers,
+ pipeline_name=pipeline_name,
+ )
+ logging.info(f"Processing year {year_number} completed")
+
+
+def table_has_month_data(
+ project_id: str,
+ dataset_id: str,
+ table_name: str,
+ data_file_year_field: str,
+ year_number: int,
+ data_file_month_field: str,
+ month_number: int,
+) -> bool:
+ check_field_exists = field_exists(
+ project_id, dataset_id, table_name, data_file_month_field
+ )
+ if check_field_exists:
+ client = bigquery.Client(project=project_id)
+ query = f"""
+ SELECT count(1) AS number_of_rows
+ FROM {dataset_id}.{table_name}
+ WHERE {data_file_year_field} = {year_number}
+ AND {data_file_month_field} = {month_number}
+ """
+ job_config = bigquery.QueryJobConfig()
+ query_job = client.query(query, job_config=job_config)
+ for row in query_job.result():
+ count_rows = row.number_of_rows
+ if int(count_rows) > 0:
+ return True
+ else:
+ return False
+ else:
+ return None
+
+
+def table_exists(project_id: str, dataset_id: str, table_name: str) -> bool:
+ client = bigquery.Client(project=project_id)
+ tables = client.list_tables(dataset_id)
+ found_table = False
+ for tbl in tables:
+ if tbl.table_id == table_name:
+ found_table = True
+ return found_table
+
+
+def field_exists(
+ project_id: str, dataset_id: str, table_name: str, field_name: str
+) -> bool:
+ if table_exists(project_id, dataset_id, table_name):
+ client = bigquery.Client(project=project_id)
+ table_ref = f"{dataset_id}.{table_name}"
+ tbl_schema = client.get_table(table_ref).schema
+ found_field = False
+ for field in tbl_schema:
+ if field.name == field_name:
+ found_field = True
+ return found_field
+ else:
+ return False
+
+
+def load_data_to_bq(
+ project_id: str,
+ dataset_id: str,
+ table_id: str,
+ file_path: str,
+ field_delimiter: str,
+) -> None:
+ logging.info(
+ f"Loading data from {file_path} into {project_id}.{dataset_id}.{table_id} started"
+ )
+ client = bigquery.Client(project=project_id)
+ table_ref = client.dataset(dataset_id).table(table_id)
+ job_config = bigquery.LoadJobConfig()
+ job_config.source_format = bigquery.SourceFormat.CSV
+ job_config.field_delimiter = field_delimiter
+ job_config.skip_leading_rows = 1 # ignore the header
+ job_config.autodetect = False
+ with open(file_path, "rb") as source_file:
+ job = client.load_table_from_file(source_file, table_ref, job_config=job_config)
+ job.result()
+ logging.info(
+ f"Loading data from {file_path} into {project_id}.{dataset_id}.{table_id} completed"
+ )
+
+
+def create_dest_table(
+ project_id: str,
+ dataset_id: str,
+ table_id: str,
+ schema_filepath: list,
+ bucket_name: str,
+) -> None:
+ table_ref = f"{project_id}.{dataset_id}.{table_id}"
+ logging.info(f"Attempting to create table {table_ref} if it doesn't already exist")
+ client = bigquery.Client()
+ try:
+ table_exists_id = client.get_table(table_ref).table_id
+ logging.info(f"Table {table_exists_id} currently exists.")
+ except NotFound:
+ logging.info(
+ (
+ f"Table {table_ref} currently does not exist. Attempting to create table."
+ )
+ )
+ schema = create_table_schema([], bucket_name, schema_filepath)
+ table = bigquery.Table(table_ref, schema=schema)
+ client.create_table(table)
+        print(f"Table {table_ref} was created")
+
+
+def create_table_schema(
+ schema_structure: list, bucket_name: str = "", schema_filepath: str = ""
+) -> list:
+ logging.info(f"Defining table schema... {bucket_name} ... {schema_filepath}")
+ schema = []
+ if not (schema_filepath):
+ schema_struct = schema_structure
+ else:
+ storage_client = storage.Client()
+ bucket = storage_client.get_bucket(bucket_name)
+ blob = bucket.blob(schema_filepath)
+ schema_struct = json.loads(blob.download_as_string(client=None))
+ for schema_field in schema_struct:
+ fld_name = schema_field["name"]
+ fld_type = schema_field["type"]
+ try:
+ fld_descr = schema_field["description"]
+ except KeyError:
+ fld_descr = ""
+ fld_mode = schema_field["mode"]
+ schema.append(
+ bigquery.SchemaField(
+ name=fld_name, field_type=fld_type, mode=fld_mode, description=fld_descr
+ )
+ )
+ return schema
+
+
+def process_month(
+ source_url: str,
+ year_number: int,
+ month_number: int,
+ source_file: str,
+ target_file: str,
+ project_id: str,
+ dataset_id: str,
+ table_id: str,
+ target_file_name: str,
+ schema_path: str,
+ chunksize: str,
+ target_gcs_bucket: str,
+ target_gcs_path: str,
+ input_headers: typing.List[str],
+ data_dtypes: dict,
+ output_headers: typing.List[str],
+ pipeline_name: str,
+) -> None:
+ padded_month = str(month_number).zfill(2)
+ process_year_month = f"{year_number}-{padded_month}"
+ source_url_to_process = f"{source_url}{process_year_month}.csv"
+ source_file_to_process = str(source_file).replace(
+ ".csv", f"_{process_year_month}.csv"
+ )
+ successful_download = download_file(source_url_to_process, source_file_to_process)
+ if successful_download:
+ with pd.read_csv(
+ source_file_to_process,
+ engine="python",
+ encoding="utf-8",
+ quotechar='"',
+ chunksize=int(chunksize),
+ sep=",",
+ names=input_headers,
+ skiprows=1,
+ dtype=data_dtypes,
+ ) as reader:
+ for chunk_number, chunk in enumerate(reader):
+ logging.info(
+ f"Processing chunk #{chunk_number} of file {process_year_month} started"
+ )
+ target_file_batch = str(target_file).replace(
+ ".csv", f"-{process_year_month}-{chunk_number}.csv"
+ )
+ df = pd.DataFrame()
+ df = pd.concat([df, chunk])
+ process_chunk(
+ df,
+ target_file_batch,
+ target_file_name,
+ month_number == 1 and chunk_number == 0,
+ month_number == 1 and chunk_number == 0,
+ output_headers,
+ pipeline_name,
+ year_number,
+ month_number,
+ )
+ logging.info(
+ f"Processing chunk #{chunk_number} of file {process_year_month} completed"
+ )
+ if not table_exists(project_id, dataset_id, table_id):
+                # Destination table doesn't exist
+ create_dest_table(
+ project_id=project_id,
+ dataset_id=dataset_id,
+ table_id=table_id,
+ schema_filepath=schema_path,
+ bucket_name=target_gcs_bucket,
+ )
+ load_data_to_bq(
+ project_id=project_id,
+ dataset_id=dataset_id,
+ table_id=table_id,
+ file_path=target_file_name,
+ field_delimiter="|",
+ )
+ upload_file_to_gcs(
+ file_path=target_file_name,
+ target_gcs_bucket=target_gcs_bucket,
+ target_gcs_path=str(target_gcs_path).replace(
+ ".csv", f"_{process_year_month}.csv"
+ ),
+ )
+ else:
+ logging.info(
+ f"Informational: The data file {target_file_name} was not generated because no data was available for year {year_number}. Continuing."
+ )
+ logging.info(f"Processing {process_year_month} completed")
+
+
+def download_file(source_url: str, source_file: pathlib.Path) -> bool:
+ logging.info(f"Downloading {source_url} into {source_file}")
+ success = True
+ r = requests.get(source_url, stream=True)
+ with open(source_file, "wb") as f:
+ for chunk in r:
+ f.write(chunk)
+    # if the file contains the string "NoSuchKey" then the url returned
+    # that it could not locate the respective file
+    if open(source_file, "rb").read().find(b"NoSuchKey") > -1:
+ success = False
+ if success:
+ logging.info(f"Download {source_url} to {source_file} complete.")
+ else:
+ logging.info(
+ f"Unable to download {source_url} to {source_file} at this time. The URL may not exist."
+ )
+ return success
+
+
+def process_chunk(
+ df: pd.DataFrame,
+ target_file_batch: str,
+ target_file: str,
+ include_header: bool,
+ truncate_file: bool,
+ output_headers: typing.List[str],
+ pipeline_name: str,
+ year_number: int,
+ month_number: int,
+) -> None:
+ if pipeline_name == "tlc_green_trips":
+ df["distance_between_service"] = ""
+ df["time_between_service"] = ""
+ df["data_file_year"] = year_number
+ df["data_file_month"] = month_number
+ df = format_date_time(df, "pickup_datetime", "strftime", "%Y-%m-%d %H:%M:%S")
+ df = format_date_time(df, "dropoff_datetime", "strftime", "%Y-%m-%d %H:%M:%S")
+ df = remove_null_rows(df)
+ df = df[output_headers]
+ save_to_new_file(df, file_path=str(target_file_batch))
+ append_batch_file(target_file_batch, target_file, include_header, truncate_file)
+ logging.info(f"Processing Batch {target_file_batch} completed")
+
+
+def remove_null_rows(df: pd.DataFrame) -> pd.DataFrame:
+ logging.info("Removing Null rows... ")
+ df = df.dropna(axis=0, subset=["vendor_id"])
+ return df
+
+
+def format_date_time(
+ df: pd.DataFrame, field_name: str, str_pf_time: str, dt_format: str
+) -> pd.DataFrame:
+ if str_pf_time == "strptime":
+ logging.info(
+ f"Transform: Formatting datetime for field {field_name} from datetime to {dt_format} "
+ )
+ df[field_name] = df[field_name].apply(lambda x: datetime.strptime(x, dt_format))
+ else:
+ logging.info(
+ f"Transform: Formatting datetime for field {field_name} from {dt_format} to datetime "
+ )
+ df[field_name] = df[field_name].dt.strftime(dt_format)
+ return df
+
+
+def save_to_new_file(df, file_path, sep="|") -> None:
+ logging.info(f"Saving to file {file_path} separator='{sep}'")
+ df.to_csv(file_path, sep=sep, index=False)
+
+
+def append_batch_file(
+ batch_file_path: str,
+ target_file_path: str,
+ include_header: bool,
+ truncate_target_file: bool,
+) -> None:
+ logging.info(
+ f"Appending file {batch_file_path} to file {target_file_path} with include_header={include_header} and truncate_target_file={truncate_target_file}"
+ )
+ data_file = open(batch_file_path, "r")
+ if truncate_target_file:
+ target_file = open(target_file_path, "w+").close()
+ target_file = open(target_file_path, "a+")
+ if not include_header:
+ logging.info(
+ f"Appending batch file {batch_file_path} to {target_file_path} without header"
+ )
+ next(data_file)
+ else:
+ logging.info(
+ f"Appending batch file {batch_file_path} to {target_file_path} with header"
+ )
+ target_file.write(data_file.read())
+ data_file.close()
+ target_file.close()
+ if os.path.exists(batch_file_path):
+ os.remove(batch_file_path)
+
+
+def upload_file_to_gcs(
+ file_path: pathlib.Path, target_gcs_bucket: str, target_gcs_path: str
+) -> None:
+ if os.path.exists(file_path):
+ logging.info(
+ f"Uploading output file {file_path} to gs://{target_gcs_bucket}/{target_gcs_path}"
+ )
+ storage_client = storage.Client()
+ bucket = storage_client.bucket(target_gcs_bucket)
+ blob = bucket.blob(target_gcs_path)
+ blob.upload_from_filename(file_path)
+ else:
+ logging.info(
+ f"Cannot upload file {file_path} to gs://{target_gcs_bucket}/{target_gcs_path} as it does not exist."
+ )
+
+
+if __name__ == "__main__":
+ logging.getLogger().setLevel(logging.INFO)
+
+ main(
+ source_url=os.environ["SOURCE_URL"],
+ source_file=pathlib.Path(os.environ["SOURCE_FILE"]).expanduser(),
+ target_file=pathlib.Path(os.environ["TARGET_FILE"]).expanduser(),
+ project_id=os.environ["PROJECT_ID"],
+ dataset_id=os.environ["DATASET_ID"],
+ table_id=os.environ["TABLE_ID"],
+ data_file_year_field=os.environ["DATA_FILE_YEAR_FIELD"],
+ data_file_month_field=os.environ["DATA_FILE_MONTH_FIELD"],
+ schema_path=os.environ["SCHEMA_PATH"],
+ chunksize=os.environ["CHUNKSIZE"],
+ target_gcs_bucket=os.environ["TARGET_GCS_BUCKET"],
+ target_gcs_path=os.environ["TARGET_GCS_PATH"],
+ pipeline_name=os.environ["PIPELINE_NAME"],
+ input_headers=json.loads(os.environ["INPUT_CSV_HEADERS"]),
+ data_dtypes=json.loads(os.environ["DATA_DTYPES"]),
+ output_headers=json.loads(os.environ["OUTPUT_CSV_HEADERS"]),
+ )
diff --git a/datasets/new_york_taxi_trips/pipelines/_images/run_csv_transform_kub/green_trips_schema.json b/datasets/new_york_taxi_trips/pipelines/_images/run_csv_transform_kub/green_trips_schema.json
new file mode 100644
index 000000000..562864dac
--- /dev/null
+++ b/datasets/new_york_taxi_trips/pipelines/_images/run_csv_transform_kub/green_trips_schema.json
@@ -0,0 +1,140 @@
+[
+ {
+ "name": "vendor_id",
+ "type": "STRING",
+ "description": "A code indicating the LPEP provider that provided the record. 1= Creative Mobile Technologies, LLC; 2= VeriFone Inc.",
+ "mode": "REQUIRED"
+ },
+ {
+ "name": "pickup_datetime",
+ "type": "TIMESTAMP",
+ "description": "The date and time when the meter was engaged",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "dropoff_datetime",
+ "type": "TIMESTAMP",
+ "description": "The date and time when the meter was disengaged",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "store_and_fwd_flag",
+ "type": "STRING",
+ "description": "This flag indicates whether the trip record was held in vehicle memory before sending to the vendor, aka 'store and forward,' because the vehicle did not have a connection to the server. Y= store and forward trip N= not a store and forward trip",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "rate_code",
+ "type": "STRING",
+ "description": "The final rate code in effect at the end of the trip. 1= Standard rate 2=JFK 3=Newark 4=Nassau or Westchester 5=Negotiated fare 6=Group ride",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "passenger_count",
+ "type": "INTEGER",
+ "description": "The number of passengers in the vehicle. This is a driver-entered value.",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "trip_distance",
+ "type": "NUMERIC",
+ "description": "The elapsed trip distance in miles reported by the taximeter.",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "fare_amount",
+ "type": "NUMERIC",
+ "description": "The time-and-distance fare calculated by the meter",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "extra",
+ "type": "NUMERIC",
+ "description": "Miscellaneous extras and surcharges. Currently, this only includes the $0.50 and $1 rush hour and overnight charges",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "mta_tax",
+ "type": "NUMERIC",
+ "description": "$0.50 MTA tax that is automatically triggered based on the metered rate in use",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "tip_amount",
+ "type": "NUMERIC",
+ "description": "Tip amount. This field is automatically populated for credit card tips. Cash tips are not included.",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "tolls_amount",
+ "type": "NUMERIC",
+ "description": "Total amount of all tolls paid in trip.",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "ehail_fee",
+ "type": "NUMERIC",
+ "description": "",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "total_amount",
+ "type": "NUMERIC",
+ "description": "The total amount charged to passengers. Does not include cash tips.",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "payment_type",
+        "type": "STRING",
+ "description": "A numeric code signifying how the passenger paid for the trip. 1= Credit card 2= Cash 3= No charge 4= Dispute 5= Unknown 6= Voided trip",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "distance_between_service",
+ "type": "NUMERIC",
+ "description": "",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "time_between_service",
+ "type": "INTEGER",
+ "description": "",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "trip_type",
+ "type": "STRING",
+ "description": "A code indicating whether the trip was a street-hail or a dispatch that is automatically assigned based on the metered rate in use but can be altered by the driver. 1= Street-hail 2= Dispatch",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "imp_surcharge",
+ "type": "NUMERIC",
+ "description": "$0.30 improvement surcharge assessed on hailed trips at the flag drop. The improvement surcharge began being levied in 2015.",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "pickup_location_id",
+ "type": "STRING",
+ "description": "TLC Taxi Zone in which the taximeter was engaged",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "dropoff_location_id",
+ "type": "STRING",
+ "description": "TLC Taxi Zone in which the taximeter was disengaged",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "data_file_year",
+ "type": "INTEGER",
+ "description": "Datafile timestamp year value",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "data_file_month",
+ "type": "INTEGER",
+ "description": "Datafile timestamp month value",
+ "mode": "NULLABLE"
+ }
+]
diff --git a/datasets/new_york_taxi_trips/pipelines/_images/run_csv_transform_kub/requirements.txt b/datasets/new_york_taxi_trips/pipelines/_images/run_csv_transform_kub/requirements.txt
new file mode 100644
index 000000000..f87f393f3
--- /dev/null
+++ b/datasets/new_york_taxi_trips/pipelines/_images/run_csv_transform_kub/requirements.txt
@@ -0,0 +1,4 @@
+google-cloud-storage
+google-cloud-bigquery
+pandas
+requests
diff --git a/datasets/new_york_taxi_trips/pipelines/_images/run_csv_transform_kub/yellow_trips_schema.json b/datasets/new_york_taxi_trips/pipelines/_images/run_csv_transform_kub/yellow_trips_schema.json
new file mode 100644
index 000000000..da5466cba
--- /dev/null
+++ b/datasets/new_york_taxi_trips/pipelines/_images/run_csv_transform_kub/yellow_trips_schema.json
@@ -0,0 +1,116 @@
+[
+ {
+ "name": "vendor_id",
+ "type": "STRING",
+ "description": "A code indicating the LPEP provider that provided the record. 1= Creative Mobile Technologies, LLC; 2= VeriFone Inc.",
+ "mode": "REQUIRED"
+ },
+ {
+ "name": "pickup_datetime",
+ "type": "TIMESTAMP",
+ "description": "The date and time when the meter was engaged",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "dropoff_datetime",
+ "type": "TIMESTAMP",
+ "description": "The date and time when the meter was disengaged",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "passenger_count",
+ "type": "INTEGER",
+ "description": "The number of passengers in the vehicle. This is a driver-entered value.",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "trip_distance",
+ "type": "NUMERIC",
+ "description": "The elapsed trip distance in miles reported by the taximeter.",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "rate_code",
+ "type": "STRING",
+ "description": "The final rate code in effect at the end of the trip. 1= Standard rate 2=JFK 3=Newark 4=Nassau or Westchester 5=Negotiated fare 6=Group ride",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "store_and_fwd_flag",
+ "type": "STRING",
+ "description": "This flag indicates whether the trip record was held in vehicle memory before sending to the vendor, aka 'store and forward,' because the vehicle did not have a connection to the server. Y= store and forward trip N= not a store and forward trip",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "payment_type",
+        "type": "STRING",
+ "description": "A numeric code signifying how the passenger paid for the trip. 1= Credit card 2= Cash 3= No charge 4= Dispute 5= Unknown 6= Voided trip",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "fare_amount",
+ "type": "NUMERIC",
+ "description": "The time-and-distance fare calculated by the meter",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "extra",
+ "type": "NUMERIC",
+ "description": "Miscellaneous extras and surcharges. Currently, this only includes the $0.50 and $1 rush hour and overnight charges",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "mta_tax",
+ "type": "NUMERIC",
+ "description": "$0.50 MTA tax that is automatically triggered based on the metered rate in use",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "tip_amount",
+ "type": "NUMERIC",
+ "description": "Tip amount. This field is automatically populated for credit card tips. Cash tips are not included.",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "tolls_amount",
+ "type": "NUMERIC",
+ "description": "Total amount of all tolls paid in trip.",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "imp_surcharge",
+ "type": "NUMERIC",
+ "description": "$0.30 improvement surcharge assessed on hailed trips at the flag drop. The improvement surcharge began being levied in 2015.",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "total_amount",
+ "type": "NUMERIC",
+ "description": "The total amount charged to passengers. Does not include cash tips.",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "pickup_location_id",
+ "type": "STRING",
+ "description": "TLC Taxi Zone in which the taximeter was engaged",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "dropoff_location_id",
+ "type": "STRING",
+ "description": "TLC Taxi Zone in which the taximeter was disengaged",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "data_file_year",
+ "type": "INTEGER",
+ "description": "Datafile timestamp year value",
+ "mode": "NULLABLE"
+ },
+ {
+ "name": "data_file_month",
+ "type": "INTEGER",
+ "description": "Datafile timestamp month value",
+ "mode": "NULLABLE"
+ }
+]
diff --git a/datasets/new_york_taxi_trips/pipelines/dataset.yaml b/datasets/new_york_taxi_trips/pipelines/dataset.yaml
new file mode 100644
index 000000000..46da9892e
--- /dev/null
+++ b/datasets/new_york_taxi_trips/pipelines/dataset.yaml
@@ -0,0 +1,25 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+dataset:
+ name: new_york_taxi_trips
+ friendly_name: ~
+ description: ~
+ dataset_sources: ~
+ terms_of_use: ~
+
+resources:
+ - type: bigquery_dataset
+ dataset_id: new_york_taxi_trips
+ description: ~
diff --git a/datasets/new_york_taxi_trips/pipelines/new_york_taxi_trips/new_york_taxi_trips_dag.py b/datasets/new_york_taxi_trips/pipelines/new_york_taxi_trips/new_york_taxi_trips_dag.py
new file mode 100644
index 000000000..500ac9e96
--- /dev/null
+++ b/datasets/new_york_taxi_trips/pipelines/new_york_taxi_trips/new_york_taxi_trips_dag.py
@@ -0,0 +1,125 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from airflow import DAG
+from airflow.providers.google.cloud.operators import kubernetes_engine
+
+default_args = {
+ "owner": "Google",
+ "depends_on_past": False,
+ "start_date": "2021-03-01",
+}
+
+
+with DAG(
+ dag_id="new_york_taxi_trips.new_york_taxi_trips",
+ default_args=default_args,
+ max_active_runs=1,
+ schedule_interval="@daily",
+ catchup=False,
+ default_view="graph",
+) as dag:
+ create_cluster = kubernetes_engine.GKECreateClusterOperator(
+ task_id="create_cluster",
+ project_id="{{ var.value.gcp_project }}",
+ location="us-central1-c",
+ body={
+ "name": "new-york-taxi-trips",
+ "initial_node_count": 2,
+ "network": "{{ var.value.vpc_network }}",
+ "node_config": {
+ "machine_type": "e2-standard-4",
+ "oauth_scopes": [
+ "https://www.googleapis.com/auth/devstorage.read_write",
+ "https://www.googleapis.com/auth/cloud-platform",
+ ],
+ },
+ },
+ )
+
+ # Run CSV transform within kubernetes pod
+ green_trips = kubernetes_engine.GKEStartPodOperator(
+ task_id="green_trips",
+ startup_timeout_seconds=600,
+ name="load_tlc_green_trips",
+ namespace="default",
+ project_id="{{ var.value.gcp_project }}",
+ location="us-central1-c",
+ cluster_name="new-york-taxi-trips",
+ image_pull_policy="Always",
+ image="{{ var.json.new_york_taxi_trips.container_registry.run_csv_transform_kub }}",
+ env_vars={
+ "SOURCE_URL": "{{ var.json.new_york_taxi_trips.container_registry.green_trips_source_url }}",
+ "SOURCE_FILE": "files/data_green_trips.csv",
+ "TARGET_FILE": "files/data_output_green_trips.csv",
+ "PROJECT_ID": "{{ var.value.gcp_project }}",
+ "DATASET_ID": "{{ var.json.new_york_taxi_trips.container_registry.green_trips_dataset_id }}",
+ "TABLE_ID": "{{ var.json.new_york_taxi_trips.container_registry.green_trips_table_id }}",
+ "DATA_FILE_YEAR_FIELD": "data_file_year",
+ "DATA_FILE_MONTH_FIELD": "data_file_month",
+ "SCHEMA_PATH": "{{ var.json.new_york_taxi_trips.container_registry.green_trips_schema_path }}",
+ "CHUNKSIZE": "{{ var.json.new_york_taxi_trips.container_registry.green_trips_chunk_size }}",
+ "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}",
+ "TARGET_GCS_PATH": "{{ var.json.new_york_taxi_trips.container_registry.green_trips_target_gcs_path }}",
+ "PIPELINE_NAME": "tlc_green_trips",
+ "INPUT_CSV_HEADERS": '["vendor_id", "pickup_datetime", "dropoff_datetime", "store_and_fwd_flag", "rate_code",\n "pickup_location_id", "dropoff_location_id", "passenger_count", "trip_distance", "fare_amount",\n "extra", "mta_tax", "tip_amount", "tolls_amount", "ehail_fee",\n "imp_surcharge", "total_amount", "payment_type", "trip_type", "congestion_surcharge" ]',
+ "DATA_DTYPES": '{ "vendor_id": "str",\n "pickup_datetime": "datetime64[ns]",\n "dropoff_datetime": "datetime64[ns]",\n "store_and_fwd_flag": "str",\n "rate_code": "str",\n "pickup_location_id": "str",\n "dropoff_location_id": "str",\n "passenger_count": "str",\n "trip_distance": "float64",\n "fare_amount": "float64",\n "extra": "float64",\n "mta_tax": "float64",\n "tip_amount": "float64",\n "tolls_amount": "float64",\n "ehail_fee": "float64",\n "imp_surcharge": "float64",\n "total_amount": "float64",\n "payment_type": "str",\n "trip_type": "str",\n "congestion_surcharge": "float64" }',
+ "OUTPUT_CSV_HEADERS": '[ "vendor_id", "pickup_datetime", "dropoff_datetime", "store_and_fwd_flag", "rate_code",\n "passenger_count", "trip_distance", "fare_amount", "extra", "mta_tax",\n "tip_amount", "tolls_amount", "ehail_fee", "total_amount", "payment_type",\n "distance_between_service", "time_between_service", "trip_type", "imp_surcharge", "pickup_location_id",\n "dropoff_location_id", "data_file_year", "data_file_month" ]',
+ },
+ resources={
+ "request_memory": "12G",
+ "request_cpu": "1",
+ "request_ephemeral_storage": "16G",
+ },
+ )
+
+ # Run CSV transform within kubernetes pod
+ yellow_trips = kubernetes_engine.GKEStartPodOperator(
+ task_id="yellow_trips",
+ startup_timeout_seconds=600,
+ name="load_tlc_yellow_trips",
+ namespace="default",
+ project_id="{{ var.value.gcp_project }}",
+ location="us-central1-c",
+ cluster_name="new-york-taxi-trips",
+ image_pull_policy="Always",
+ image="{{ var.json.new_york_taxi_trips.container_registry.run_csv_transform_kub }}",
+ env_vars={
+ "SOURCE_URL": "{{ var.json.new_york_taxi_trips.container_registry.yellow_trips_source_url }}",
+ "SOURCE_FILE": "files/data_yellow_trips.csv",
+ "TARGET_FILE": "files/data_output_yellow_trips.csv",
+ "DATA_FILE_YEAR_FIELD": "data_file_year",
+ "DATA_FILE_MONTH_FIELD": "data_file_month",
+ "PROJECT_ID": "{{ var.value.gcp_project }}",
+ "DATASET_ID": "{{ var.json.new_york_taxi_trips.container_registry.yellow_trips_dataset_id }}",
+ "TABLE_ID": "{{ var.json.new_york_taxi_trips.container_registry.yellow_trips_table_id }}",
+ "SCHEMA_PATH": "{{ var.json.new_york_taxi_trips.container_registry.yellow_trips_schema_path }}",
+ "CHUNKSIZE": "{{ var.json.new_york_taxi_trips.container_registry.yellow_trips_chunk_size }}",
+ "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}",
+ "TARGET_GCS_PATH": "{{ var.json.new_york_taxi_trips.container_registry.yellow_trips_target_gcs_path }}",
+ "PIPELINE_NAME": "tlc_yellow_trips",
+ "INPUT_CSV_HEADERS": '[ "vendor_id", "pickup_datetime", "dropoff_datetime", "passenger_count", "trip_distance",\n "rate_code", "store_and_fwd_flag", "pickup_location_id", "dropoff_location_id",\n "payment_type", "fare_amount", "extra", "mta_tax", "tip_amount",\n "tolls_amount", "imp_surcharge", "total_amount", "congestion_surcharge" ]',
+ "DATA_DTYPES": '{ "vendor_id": "str",\n "pickup_datetime": "datetime64[ns]",\n "dropoff_datetime": "datetime64[ns]",\n "passenger_count": "str",\n "trip_distance": "float64",\n "rate_code": "str",\n "store_and_fwd_flag": "str",\n "pickup_location_id": "str",\n "dropoff_location_id": "str",\n "payment_type": "str",\n "fare_amount": "float64",\n "extra": "float64",\n "mta_tax": "float64",\n "tip_amount": "float64",\n "tolls_amount": "float64",\n "imp_surcharge": "float64",\n "total_amount": "float64",\n "congestion_surcharge": "float64" }',
+ "OUTPUT_CSV_HEADERS": '[ "vendor_id", "pickup_datetime", "dropoff_datetime", "passenger_count", "trip_distance",\n "rate_code", "store_and_fwd_flag", "payment_type", "fare_amount", "extra",\n "mta_tax", "tip_amount", "tolls_amount", "imp_surcharge", "total_amount",\n "pickup_location_id", "dropoff_location_id", "data_file_year", "data_file_month" ]',
+ },
+ )
+ delete_cluster = kubernetes_engine.GKEDeleteClusterOperator(
+ task_id="delete_cluster",
+ project_id="{{ var.value.gcp_project }}",
+ location="us-central1-c",
+ name="new-york-taxi-trips",
+ )
+
+ create_cluster >> [green_trips, yellow_trips] >> delete_cluster
diff --git a/datasets/new_york_taxi_trips/pipelines/new_york_taxi_trips/pipeline.yaml b/datasets/new_york_taxi_trips/pipelines/new_york_taxi_trips/pipeline.yaml
new file mode 100644
index 000000000..4ee775731
--- /dev/null
+++ b/datasets/new_york_taxi_trips/pipelines/new_york_taxi_trips/pipeline.yaml
@@ -0,0 +1,175 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+---
+resources:
+ - type: bigquery_table
+ table_id: "tlc_green_trips"
+ description: "New York green taxi trips table"
+ - type: bigquery_table
+ table_id: "tlc_yellow_trips"
+ description: "New York yellow taxi trips table"
+
+dag:
+ airflow_version: 2
+ initialize:
+ dag_id: new_york_taxi_trips
+ default_args:
+ owner: "Google"
+ depends_on_past: False
+ start_date: "2021-03-01"
+ max_active_runs: 1
+ schedule_interval: "@daily"
+ catchup: False
+ default_view: graph
+ tasks:
+ - operator: "GKECreateClusterOperator"
+ args:
+ task_id: "create_cluster"
+ project_id: "{{ var.value.gcp_project }}"
+ location: "us-central1-c"
+ body:
+ name: new-york-taxi-trips
+ initial_node_count: 2
+ network: "{{ var.value.vpc_network }}"
+ node_config:
+ machine_type: e2-standard-4
+ oauth_scopes:
+ - https://www.googleapis.com/auth/devstorage.read_write
+ - https://www.googleapis.com/auth/cloud-platform
+ - operator: "GKEStartPodOperator"
+      description: "Run CSV transform within Kubernetes pod"
+ args:
+ task_id: "green_trips"
+ startup_timeout_seconds: 600
+ name: "load_tlc_green_trips"
+ namespace: "default"
+ project_id: "{{ var.value.gcp_project }}"
+ location: "us-central1-c"
+ cluster_name: new-york-taxi-trips
+ image_pull_policy: "Always"
+ image: "{{ var.json.new_york_taxi_trips.container_registry.run_csv_transform_kub }}"
+ env_vars:
+ SOURCE_URL: "{{ var.json.new_york_taxi_trips.container_registry.green_trips_source_url }}"
+ SOURCE_FILE: "files/data_green_trips.csv"
+ TARGET_FILE: "files/data_output_green_trips.csv"
+ PROJECT_ID: "{{ var.value.gcp_project }}"
+ DATASET_ID: "{{ var.json.new_york_taxi_trips.container_registry.green_trips_dataset_id }}"
+ TABLE_ID: "{{ var.json.new_york_taxi_trips.container_registry.green_trips_table_id }}"
+ DATA_FILE_YEAR_FIELD: "data_file_year"
+ DATA_FILE_MONTH_FIELD: "data_file_month"
+ SCHEMA_PATH: "{{ var.json.new_york_taxi_trips.container_registry.green_trips_schema_path }}"
+ CHUNKSIZE: "{{ var.json.new_york_taxi_trips.container_registry.green_trips_chunk_size }}"
+ TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}"
+ TARGET_GCS_PATH: "{{ var.json.new_york_taxi_trips.container_registry.green_trips_target_gcs_path }}"
+ PIPELINE_NAME: "tlc_green_trips"
+ INPUT_CSV_HEADERS: >-
+            [ "vendor_id", "pickup_datetime", "dropoff_datetime", "store_and_fwd_flag", "rate_code",
+ "pickup_location_id", "dropoff_location_id", "passenger_count", "trip_distance", "fare_amount",
+ "extra", "mta_tax", "tip_amount", "tolls_amount", "ehail_fee",
+ "imp_surcharge", "total_amount", "payment_type", "trip_type", "congestion_surcharge" ]
+ DATA_DTYPES: >-
+ { "vendor_id": "str",
+ "pickup_datetime": "datetime64[ns]",
+ "dropoff_datetime": "datetime64[ns]",
+ "store_and_fwd_flag": "str",
+ "rate_code": "str",
+ "pickup_location_id": "str",
+ "dropoff_location_id": "str",
+ "passenger_count": "str",
+ "trip_distance": "float64",
+ "fare_amount": "float64",
+ "extra": "float64",
+ "mta_tax": "float64",
+ "tip_amount": "float64",
+ "tolls_amount": "float64",
+ "ehail_fee": "float64",
+ "imp_surcharge": "float64",
+ "total_amount": "float64",
+ "payment_type": "str",
+ "trip_type": "str",
+ "congestion_surcharge": "float64" }
+ OUTPUT_CSV_HEADERS: >-
+ [ "vendor_id", "pickup_datetime", "dropoff_datetime", "store_and_fwd_flag", "rate_code",
+ "passenger_count", "trip_distance", "fare_amount", "extra", "mta_tax",
+ "tip_amount", "tolls_amount", "ehail_fee", "total_amount", "payment_type",
+ "distance_between_service", "time_between_service", "trip_type", "imp_surcharge", "pickup_location_id",
+ "dropoff_location_id", "data_file_year", "data_file_month" ]
+ resources:
+ request_memory: "12G"
+ request_cpu: "1"
+ request_ephemeral_storage: "16G"
+ - operator: "GKEStartPodOperator"
+      description: "Run CSV transform within Kubernetes pod"
+ args:
+ task_id: "yellow_trips"
+ startup_timeout_seconds: 600
+ name: "load_tlc_yellow_trips"
+ namespace: "default"
+ project_id: "{{ var.value.gcp_project }}"
+ location: "us-central1-c"
+ cluster_name: new-york-taxi-trips
+ image_pull_policy: "Always"
+ image: "{{ var.json.new_york_taxi_trips.container_registry.run_csv_transform_kub }}"
+ env_vars:
+ SOURCE_URL: "{{ var.json.new_york_taxi_trips.container_registry.yellow_trips_source_url }}"
+ SOURCE_FILE: "files/data_yellow_trips.csv"
+ TARGET_FILE: "files/data_output_yellow_trips.csv"
+ DATA_FILE_YEAR_FIELD: "data_file_year"
+ DATA_FILE_MONTH_FIELD: "data_file_month"
+ PROJECT_ID: "{{ var.value.gcp_project }}"
+ DATASET_ID: "{{ var.json.new_york_taxi_trips.container_registry.yellow_trips_dataset_id }}"
+ TABLE_ID: "{{ var.json.new_york_taxi_trips.container_registry.yellow_trips_table_id }}"
+ SCHEMA_PATH: "{{ var.json.new_york_taxi_trips.container_registry.yellow_trips_schema_path }}"
+ CHUNKSIZE: "{{ var.json.new_york_taxi_trips.container_registry.yellow_trips_chunk_size }}"
+ TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}"
+ TARGET_GCS_PATH: "{{ var.json.new_york_taxi_trips.container_registry.yellow_trips_target_gcs_path }}"
+ PIPELINE_NAME: "tlc_yellow_trips"
+ INPUT_CSV_HEADERS: >-
+ [ "vendor_id", "pickup_datetime", "dropoff_datetime", "passenger_count", "trip_distance",
+ "rate_code", "store_and_fwd_flag", "pickup_location_id", "dropoff_location_id",
+ "payment_type", "fare_amount", "extra", "mta_tax", "tip_amount",
+ "tolls_amount", "imp_surcharge", "total_amount", "congestion_surcharge" ]
+ DATA_DTYPES: >-
+ { "vendor_id": "str",
+ "pickup_datetime": "datetime64[ns]",
+ "dropoff_datetime": "datetime64[ns]",
+ "passenger_count": "str",
+ "trip_distance": "float64",
+ "rate_code": "str",
+ "store_and_fwd_flag": "str",
+ "pickup_location_id": "str",
+ "dropoff_location_id": "str",
+ "payment_type": "str",
+ "fare_amount": "float64",
+ "extra": "float64",
+ "mta_tax": "float64",
+ "tip_amount": "float64",
+ "tolls_amount": "float64",
+ "imp_surcharge": "float64",
+ "total_amount": "float64",
+ "congestion_surcharge": "float64" }
+ OUTPUT_CSV_HEADERS: >-
+ [ "vendor_id", "pickup_datetime", "dropoff_datetime", "passenger_count", "trip_distance",
+ "rate_code", "store_and_fwd_flag", "payment_type", "fare_amount", "extra",
+ "mta_tax", "tip_amount", "tolls_amount", "imp_surcharge", "total_amount",
+ "pickup_location_id", "dropoff_location_id", "data_file_year", "data_file_month" ]
+ - operator: "GKEDeleteClusterOperator"
+ args:
+ task_id: "delete_cluster"
+ project_id: "{{ var.value.gcp_project }}"
+ location: "us-central1-c"
+ name: new-york-taxi-trips
+ graph_paths:
+ - "create_cluster >> [ green_trips, yellow_trips ] >> delete_cluster"