# Extract Dataset

### Set up environment variables and load necessary libraries

In [None]:
# Google Cloud project ID; it is exported to the environment and passed to the
# BigQuery client further down in this notebook.
PROJECT = "qwiklabs-gcp-da02053fb2a13c97"  # Replace with your PROJECT

In [None]:
import os

# Export the project ID so that shell commands and child processes started
# from this notebook can read it from the environment.
os.environ.update({"PROJECT": PROJECT})
# When running locally ONLY, also point the client libraries at a key file:
#os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = 'service_account_key.json'

In [None]:
def create_query(phase, sample_size):
    """Build the BigQuery SQL that extracts one split of the taxi dataset.

    The query selects the fare, the pickup day-of-week and hour, the pickup
    and dropoff coordinates and ``trips_last_5min`` from `taxifare.traffic`,
    keeping only rows that pass the distance, fare, coordinate-range and
    passenger-count filters.

    Sampling and splitting are both driven by a hash of ``pickup_datetime``:
    the base query keeps rows whose hash modulo EVERY_N has absolute value 1,
    and the split clause then keeps rows whose hash modulo (EVERY_N * 100)
    falls inside the half-open bucket range owned by the requested phase.
    Because the hash is deterministic, the same row always lands in the same
    split.

    Args:
        phase: Which split to extract; one of "TRAIN", "VALID" or "TEST"
            (case-sensitive).
        sample_size: The value substituted for every EVERY_N placeholder.
            May be a string (e.g. "5000") or an int (e.g. 5000).

    Returns:
        The complete SQL query string with all EVERY_N placeholders replaced.

    Raises:
        ValueError: If ``phase`` is not one of the recognized split names.
    """
    # Bucket range [lower, upper) for each split, in units of EVERY_N out of
    # EVERY_N * 100 buckets: 70% train, 15% validation, 15% test.
    split_bounds = {
        "TRAIN": (0, 70),
        "VALID": (70, 85),
        "TEST": (85, 100),
    }
    if phase not in split_bounds:
        # Previously this fell through and failed later with an
        # UnboundLocalError on `subsample`; fail early with a clear message.
        raise ValueError(
            "Unknown phase {!r}; expected one of {}".format(
                phase, sorted(split_bounds)
            )
        )

    basequery = """
    SELECT
        fare_amount,
        EXTRACT(DAYOFWEEK from pickup_datetime) AS dayofweek,
        EXTRACT(HOUR from pickup_datetime) AS hourofday,
        pickuplon,
        pickuplat,
        dropofflon,
        dropofflat,
        trips_last_5min
    FROM
        `taxifare.traffic`
    WHERE
        trip_distance > 0
        AND fare_amount >= 2.5
        AND pickuplon > -78
        AND pickuplon < -70
        AND dropofflon > -78
        AND dropofflon < -70
        AND pickuplat > 37
        AND pickuplat < 45
        AND dropofflat > 37
        AND dropofflat < 45
        AND passenger_count > 0
        AND ABS(MOD(FARM_FINGERPRINT(CAST(pickup_datetime AS STRING)), EVERY_N)) = 1
    """

    # One template serves all three splits; only the bucket bounds differ.
    subsample_template = """
        AND ABS(MOD(FARM_FINGERPRINT(CAST(pickup_datetime AS STRING)), EVERY_N * 100)) >= (EVERY_N * {lower})
        AND ABS(MOD(FARM_FINGERPRINT(CAST(pickup_datetime AS STRING)), EVERY_N * 100)) <  (EVERY_N * {upper})
        """
    lower, upper = split_bounds[phase]
    subsample = subsample_template.format(lower=lower, upper=upper)

    query = basequery + subsample
    # str() lets callers pass the sampling divisor as an int as well as a str.
    return query.replace("EVERY_N", str(sample_size))

## Write to CSV

In [None]:
from google.cloud import bigquery

# A single client is reused for all three extraction queries.
bq = bigquery.Client(project=PROJECT)

for phase in ("TRAIN", "VALID", "TEST"):
    # Each split goes to its own file, named after the lower-cased phase.
    csv_name = "taxi-{}.csv".format(phase.lower())

    # Build the SQL for this split, run it and pull the rows into a DataFrame.
    query_string = create_query(phase, "5000")
    query_job = bq.query(query_string)
    df = query_job.to_dataframe()

    # Write the rows without the DataFrame index column, then report the count.
    df.to_csv(csv_name, index_label=False, index=False)
    print("Wrote {} lines to {}".format(len(df), csv_name))

Note that even with a 1/5000th sample we have a good amount of data for ML: 150K training examples and 30K validation examples.

### Verify that datasets exist 

In [None]:
# Shell command: long-format listing of the CSV files in the working directory.
!ls -l *.csv

### Preview one of the files

In [None]:
# Shell command: print the first lines of the training CSV.
!head taxi-train.csv