In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Exploratory Data Analysis with R and BigQuery

**Author**:  [Alok Pattani](https://github.com/alokpattani)

**Last Updated**:  July 2024

## Overview

This notebook illustrates how to perform exploratory data analysis (EDA) using [R](https://www.r-project.org/about.html) on data extracted from [BigQuery](https://cloud.google.com/bigquery). After you analyze and process the data, the transformed data is stored in [Cloud Storage](https://cloud.google.com/storage) for further machine learning (ML) tasks.

R is one of the most widely used programming languages for statistical modeling. It has a large and active community of data scientists and machine learning (ML) professionals. With more than 20,000 packages in the open-source repository of [CRAN](https://cran.r-project.org/), R has tools for all statistical data analysis applications, ML, and visualization.

## Dataset
The dataset used in this tutorial is the BigQuery public New York City yellow taxi trips dataset for 2022 (`bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2022`). Each row describes a single taxi trip, including its pickup and drop-off times, passenger count, trip distance, rate code, and fare amount. The dataset is available [here](https://console.cloud.google.com/bigquery?p=bigquery-public-data&d=new_york_taxi_trips&t=tlc_yellow_trips_2022&page=table).

In this notebook, we focus on exploratory data analysis and visualization using R and BigQuery, with an eye toward a potential machine learning goal of predicting a taxi trip's fare amount given a number of factors about the trip.

## Objective
The goal of this tutorial is to:
1. Query and analyze data from BigQuery using the [bigrquery](https://cran.r-project.org/web/packages/bigrquery/index.html) R library.
2. Prepare and store data for ML in Cloud Storage.

## Costs
This tutorial uses the following billable components of Google Cloud:
1. [BigQuery](https://cloud.google.com/bigquery/pricing)
2. [Cloud Storage](https://cloud.google.com/storage/pricing)
3. [Vertex AI Workbench Instances](https://cloud.google.com/vertex-ai/pricing#notebooks) (if running this notebook there)

Use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.


## 0. Setup 

Check the version of R being run.

In [None]:
# Display R's built-in `version` object for the running session
version

Install necessary R packages if not already available in the current session.

In [None]:
# Packages this notebook depends on
needed_packages <- c("dplyr", "ggplot2", "bigrquery")

# Names of every package found in the current library paths
installed_packages <- .packages(all.available = TRUE)

# Required packages that are not installed yet
missing_packages <- setdiff(needed_packages, installed_packages)

# Install only what is missing; nothing happens when everything is present
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

In [None]:
# Attach every required package; character.only = TRUE makes library()
# treat `pkg` as a string holding the package name
lapply(needed_packages, function(pkg) library(pkg, character.only = TRUE))

Authenticate to BigQuery using the out-of-band (OOB) authentication flow.

In [None]:
# Authenticate bigrquery, requesting the out-of-band (OOB) flow via use_oob = TRUE
bq_auth(use_oob = TRUE)

Set a variable to the name of the project that you want to use for this tutorial.

In [None]:
# Set the project ID: replace the placeholder with your own Google Cloud project ID.
# This value is passed to bq_project_query() for every query in this notebook.
PROJECT_ID <- "[YOUR-PROJECT-ID]"

Set a variable to the name of the Cloud Storage bucket that you want to use later to store the output data. The name must be globally unique.

In [None]:
# Set your Cloud Storage bucket name: replace the placeholder with a bucket name.
# It is used to build the gs:// URLs in the upload step at the end of the notebook.
BUCKET_NAME <- "[YOUR-BUCKET-NAME]"

In [None]:
# Set default height/width for plots generated in this notebook
# (repr.plot.* units are presumably inches — TODO confirm against the repr package docs)
options(repr.plot.height = 9, repr.plot.width = 16)

## 1. Querying Data from BigQuery 

### 1.1. Prepare the BigQuery query

In [None]:
# SQL template that samples filtered trips from the public 2022 yellow-taxi
# table. It selects the trip duration in minutes, passenger count, rounded trip
# distance, rate code plus a readable rate type, fare amount, and a hashed `key`
# built from trip distance and fare amount.
#
# The single placeholder is the row limit. Fill it with
#   sprintf(sql_query_template, <whole number>)
# `%d` is used rather than `%s` because sprintf("%s", 100000) renders the
# number as "1e+05", which is not a valid integer for a LIMIT clause. `%d`
# always writes plain digits and raises an error for non-whole numbers.
sql_query_template <- "
    SELECT
      TIMESTAMP_DIFF(dropoff_datetime, pickup_datetime, MINUTE) AS trip_time_minutes,  

      passenger_count,

      ROUND(trip_distance, 1) AS trip_distance_miles,

      rate_code,
      /* Mapping from rate code to type from description column in BQ table schema */
      (CASE 
        WHEN rate_code = '1.0'
          THEN 'Standard rate'
        WHEN rate_code = '2.0'
          THEN 'JFK'
        WHEN rate_code = '3.0'
          THEN 'Newark'
        WHEN rate_code = '4.0'
          THEN 'Nassau or Westchester'
        WHEN rate_code = '5.0'
          THEN 'Negotiated fare'
        WHEN rate_code = '6.0'
          THEN 'Group ride'
        /* Several NULL AND some '99.0' values go here */
        ELSE 'Unknown'
        END)
        AS rate_type,

      fare_amount,

      CAST(ABS(FARM_FINGERPRINT(
        CONCAT(
          CAST(trip_distance AS STRING), 
          CAST(fare_amount AS STRING)
          )
        ))
        AS STRING)
        AS key

    FROM
      `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2022`

    /* Filter out some outlier or hard to understand values */
    WHERE
      (TIMESTAMP_DIFF(dropoff_datetime, pickup_datetime, MINUTE)
        BETWEEN 0.01 AND 120)
      AND
      (passenger_count BETWEEN 1 AND 10)
      AND
      (trip_distance BETWEEN 0.01 AND 100)
      AND
      (fare_amount BETWEEN 0.01 AND 250)

    LIMIT %d
"

### 1.2. Execute the query 
The data will be retrieved from BigQuery, and the results will be stored in an in-memory [tibble](https://tibble.tidyverse.org/) (like a data frame).

In [None]:
# Number of rows to request from BigQuery
sample_size <- 10000

# Substitute the row limit into the query template
sql_query <- sprintf(sql_query_template, sample_size)

# Run the query in the configured project, then download its result table
query_job <- bq_project_query(PROJECT_ID, query = sql_query)
taxi_trip_data <- bq_table_download(query_job)

### 1.3. View the query results

In [None]:
# Preview the first rows of the downloaded sample (head() shows six rows by default)
head(taxi_trip_data)

In [None]:
# Show the number of rows and the data type of each column
str(taxi_trip_data)

In [None]:
# Print a per-column summary of the downloaded sample
summary(taxi_trip_data)

## 2. Visualizing retrieved data

In [None]:
# Histogram of fare amounts across the sampled trips, using 100 bins
ggplot(taxi_trip_data, aes(fare_amount)) +
    geom_histogram(bins = 100)

In [None]:
# Scatter plot of fare amount against trip distance, overlaid with a
# linear-model ("lm") smoothing line
ggplot(taxi_trip_data, aes(trip_distance_miles, fare_amount)) +
    geom_point() +
    geom_smooth(method = "lm")

### Performing the processing in BigQuery
Create a function that finds the number of trips and the average fare amount for each value of the chosen column.

In [None]:
# Aggregate the filtered 2022 yellow-taxi trips by one grouping expression.
#
# Args:
#   column: SQL expression (optionally followed by `AS alias`) that is placed
#     first in the SELECT list and grouped on through `GROUP BY 1`.
#
# Returns:
#   The downloaded query result, with one row per distinct value of `column`
#   and the columns `num_trips` and `avg_fare_amount`.
get_distinct_value_aggregates <- function(column) {
    # Everything that follows the grouping expression in the SELECT statement:
    # the two aggregates, the source table, the trip filters, and the GROUP BY
    query_tail <- ', 
          COUNT(1) AS num_trips,
          AVG(fare_amount) AS avg_fare_amount
        
        FROM
          `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2022`
        
        WHERE
          (TIMESTAMP_DIFF(dropoff_datetime, pickup_datetime, MINUTE) 
            BETWEEN 0.01 AND 120)
          AND
          (passenger_count BETWEEN 1 AND 10)
          AND
          (trip_distance BETWEEN 0.01 AND 100)
          AND
          (fare_amount BETWEEN 0.01 AND 250)
        
        GROUP BY 1
        '

    aggregate_sql <- paste0('SELECT ', column, query_tail)

    # Run the aggregation in the configured project and fetch its result
    query_job <- bq_project_query(PROJECT_ID, query = aggregate_sql)

    return(bq_table_download(query_job))
}

Apply the function to get distinct values for various columns and plot them to study patterns.

In [None]:
# Trip count and average fare for each trip duration in whole minutes
trip_time_expression <-
  'TIMESTAMP_DIFF(dropoff_datetime, pickup_datetime, MINUTE) AS trip_time_minutes'
df <- get_distinct_value_aggregates(trip_time_expression)

# Number of trips by duration
ggplot(df, aes(trip_time_minutes, num_trips)) +
    geom_line()

# Average fare by duration
ggplot(df, aes(trip_time_minutes, avg_fare_amount)) +
    geom_line()

In [None]:
# Trip count and average fare for each passenger count
df <- get_distinct_value_aggregates('passenger_count')

# Number of trips by passenger count, with one axis tick for each value 1 through 10
ggplot(df, aes(passenger_count, num_trips)) +
    geom_col() +
    scale_x_continuous(breaks = 1:10)

# Average fare by passenger count, using the same axis ticks
ggplot(df, aes(passenger_count, avg_fare_amount)) +
    geom_col() +
    scale_x_continuous(breaks = 1:10)

In [None]:
# Trip count and average fare for each trip distance rounded to the nearest mile
trip_distance_expression <- 'ROUND(trip_distance, 0) AS trip_distance_miles'
df <- get_distinct_value_aggregates(trip_distance_expression)

# Number of trips by rounded distance
ggplot(df, aes(trip_distance_miles, num_trips)) +
    geom_line()

# Average fare by rounded distance
ggplot(df, aes(trip_distance_miles, avg_fare_amount)) +
    geom_line()

In [None]:
# SQL expression that maps each rate_code value to a readable rate_type label
rate_type_expression <- "
  (CASE 
    WHEN rate_code = '1.0'
      THEN 'Standard rate'
    WHEN rate_code = '2.0'
      THEN 'JFK'
    WHEN rate_code = '3.0'
      THEN 'Newark'
    WHEN rate_code = '4.0'
      THEN 'Nassau or Westchester'
    WHEN rate_code = '5.0'
      THEN 'Negotiated fare'
    WHEN rate_code = '6.0'
      THEN 'Group ride'
    /* Several NULL AND some '99.0' values go here */
    ELSE 'Unknown'
    END)
    AS rate_type
  "

# Trip count and average fare for each rate type
df <- get_distinct_value_aggregates(rate_type_expression)

# Number of trips by rate type
ggplot(df, aes(rate_type, num_trips)) +
    geom_col()

# Average fare by rate type
ggplot(df, aes(rate_type, avg_fare_amount)) +
    geom_col()

## 3. Saving the data as CSVs to Cloud Storage

In [None]:
# Prepare training and evaluation data from BigQuery
sample_size <- 10000

sql_query <- sprintf(sql_query_template, sample_size)

# Split data into 75% training, 25% evaluation using the hashed `key` column.
# MOD(key, 100) yields buckets 0-99 (key is built from ABS(...), so it is
# non-negative). Buckets 0-74 are 75 of the 100 buckets and go to training;
# buckets 75-99 are the remaining 25 and go to evaluation. The previous
# `<= 75` / `> 75` bounds produced a 76/24 split.
# NOTE(review): the LIMIT sits inside the subquery, so each query filters its
# own sample of up to `sample_size` rows; without an ORDER BY the two samples
# are presumably not guaranteed to be the same rows, so the two counts need
# not add up to `sample_size`.
train_query <- paste('SELECT * FROM (', sql_query, 
  ') WHERE MOD(CAST(key AS INT64), 100) < 75')
eval_query <- paste('SELECT * FROM (', sql_query,
  ') WHERE MOD(CAST(key AS INT64), 100) >= 75')

# Load training data to data frame
train_data <- bq_table_download(
    bq_project_query(
        PROJECT_ID, 
        query = train_query
    )
)

# Load evaluation data to data frame
eval_data <- bq_table_download(
    bq_project_query(
        PROJECT_ID, 
        query = eval_query
    )
)

In [None]:
# Report how many rows ended up in each split
train_count <- nrow(train_data)
eval_count <- nrow(eval_data)

print(paste0("Training instances count: ", train_count))
print(paste0("Evaluation instances count: ", eval_count))

In [None]:
# Make sure the local output directory exists (no warning if it already does)
output_dir <- file.path('data')
dir.create(output_dir, showWarnings = FALSE)

# Write the training split as a comma-separated file with a header row
write.table(
    x = train_data,
    file = "data/train_data.csv",
    sep = ",",
    row.names = FALSE,
    col.names = TRUE
)

# Write the evaluation split in the same format
write.table(
    x = eval_data,
    file = "data/eval_data.csv",
    sep = ",",
    row.names = FALSE,
    col.names = TRUE
)

In [None]:
# Upload CSV data to Cloud Storage by passing gsutil commands to system.
# Each exit status is checked so that a failed upload is reported instead of
# being silently ignored, and the bucket URLs are shell-quoted.
gcs_url <- paste0("gs://", BUCKET_NAME, "/")

# Create the bucket. This step is best-effort: a non-zero status is reported
# but does not stop the notebook, since the bucket may already exist.
command <- paste("gsutil mb", shQuote(gcs_url))

mb_status <- system(command)

if (mb_status != 0) {
  message(
    "'gsutil mb' exited with status ", mb_status,
    "; continuing in case the bucket already exists."
  )
}

gcs_data_dir <- paste0("gs://", BUCKET_NAME, "/data")

# The local glob is left unquoted on purpose so the shell expands it
command <- paste("gsutil cp data/*_data.csv", shQuote(gcs_data_dir))

cp_status <- system(command)

if (cp_status != 0) {
  stop(
    "Uploading CSV files to ", gcs_data_dir,
    " failed (gsutil exit status ", cp_status, ")."
  )
}

# List the uploaded objects; intern = TRUE returns the listing as the cell output
command <- paste("gsutil ls -l", shQuote(gcs_data_dir))

system(command, intern = TRUE)