### Load the Data First

In [1]:
from google.cloud import bigquery
import os
# Point GOOGLE_APPLICATION_CREDENTIALS at what looks like a service-account
# key file. The path is relative, so the JSON file has to sit in the
# notebook's working directory for it to be found.
# NOTE(review): presumably bigquery.Client() reads this variable - confirm,
# and keep the key file itself out of version control.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="sql-bigquery-331719-5129dc99895e.json"

In [2]:
# Create a "Client" object; every later cell goes through it to reach BigQuery
client = bigquery.Client()

In [5]:
# Construct a reference to the "openaq" dataset in the "bigquery-public-data"
# project. The reference is built with the DatasetReference constructor
# (project first, then dataset ID) because client.dataset() is deprecated.
dataset_ref = bigquery.DatasetReference("bigquery-public-data", "openaq")

# API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)

In [6]:
# List all the tables in the "openaq" dataset
tables = list(client.list_tables(dataset))

# Print names of all tables in the dataset (the recorded run shows just one)
for table in tables:  
    print(table.table_id)

global_air_quality


In [7]:
# Construct a reference to the "global_air_quality" table
table_ref = dataset_ref.table("global_air_quality")

# API request - fetch the table
table = client.get_table(table_ref)

In [8]:
# Print information on all the columns in the "global_air_quality" table in the "openaq" dataset
table.schema

[SchemaField('location', 'STRING', 'NULLABLE', 'Location where data was measured', (), None),
 SchemaField('city', 'STRING', 'NULLABLE', 'City containing location', (), None),
 SchemaField('country', 'STRING', 'NULLABLE', 'Country containing measurement in 2 letter ISO code', (), None),
 SchemaField('pollutant', 'STRING', 'NULLABLE', 'Name of the Pollutant being measured. Allowed values: PM25, PM10, SO2, NO2, O3, CO, BC', (), None),
 SchemaField('value', 'FLOAT', 'NULLABLE', 'Latest measured value for the pollutant', (), None),
 SchemaField('timestamp', 'TIMESTAMP', 'NULLABLE', 'The datetime at which the pollutant was measured, in ISO 8601 format', (), None),
 SchemaField('unit', 'STRING', 'NULLABLE', 'The unit the value was measured in coded by UCUM Code', (), None),
 SchemaField('source_name', 'STRING', 'NULLABLE', 'Name of the source of the data', (), None),
 SchemaField('latitude', 'FLOAT', 'NULLABLE', 'Latitude in decimal degrees. Precision >3 decimal points.', (), None),
 SchemaF

In [10]:
# Preview up to the first 50 rows of the "global_air_quality" table
client.list_rows(table, max_results=50).to_dataframe()

Unnamed: 0,location,city,country,pollutant,value,timestamp,unit,source_name,latitude,longitude,averaged_over_in_hours
0,"BTM Layout, Bengaluru - KSPCB",Bengaluru,IN,co,910.0,2018-02-22 03:00:00+00:00,µg/m³,CPCB,12.912811,77.60922,0.25
1,"BTM Layout, Bengaluru - KSPCB",Bengaluru,IN,no2,131.87,2018-02-22 03:00:00+00:00,µg/m³,CPCB,12.912811,77.60922,0.25
2,"BTM Layout, Bengaluru - KSPCB",Bengaluru,IN,o3,15.57,2018-02-22 03:00:00+00:00,µg/m³,CPCB,12.912811,77.60922,0.25
3,"BTM Layout, Bengaluru - KSPCB",Bengaluru,IN,pm25,45.62,2018-02-22 03:00:00+00:00,µg/m³,CPCB,12.912811,77.60922,0.25
4,"BTM Layout, Bengaluru - KSPCB",Bengaluru,IN,so2,4.49,2018-02-22 03:00:00+00:00,µg/m³,CPCB,12.912811,77.60922,0.25
5,"BWSSB Kadabesanahalli, Bengaluru - KSPCB",Bengaluru,IN,co,840.0,2018-02-22 03:00:00+00:00,µg/m³,CPCB,12.938906,77.69727,0.25
6,"BWSSB Kadabesanahalli, Bengaluru - KSPCB",Bengaluru,IN,no2,166.55,2018-02-22 03:00:00+00:00,µg/m³,CPCB,12.938906,77.69727,0.25
7,BWSSB Kadabesanahalli,Bengaluru,IN,o3,17.11,2017-02-12 01:45:00+00:00,µg/m³,CPCB,12.938906,77.69727,0.25
8,"BWSSB Kadabesanahalli, Bengaluru - KSPCB",Bengaluru,IN,pm25,40.94,2018-02-22 03:00:00+00:00,µg/m³,CPCB,12.938906,77.69727,0.25
9,"BWSSB Kadabesanahalli, Bengaluru - KSPCB",Bengaluru,IN,so2,6.63,2018-02-22 03:00:00+00:00,µg/m³,CPCB,12.938906,77.69727,0.25


### Write the SQL Queries

In [11]:
# Query to select all the items from the "city" column where the "country" column is 'US'.
# The table is addressed by its full `project.dataset.table` name.
query = """
        SELECT city
        FROM `bigquery-public-data.openaq.global_air_quality`
        WHERE country = 'US'
        """

In [13]:
# Set up the query job from the SQL text currently held in `query`
query_job = client.query(query)

In [14]:
# API request - run the query, and return a pandas DataFrame
# (the frame's "city" column is what the next cell counts)
us_cities = query_job.to_dataframe()

In [15]:
# Which five cities have the most measurements?
# value_counts() tallies rows per city, largest first; head(5) keeps the top five.
us_cities["city"].value_counts().head(5)

Phoenix-Mesa-Scottsdale                     88
Houston                                     82
Los Angeles-Long Beach-Santa Ana            68
Riverside-San Bernardino-Ontario            60
New York-Northern New Jersey-Long Island    60
Name: city, dtype: int64

In [16]:
# Variant of the US filter that returns both the "city" and "country" columns.
# NOTE(review): no cell shown in this notebook passes this version of `query`
# to client.query(); `query` is reassigned before the next call.
query = """
        SELECT city, country
        FROM `bigquery-public-data.openaq.global_air_quality`
        WHERE country = 'US'
        """

In [18]:
# select all columns
query = """
        SELECT *
        FROM `bigquery-public-data.openaq.global_air_quality`
        WHERE country = 'US'
        """

In [20]:
# Query to get the score and title columns from every row where the type column has value "job".
# Note the switch of table: this one reads `hacker_news.full`, not the openaq data used above.
query = """
        SELECT score, title
        FROM `bigquery-public-data.hacker_news.full`
        WHERE type = "job" 
        """

# Create a QueryJobConfig object to estimate size of query without running it
dry_run_config = bigquery.QueryJobConfig(dry_run=True)

# API request - dry run query to estimate costs
dry_run_query_job = client.query(query, job_config=dry_run_config)

# Report the byte count taken from the dry-run job
print("This query will process {} bytes.".format(dry_run_query_job.total_bytes_processed))

This query will process 494748929 bytes.


In [21]:
# Only run the query if it's less than 1 MB
ONE_MB = 1000*1000  # decimal megabyte: 1,000,000 bytes
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=ONE_MB)

# Set up the query (will only run if it's less than 1 MB)
safe_query_job = client.query(query, job_config=safe_config)

# API request - try to run the query, and return a pandas DataFrame.
# In the recorded run this call raised "Query exceeded limit for bytes billed",
# which is the point of the cell: the query needs far more than the 1 MB cap.
safe_query_job.to_dataframe()

InternalServerError: 500 Query exceeded limit for bytes billed: 1000000. 494927872 or higher required.

(job ID: 15064789-0089-47d9-885e-d2fcf13f03cc)

             -----Query Job SQL Follows-----             

    |    .    |    .    |    .    |    .    |    .    |
   1:
   2:        SELECT score, title
   3:        FROM `bigquery-public-data.hacker_news.full`
   4:        WHERE type = "job" 
   5:        
    |    .    |    .    |    .    |    .    |    .    |

In [22]:
# Cap the billable bytes at 1 GB (decimal: 10^9 bytes) so the query is
# refused instead of billed if it would scan more than that
ONE_GB = 10**9
safe_config = bigquery.QueryJobConfig(
    maximum_bytes_billed=ONE_GB,
)

# Set up the query under the 1 GB cap
safe_query_job = client.query(
    query,
    job_config=safe_config,
)

# API request - run the query and collect the rows into a pandas DataFrame
job_post_scores = safe_query_job.to_dataframe()

# Average of the "score" column across the returned job posts
job_post_scores["score"].mean()

1.7841263098272444