In [15]:
# !pip install --upgrade google-cloud-BigQuery
# !pip install --user --upgrade google-api-python-client
# !pip install --upgrade pandas-gbq

# If you hit "ImportError: IProgress not found. Please update jupyter and ipywidgets.", run the commands below.

# !pip install --upgrade jupyter
# !pip install --upgrade ipywidgets
# !jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: ok


In [1]:
from google.cloud import bigquery
import os

In [2]:
# Point Application Default Credentials at the service-account key file
# (path is relative to the kernel's working directory).  setdefault() keeps a
# value that is already exported in the environment, so the notebook also runs
# on machines where the key file lives somewhere else.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "letsgo-snappy-boulder-378707-4b7d46801fd1.json",
)

# Construct a BigQuery client object; called without arguments it resolves
# credentials from the environment configured above.
client = bigquery.Client()

### Load Track Clear Info 2017-2021

In [5]:
# Create the dataset that holds every TrackClearInfo table.
dataset_id = "snappy-boulder-378707.TrackClearInfo"
dataset = bigquery.Dataset(dataset_id)
dataset.location = "US"
# exists_ok=True makes the cell safe to re-run: an already existing dataset is
# returned instead of raising google.api_core.exceptions.Conflict.
dataset = client.create_dataset(dataset, exists_ok=True, timeout=30)  # Make an API request.
print("Created dataset {}.{}".format(client.project, dataset.dataset_id))

Created dataset snappy-boulder-378707.TrackClearInfo


In [10]:
# Years covered by the source data; one BigQuery table is created per year.
years = [2017, 2018, 2019, 2020, 2021]

# Column layout shared by all yearly tables (and reused by the load cell).
# NOTE(review): "unnamed" is presumably the index column written by
# pandas.DataFrame.to_csv -- confirm against the CSV files.
schema = [
    bigquery.SchemaField("unnamed", "INTEGER"),
    bigquery.SchemaField("id", "STRING"),
    bigquery.SchemaField("name", "STRING"),
    bigquery.SchemaField("popularity", "INTEGER"),
    bigquery.SchemaField("explicit", "INTEGER"),
    bigquery.SchemaField("available_markets", "INTEGER"),
]

for year in years:
    table_id = "snappy-boulder-378707.TrackClearInfo.TrackClearInfo{}".format(year)
    table = bigquery.Table(table_id, schema=schema)
    # exists_ok=True: re-running the cell returns the existing table instead of
    # raising google.api_core.exceptions.Conflict.
    table = client.create_table(table, exists_ok=True, timeout=30)  # Make an API request.
    print("Created table {}.{}.{}".format(table.project, table.dataset_id, table.table_id))

Created table snappy-boulder-378707.TrackClearInfo.TrackClearInfo2017
Created table snappy-boulder-378707.TrackClearInfo.TrackClearInfo2018
Created table snappy-boulder-378707.TrackClearInfo.TrackClearInfo2019
Created table snappy-boulder-378707.TrackClearInfo.TrackClearInfo2020
Created table snappy-boulder-378707.TrackClearInfo.TrackClearInfo2021


In [11]:
# Load configuration shared by every yearly CSV upload.
job_config = bigquery.LoadJobConfig(
    schema=schema,
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,  # skip the first row of each file (presumably the CSV header)
    # NOTE(review): autodetect is requested together with an explicit schema;
    # confirm whether it is still needed.
    autodetect=True,
    # Replace the table contents on every run.  Load jobs append by default, so
    # without this a second run of the cell would duplicate every row.
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)

for year in years:
    file_path = '../data/tracks clear info {}.csv'.format(year)
    table_id = "snappy-boulder-378707.TrackClearInfo.TrackClearInfo{}".format(year)

    # Upload the local CSV file into the matching yearly table.
    with open(file_path, "rb") as source_file:
        job = client.load_table_from_file(source_file, table_id, job_config=job_config)

    job.result()  # Waits for the job to complete.

    # Re-fetch the table so the reported row/column counts reflect the load.
    table = client.get_table(table_id)  # Make an API request.
    print(
        "Loaded {} rows and {} columns to {}".format(
            table.num_rows, len(table.schema), table_id
        )
    )

Loaded 40111 rows and 6 columns to snappy-boulder-378707.TrackClearInfo.TrackClearInfo2017
Loaded 36596 rows and 6 columns to snappy-boulder-378707.TrackClearInfo.TrackClearInfo2018
Loaded 27658 rows and 6 columns to snappy-boulder-378707.TrackClearInfo.TrackClearInfo2019
Loaded 32418 rows and 6 columns to snappy-boulder-378707.TrackClearInfo.TrackClearInfo2020
Loaded 28024 rows and 6 columns to snappy-boulder-378707.TrackClearInfo.TrackClearInfo2021


### Combine the yearly tables into TrackClearInfoTrain

In [13]:
# Column layout of the combined table: the yearly layout without "unnamed".
# NOTE: this rebinds `schema`, which the CSV load cell also reads; re-run the
# yearly-schema cell before re-running the load cell.
schema = [
    bigquery.SchemaField("id", "STRING"),
    bigquery.SchemaField("name", "STRING"),
    bigquery.SchemaField("popularity", "INTEGER"),
    bigquery.SchemaField("explicit", "INTEGER"),
    bigquery.SchemaField("available_markets", "INTEGER"),
]

table_id = "snappy-boulder-378707.TrackClearInfo.TrackClearInfoTrain"
table = bigquery.Table(table_id, schema=schema)
# exists_ok=True: re-running the cell returns the existing table instead of
# raising google.api_core.exceptions.Conflict.
table = client.create_table(table, exists_ok=True, timeout=30)  # Make an API request.
print("Created table {}.{}.{}".format(table.project, table.dataset_id, table.table_id))

Created table snappy-boulder-378707.TrackClearInfo.TrackClearInfoTrain


In [3]:
import pandas as pd
import pandas_gbq
%load_ext google.cloud.bigquery

# Session-wide pandas_gbq defaults: the project id and the SQL dialect.
# NOTE(review): the %%bigquery cells below come from the google.cloud.bigquery
# extension, not from pandas_gbq -- confirm these two settings are actually used.
pandas_gbq.context.project = 'snappy-boulder-378707'
pandas_gbq.context.dialect = 'standard'

In [14]:
%%bigquery
-- Append the rows of every yearly table to TrackClearInfoTrain, leaving out
-- the `unnamed` column that only the yearly tables have.
-- The explicit column list keeps the INSERT correct even if the column order
-- of the target table ever changes.
-- NOTE: UNION ALL keeps duplicates, and re-running this cell appends the same
-- rows again.
INSERT INTO `snappy-boulder-378707.TrackClearInfo.TrackClearInfoTrain`
    (id, name, popularity, explicit, available_markets)
SELECT id, name, popularity, explicit, available_markets
FROM `snappy-boulder-378707.TrackClearInfo.TrackClearInfo2017`
UNION ALL
SELECT id, name, popularity, explicit, available_markets
FROM `snappy-boulder-378707.TrackClearInfo.TrackClearInfo2018`
UNION ALL
SELECT id, name, popularity, explicit, available_markets
FROM `snappy-boulder-378707.TrackClearInfo.TrackClearInfo2019`
UNION ALL
SELECT id, name, popularity, explicit, available_markets
FROM `snappy-boulder-378707.TrackClearInfo.TrackClearInfo2020`
UNION ALL
SELECT id, name, popularity, explicit, available_markets
FROM `snappy-boulder-378707.TrackClearInfo.TrackClearInfo2021`

Query is running:   0%|          |

In [16]:
%%bigquery
-- Drop the 2017 per-year table once its rows have been copied into
-- TrackClearInfoTrain.  IF EXISTS keeps a re-run of this cell from failing.
DROP TABLE IF EXISTS `snappy-boulder-378707.TrackClearInfo.TrackClearInfo2017`

Query is running:   0%|          |

In [17]:
%%bigquery
-- Drop the 2018 per-year table once its rows have been copied into
-- TrackClearInfoTrain.  IF EXISTS keeps a re-run of this cell from failing.
DROP TABLE IF EXISTS `snappy-boulder-378707.TrackClearInfo.TrackClearInfo2018`

Query is running:   0%|          |

In [18]:
%%bigquery
-- Drop the 2019 per-year table once its rows have been copied into
-- TrackClearInfoTrain.  IF EXISTS keeps a re-run of this cell from failing.
DROP TABLE IF EXISTS `snappy-boulder-378707.TrackClearInfo.TrackClearInfo2019`

Query is running:   0%|          |

In [19]:
%%bigquery
-- Drop the 2020 per-year table once its rows have been copied into
-- TrackClearInfoTrain.  IF EXISTS keeps a re-run of this cell from failing.
DROP TABLE IF EXISTS `snappy-boulder-378707.TrackClearInfo.TrackClearInfo2020`

Query is running:   0%|          |

In [20]:
%%bigquery
-- Drop the 2021 per-year table once its rows have been copied into
-- TrackClearInfoTrain.  IF EXISTS keeps a re-run of this cell from failing.
DROP TABLE IF EXISTS `snappy-boulder-378707.TrackClearInfo.TrackClearInfo2021`

Query is running:   0%|          |

In [23]:
%%bigquery
-- Declare `id` as the primary key of the combined table.
-- NOT ENFORCED: the constraint is declared without being checked.
-- NOTE(review): the rows were combined with UNION ALL, so confirm that `id`
-- really is unique across the yearly tables.
ALTER TABLE `snappy-boulder-378707.TrackClearInfo.TrackClearInfoTrain`
    ADD PRIMARY KEY (id) NOT ENFORCED

Query is running:   0%|          |