In [None]:
!pip install google-cloud-bigquery

In [1]:
from google.cloud import bigquery

In [2]:
# Client object through which every BigQuery request in this notebook is made.
# No arguments are passed, so the library's default project/credential lookup applies.
client = bigquery.Client()

In [3]:
# Project that owns the `retail` dataset; interpolated into the table names below.
project_id = 'dataanalytics-347914'

In [5]:
# Sample query: fetch ten rows from the orders table.
# The SQL text is assembled piecewise so its whitespace is explicit.
QUERY = (
    "SELECT * \n"
    f"    FROM `{project_id}.retail.orders` \n"
    "    LIMIT 10"
)
query_job = client.query(QUERY)  # submit the query (API request)
rows = query_job.result()  # block until the query has finished

In [6]:
# Inspect the type of the object returned by query_job.result().
type(rows)

google.cloud.bigquery.table.RowIterator

In [7]:

# Walk the result iterator and print the order_id field of each row.
for row in rows:
    print(row.order_id)

In [8]:
# Fully qualified table name (project.dataset.table) used as the load destination below.
table_id = f"{project_id}.retail.orders"

In [9]:
# Show the built-in documentation of LoadJobConfig, the class used to configure the load job below.
help(bigquery.LoadJobConfig)

Help on class LoadJobConfig in module google.cloud.bigquery.job.load:

class LoadJobConfig(google.cloud.bigquery.job.base._JobConfig)
 |  LoadJobConfig(**kwargs) -> None
 |  
 |  Configuration options for load jobs.
 |  
 |  Set properties on the constructed configuration by using the property name
 |  as the name of a keyword argument. Values which are unset or :data:`None`
 |  use the BigQuery REST API default values. See the `BigQuery REST API
 |  reference documentation
 |  <https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad>`_
 |  for a list of default values.
 |  
 |  Required options differ based on the
 |  :attr:`~google.cloud.bigquery.job.LoadJobConfig.source_format` value.
 |  For example, the BigQuery API's default value for
 |  :attr:`~google.cloud.bigquery.job.LoadJobConfig.source_format` is ``"CSV"``.
 |  When loading a CSV file, either
 |  :attr:`~google.cloud.bigquery.job.LoadJobConfig.schema` must be set or
 |  :attr:`~google.cloud.bigque

LoadJobConfig lets us define the schema (column names and data types) of the data that is going to be loaded into the BigQuery table.

In [11]:
# Configuration for loading the orders file from Cloud Storage.
job_config = bigquery.LoadJobConfig(
    # Explicit schema for the destination table's four columns.
    schema=[
        bigquery.SchemaField("order_id", "INTEGER"),
        bigquery.SchemaField("order_date", "TIMESTAMP"),
        bigquery.SchemaField("order_customer_id", "INTEGER"),
        bigquery.SchemaField("order_status", "STRING"),
    ],
    # CSV is already the API default; spelled out so the expected format is visible.
    source_format=bigquery.SourceFormat.CSV,
    # Replace the table contents on each load. With the default (append),
    # re-running the notebook would duplicate every row and inflate the
    # counts computed further down.
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)


In [12]:
# List the objects stored under the orders prefix of the bucket.
!gsutil ls gs://airetail_mld/retail_db/orders

gs://airetail_mld/retail_db/orders/part-00000


In [15]:
# Show the signature and documentation of the method used to load data from Cloud Storage.
help(client.load_table_from_uri)

Help on method load_table_from_uri in module google.cloud.bigquery.client:

load_table_from_uri(source_uris: Union[str, Sequence[str]], destination: Union[google.cloud.bigquery.table.Table, google.cloud.bigquery.table.TableReference, google.cloud.bigquery.table.TableListItem, str], job_id: str = None, job_id_prefix: str = None, location: str = None, project: str = None, job_config: google.cloud.bigquery.job.load.LoadJobConfig = None, retry: google.api_core.retry.Retry = <google.api_core.retry.Retry object at 0x7fb048c771f0>, timeout: Optional[float] = None) -> google.cloud.bigquery.job.load.LoadJob method of google.cloud.bigquery.client.Client instance
    Starts a job for loading data into a table from Cloud Storage.
    
    See
    https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload
    
    Args:
        source_uris (Union[str, Sequence[str]]):
            URIs of data files to be loaded; in format
            ``gs://<bucket_name>/<object_name_or_glob>

load_table_from_uri loads data from GCS into BigQuery. It requires 3 inputs:

1) uri: the GCS location (gs://...) of the file to load
2) table_id: the name of the destination table
3) job_config: the LoadJobConfig instance holding the schema definition of the data to upload

In [16]:
# Cloud Storage object that holds the orders data.
uri = "gs://airetail_mld/retail_db/orders/part-00000"

# Start the load job (API request); the schema comes from job_config.
load_job = client.load_table_from_uri(
    source_uris=uri,
    destination=table_id,
    job_config=job_config,
)

# Block until the job completes. As the cell's last expression, the returned
# job object is also displayed.
load_job.result()

LoadJob<project=dataanalytics-347914, location=US, id=d0b763a7-cc78-4f07-ae6e-97d2b800243e>

In [17]:

# Fetch the table's metadata (not its rows) and report its row count.
table = client.get_table(table_id)
print(f"Loaded {table.num_rows} rows to table {table_id}")

Loaded 68883 rows to table dataanalytics-347914.retail.orders


In [36]:
# Run the ten-row sample query again and print each row's order_id.
QUERY = f'''SELECT * FROM `{project_id}.retail.orders` LIMIT 10'''
query_job = client.query(QUERY)  # submit the query (API request)
rows = query_job.result()  # block until the query has finished

for row in rows:
    print(row.order_id)

42834
42851
42864
42888
42890
42896
42898
42899
42906
42913


The query() method returns a query job; calling result() on that job returns an iterator (RowIterator) that yields table.Row objects, which have many attributes.

In [31]:
# Same sample query; this time show the type of each element the iterator yields.
QUERY = f'''SELECT * FROM `{project_id}.retail.orders` LIMIT 10'''
query_job = client.query(QUERY)  # submit the query (API request)
rows = query_job.result()  # block until the query has finished

for row in rows:
    print(type(row))

<class 'google.cloud.bigquery.table.Row'>
<class 'google.cloud.bigquery.table.Row'>
<class 'google.cloud.bigquery.table.Row'>
<class 'google.cloud.bigquery.table.Row'>
<class 'google.cloud.bigquery.table.Row'>
<class 'google.cloud.bigquery.table.Row'>
<class 'google.cloud.bigquery.table.Row'>
<class 'google.cloud.bigquery.table.Row'>
<class 'google.cloud.bigquery.table.Row'>
<class 'google.cloud.bigquery.table.Row'>


To get the data of a row as a tuple, we can use its values() method.

In [49]:
# Re-issue the sample query so a fresh `rows` iterator is available to the next cell.
QUERY = f'''SELECT * FROM `{project_id}.retail.orders` LIMIT 10'''
query_job = client.query(QUERY)  # submit the query (API request)
rows = query_job.result()  # block until the query has finished

In [50]:
# Drain the iterator, keeping the column values of each row.
data = [row.values() for row in rows]
data

[(42834,
  datetime.datetime(2014, 4, 16, 0, 0, tzinfo=datetime.timezone.utc),
  4321,
  'CLOSED'),
 (42851,
  datetime.datetime(2014, 4, 16, 0, 0, tzinfo=datetime.timezone.utc),
  5972,
  'CLOSED'),
 (42864,
  datetime.datetime(2014, 4, 16, 0, 0, tzinfo=datetime.timezone.utc),
  6024,
  'CLOSED'),
 (42888,
  datetime.datetime(2014, 4, 16, 0, 0, tzinfo=datetime.timezone.utc),
  3712,
  'CLOSED'),
 (42890,
  datetime.datetime(2014, 4, 16, 0, 0, tzinfo=datetime.timezone.utc),
  1961,
  'CLOSED'),
 (42896,
  datetime.datetime(2014, 4, 16, 0, 0, tzinfo=datetime.timezone.utc),
  7705,
  'CLOSED'),
 (42898,
  datetime.datetime(2014, 4, 16, 0, 0, tzinfo=datetime.timezone.utc),
  7770,
  'CLOSED'),
 (42899,
  datetime.datetime(2014, 4, 16, 0, 0, tzinfo=datetime.timezone.utc),
  3119,
  'CLOSED'),
 (42906,
  datetime.datetime(2014, 4, 16, 0, 0, tzinfo=datetime.timezone.utc),
  6215,
  'CLOSED'),
 (42913,
  datetime.datetime(2014, 4, 16, 0, 0, tzinfo=datetime.timezone.utc),
  10692,
  'CLOSED')]

In [38]:
# Count orders per status, most frequent status first.
QUERY = f'''
    SELECT order_status, count(*) AS order_count
    FROM `{project_id}.retail.orders`
    GROUP BY 1
    ORDER BY 2 DESC
'''
query_job = client.query(QUERY)  # submit the query (API request)
rows = query_job.result()  # block until the query has finished

# Each row prints as an (order_status, order_count) pair.
for row in rows:
    print(row.values())

('COMPLETE', 22899)
('PENDING_PAYMENT', 15030)
('PROCESSING', 8275)
('PENDING', 7610)
('CLOSED', 7556)
('ON_HOLD', 3798)
('SUSPECTED_FRAUD', 1558)
('CANCELED', 1428)
('PAYMENT_REVIEW', 729)
