# dataframe.to_gbq()
### NOTE: Extremely slow when dealing with big data

In [None]:
import pandas as pd

In [None]:
# Google Cloud project ID passed to to_gbq() / read_gbq() below.
# 'PROJECT_ID' is a placeholder value.
project_id = 'PROJECT_ID'

In [None]:
# Sample data: a tab-separated file with a `time` column formatted as
# %Y-%m-%dT%H:%M:%SZ.
df = pd.read_csv('resources/data/influxdb_sample.csv', delimiter='\t')

# Parse the timestamps after loading instead of through read_csv's
# `date_parser` argument, which is deprecated since pandas 2.0.
# An explicit format keeps the parsing strict and fast.
df['time'] = pd.to_datetime(df['time'], format='%Y-%m-%dT%H:%M:%SZ')
df.head()

In [None]:
# Print a summary of the loaded frame to check the parsed column dtypes.
df.info()

In [None]:
# Insert data: upload the DataFrame to "<dataset>.<table>", replacing the
# table if it already exists.
# NOTE(review): recent pandas releases deprecate DataFrame.to_gbq in favour
# of the pandas-gbq package - confirm against the installed version.
db = 'testdb'
table = 'test_tbl'
table_id = '.'.join((db, table))

df.to_gbq(
    destination_table=table_id,
    project_id=project_id,
    if_exists='replace',
)

In [None]:
# Select data: read the whole table back into a DataFrame indexed by `time`.
query_template = '''
SELECT *
FROM `{table}`
'''
sql = query_template.format(table=table_id)

res_df = pd.read_gbq(sql, index_col='time', project_id=project_id)

In [None]:
# Preview the first rows of the query result.
res_df.head()

In [None]:
# Preview the last rows of the query result.
res_df.tail()

# client.load_table_from_file()
### NOTE: Faster when dealing with big data

In [1]:
from google.cloud import bigquery as bq

import pandas as pd

In [2]:
# Build a BigQuery client authenticated with a service-account JSON key.
# How to create a service account key:
# https://cloud.google.com/docs/authentication/production#auth-cloud-implicit-python

# Path to the downloaded key file ("xxx" is a placeholder in the file name).
path = 'resources/credential/Google_BigQuery_credential-xxx.json'
client = bq.Client.from_service_account_json(path)

In [None]:
# Google Cloud project ID. Kept as a placeholder rather than a concrete
# project so the notebook can be shared; replace with your own project ID.
project_id = 'PROJECT_ID'

In [None]:
# Target dataset and table; `table_id` has the form "<dataset>.<table>".
db, table = 'testdb', 'test_tbl'
table_id = '.'.join((db, table))

# Start from a clean slate: drop the table, ignoring the case where it
# does not exist.
client.delete_table(table=table_id, not_found_ok=True)

In [None]:
# Reference to the target dataset in the client's default project.
# Constructed directly because `Client.dataset()` is deprecated
# (google-cloud-bigquery >= 1.24); the reference still offers `.table()`.
dataset_ref = bq.DatasetReference(client.project, db)

# Load-job settings: explicit schema plus the CSV dialect of the input file.
job_config = bq.LoadJobConfig()
job_config.schema = [
    bq.SchemaField("time", "TIMESTAMP"),  # NOTE: only canonical format is accepted
    bq.SchemaField("butterflies", "INTEGER"),
    bq.SchemaField("honeybees", "INTEGER"),
    bq.SchemaField("location", "INTEGER"),
    bq.SchemaField("scientist", "STRING"),
]

job_config.field_delimiter = '\t'  # input file is tab-separated
job_config.skip_leading_rows = 1  # 0 if no header
job_config.source_format = bq.SourceFormat.CSV

In [None]:
# Upload the local CSV file into the target table with a load job and
# report how many rows the table holds afterwards.
file = 'resources/data/influxdb_sample.csv'
table_ref = dataset_ref.table(table)

print(f'Reading: {file}')
with open(file, mode='rb') as csv_handle:
    # Insert data
    load_job = client.load_table_from_file(
        csv_handle,
        table_ref,
        job_config=job_config,
    )
    print(f'Job ID: {load_job.job_id}')

    # Block until the load job has completed.
    load_job.result()
    print('Job Finished')

    destination_table = client.get_table(table_ref)
    print(f'Loaded {destination_table.num_rows} rows')

In [None]:
# Select data: fetch every row of the freshly loaded table.
sql = '\n'.join(('', 'SELECT *', f'FROM `{table_id}`', ''))

# Three ways to run the query, ordered by speed.
#
# Fastest, but result is not in dataframe:
# query_job = client.query(sql)
# result = query_job.result()
#
# Slowest:
# res_df = pd.read_gbq(sql, project_id=project_id, index_col='time', use_bqstorage_api=True)

# Medium (the one used here): run the query job, then convert to a DataFrame.
query_job = client.query(sql)
res_df = query_job.to_dataframe()

In [None]:
# Preview the first rows of the query result.
res_df.head()

In [None]:
# Preview the last rows of the query result.
res_df.tail()

# TEST: Import Tick Data

In [None]:
from google.cloud import bigquery as bq

from pathlib import Path
import pandas as pd

import time

In [None]:
# Build a BigQuery client authenticated with a service-account JSON key.
# How to create a service account key:
# https://cloud.google.com/docs/authentication/production#auth-cloud-implicit-python

# Path to the downloaded key file ("xxx" is a placeholder in the file name).
path = 'resources/credential/Google_BigQuery_credential-xxx.json'
client = bq.Client.from_service_account_json(path)

In [None]:
# Identifiers for the tick-data import. 'PROJECT_ID' is a placeholder.
project_id = 'PROJECT_ID'

db = 'tickdb'
table = 'tick_tbl'

# Fully-qualified dataset ID: "<project>.<dataset>".
db_id = '.'.join((project_id, db))
# Table ID relative to the project: "<dataset>.<table>".
table_id = '.'.join((db, table))

In [None]:
# Create db if not exist.
# `exists_ok=True` makes the call a no-op when the dataset is already there,
# so there is no need to list every dataset first. It also checks and creates
# in the same project (the one named in `db_id`), and avoids the race between
# a separate existence check and the creation.
dataset = client.create_dataset(bq.Dataset(db_id), exists_ok=True)

In [None]:
# Drop table if it exists, so the import below starts from an empty table.
# `not_found_ok=True` suppresses the error when the table is absent.
client.delete_table(table_id, not_found_ok=True)

In [None]:
# Reference to the target dataset in the client's default project.
# Constructed directly because `Client.dataset()` is deprecated
# (google-cloud-bigquery >= 1.24); the reference still offers `.table()`.
dataset_ref = bq.DatasetReference(client.project, db)

# Load-job settings: explicit schema for the tick table.
job_config = bq.LoadJobConfig()
job_config.schema = [
    bq.SchemaField("datetime", "TIMESTAMP"),
    bq.SchemaField("bid", "FLOAT"),
    bq.SchemaField("ask", "FLOAT"),
    bq.SchemaField("currency_pair", "STRING"),
]

In [None]:
# Import one tick-data CSV per monthly period into the target table.
periods = [f'2019{month:02}' for month in range(1, 4)]  # '201901' .. '201903'
chunk_size = 10000

# Loop-invariant values, set once up front.
currency_pair = 'AUDUSD'
table_ref = dataset_ref.table(table)

for period in periods:
    file = f'resources/data/DAT_ASCII_{currency_pair}_T_{period}.csv'
    print(f'Reading: {file}')

    # Read the headerless CSV in chunks, keeping the timestamp as text for now.
    df_chunks = pd.read_csv(file, sep=',',
                            header=None, names=['datetime', 'bid', 'ask', 'vol'],
                            usecols=['datetime', 'bid', 'ask'],
                            dtype={'datetime': str},
                            chunksize=chunk_size)

    # Merge df together instead of uploading by batch, as batch insert is much slower
    full_df = pd.concat(df_chunks)

    # Transform timestamp to format expected by GBQ.
    # Done with pd.to_datetime on the merged frame rather than read_csv's
    # `date_parser` argument, which is deprecated since pandas 2.0.
    full_df['datetime'] = pd.to_datetime(full_df['datetime'], format="%Y%m%d %H%M%S%f")
    full_df['currency_pair'] = currency_pair

    # Insert to GBQ
    load_job = client.load_table_from_dataframe(full_df, table_ref, job_config=job_config)
    print(f'Job ID: {load_job.job_id}')

    load_job.result()  # Waits for table load to complete.
    print('Job Finished')

    destination_table = client.get_table(table_ref)
    print(f'Loaded {destination_table.num_rows} rows\n')

In [None]:
# Time a full-table read: run the query and convert the result to a DataFrame.
query_template = '''
SELECT *
FROM `{table}`
'''

EXEC_START = time.time()

# Select data
sql = query_template.format(table=table_id)
query_job = client.query(sql)
res_df = query_job.to_dataframe()

EXEC_END = time.time()

# Wall-clock seconds between the two time.time() readings.
elapsed = EXEC_END - EXEC_START
print(f'\n{elapsed} sec.')

In [None]:
# Preview the first rows of the query result.
res_df.head()

In [None]:
# Preview the last rows of the query result.
res_df.tail()