## GBQ Queries


In [1]:
from google.cloud import bigquery
from google.api_core.exceptions import GoogleAPIError
import os
import polars as pl
import pandas as pd
import random

# BigQuery client shared by the query helpers defined later in this notebook.
# NOTE(review): the project ID is an empty string here — presumably redacted;
# set it to the real project before running.
client = bigquery.Client(project="")
# Local folder whose file names are listed by save_owner_query_to_file() to
# work out which transArchive tables to query.
# NOTE(review): hard-coded absolute Windows path — adjust per machine.
data_directory = "E:\\College\\Fall 2024\\ADA\\Wedge\\Wedge_Project\\data\\unzipped\\"


In [None]:
# Load the combined owner data (tab-delimited) into a pandas DataFrame.
# A raw string is used so the Windows backslashes are taken literally instead
# of relying on invalid escape sequences such as "\C" or "\_".
# NOTE(review): this reads from owner_data_folder, while the last cell of the
# notebook writes _final_owner_data.txt to owner_data_folder_redo — confirm
# which copy is intended.
wedge_task_two = pd.read_csv(r'E:\College\Fall 2024\ADA\Wedge\Wedge_Project\owner_data_folder\_final_owner_data.txt', sep='\t')

Run the cell above to load all transactions for the following card owners:

48289, 48420, 56191, 20300, 48996, 56191, and 16551.

---

In [None]:


# Column names of the transaction tables, in table order
columns = [
    'datetime', 'register_no', 'emp_no', 'trans_no', 'upc',
    'description', 'trans_type', 'trans_subtype', 'trans_status', 'department',
    'quantity', 'Scale', 'cost', 'unitPrice', 'total',
    'regPrice', 'altPrice', 'tax', 'taxexempt', 'foodstamp',
    'wicable', 'discount', 'memDiscount', 'discountable', 'discounttype',
    'voided', 'percentDiscount', 'ItemQtty', 'volDiscType', 'volume',
    'VolSpecial', 'mixMatch', 'matched', 'memType', 'staff',
    'numflag', 'itemstatus', 'tenderstatus', 'charflag', 'varflag',
    'batchHeaderID', 'local', 'organic', 'display', 'receipt',
    'card_no', 'store', 'branch', 'match_id', 'trans_id',
]

# Zero-row Polars DataFrame that carries one (empty) column per name above
empty_columns = {name: [] for name in columns}
gbq_query_df = pl.DataFrame(empty_columns)

print(gbq_query_df)

In [4]:
# Function to run a query
def run_query(query):
    """Run a SQL query on the notebook-level BigQuery ``client`` and report its cost.

    After the job finishes, prints the bytes the job processed, an estimated
    cost at $5.00 per TB, and the same figures scaled by 50 (labelled as a
    full year of data) and by a further 4 * 365 (a run every 6 hours).

    Args:
        query: SQL text to submit.

    Returns:
        The object returned by ``query_job.result()``, or ``None`` if a
        ``GoogleAPIError`` was raised (the error is printed, not re-raised).
    """
    cost_per_tb = 5.0
    bytes_per_mb = 1024 ** 2
    bytes_per_tb = 1024 ** 4

    try:
        job = client.query(query)
        rows = job.result()  # blocks until the job has completed

        bytes_processed = job.total_bytes_processed
        mb_processed = bytes_processed / bytes_per_mb
        estimated_cost = (bytes_processed / bytes_per_tb) * cost_per_tb

        # Figures for this single run
        print(f"Data processed: {mb_processed:.2f} MB")
        print(f"Estimated bytes processed: {bytes_processed}")
        print(f"Estimated cost: ${estimated_cost:.10f}\n\n")

        # Same figures extrapolated to a year of data and to a 6-hourly schedule
        print(f"Estimated bytes processed against a full year of data: {bytes_processed*50}")
        print(f"Estimated cost against a full year of data: ${estimated_cost*50:.20f}")
        print(f"Estimated cost against a full year of data every 6 hours: ${(estimated_cost*50)*(4*365):.20f}\n\n---------------")

        return rows
    except GoogleAPIError as e:
        print(f"Error running query: {e}")
        return None

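**Note:** The next function, `run_query2`, is a modified version of `run_query` above: instead of the raw query result, it returns a pandas DataFrame with every column cast to string.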

In [5]:
# Function to run a query
def run_query2(query):
    """Run a SQL query on the notebook-level BigQuery ``client`` and return a DataFrame.

    Prints the same cost report as ``run_query``: bytes processed, an estimated
    cost at $5.00 per TB, and both figures scaled by 50 (labelled as a full
    year of data) and by a further 4 * 365 (a run every 6 hours).

    Args:
        query: SQL text to submit.

    Returns:
        The query result as a pandas DataFrame with every column cast to
        ``str``, or ``None`` if a ``GoogleAPIError`` was raised (the error is
        printed, not re-raised).
    """
    cost_per_tb = 5.0
    bytes_per_mb = 1024 ** 2
    bytes_per_tb = 1024 ** 4

    try:
        job = client.query(query)
        frame = job.to_dataframe()  # waits for the job and downloads the rows

        bytes_processed = job.total_bytes_processed
        mb_processed = bytes_processed / bytes_per_mb
        estimated_cost = (bytes_processed / bytes_per_tb) * cost_per_tb

        # Figures for this single run
        print(f"Data processed: {mb_processed:.2f} MB")
        print(f"Estimated bytes processed: {bytes_processed}")
        print(f"Estimated cost: ${estimated_cost:.10f}\n\n")

        # Same figures extrapolated to a year of data and to a 6-hourly schedule
        print(f"Estimated bytes processed against a full year of data: {bytes_processed*50}")
        print(f"Estimated cost against a full year of data: ${estimated_cost*50:.20f}")
        print(f"Estimated cost against a full year of data every 6 hours: ${(estimated_cost*50)*(4*365):.20f}\n\n---------------")

        return frame.astype(str)
    except GoogleAPIError as e:
        print(f"Error running query: {e}")
        return None

In [None]:
# NOTE(review): all_card_nums is initialised here but no live code in this
# notebook fills or reads it — possibly a leftover.
all_card_nums = []

# Every distinct card number across all transArchive_* tables, excluding 3.
# NOTE(review): card_no 3 presumably marks non-owner transactions — confirm.
distinct_card_nums = """
SELECT distinct(card_no)
FROM `the_wedge_dataset.transArchive_*`
WHERE card_no != 3;
"""

# Run the query; results is a string-typed pandas DataFrame, or None if the
# query raised a GoogleAPIError (see run_query2).
results = run_query2(distinct_card_nums)



    





In [11]:
# # Select 100 random card numbers from the distinct card numbers returned above
# # (kept commented out so the saved sample is reused rather than redrawn)

# results = list(results['card_no'])
# random_card_nums = random.sample(results, 100)
# print(random_card_nums)

# Write the random_card_nums list to a tab-delimited text file
# (each number is followed by a tab, so the file ends with a trailing tab)
# with open('random_card_nums.txt', 'w') as f:
#     for num in random_card_nums:
#         f.write(f"{num}\t")

# Read the random_card_nums file into a list of strings.
# NOTE: if the file ends with a tab (as the writer above would produce), the
# last element of the list is an empty string.
with open('random_card_nums.txt', 'r') as f:
    random_card_nums = f.read().split('\t')

In [12]:
# Re-read the saved sample as one raw string (tab-separated card numbers).
with open('random_card_nums.txt', 'r') as f:
    random_card_nums_string = f.read()


# Turn the tab separators into ", " so the string can be dropped into a SQL
# IN (...) list.
# NOTE: if the file ends with a tab, the result ends with a dangling ", ";
# the "Manual Queries" cell further down strips it with [:-2].
random_card_nums_string = random_card_nums_string.replace("\t", ", ")

## GBQ Owner Queries

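This section defines `save_owner_query_to_file()`, which retrieves data from a Google BigQuery dataset and saves it to a tab-delimited text file.

The function takes two arguments: `yearOfQuery` (the year used to select the `transArchive_<year>` tables) and `tableByTable`
(a boolean flag that controls whether the tables are queried individually or as a group). As the code is written, `tableByTable=False` queries each matching table on its own, while `tableByTable=True` queries all of the year's tables at once through a wildcard.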


In [None]:


def save_owner_query_to_file(yearOfQuery, tableByTable=False):
    """Query the sampled owners' transactions for one year and save them as tab-delimited text.

    Table names are discovered from the file names in ``data_directory`` that
    contain ``transArchive_<yearOfQuery>``. If no file matches, nothing is
    queried or written. Each query selects every column plus
    ``SAFE_CAST(Scale AS INT64) AS IntScale`` for the card numbers held in the
    notebook-level ``random_card_nums_string``.

    Args:
        yearOfQuery: Year used to select the ``transArchive_<year>`` tables.
        tableByTable: Selects the query strategy. The meaning of the flag is
            kept exactly as in the original code:
            * ``False`` (default) — one query per matching table, each written
              to ``owner_data_<table>_version2.txt``.
            * ``True`` — a single wildcard query over
              ``transArchive_<year>*``, written to ``owner_data_<year>.txt``.

    Returns:
        None. Output files are written to the owner_data_folder directory.
    """
    output_dir = r'E:\College\Fall 2024\ADA\Wedge\Wedge_Project\owner_data_folder'

    # The card list is built by replacing tabs with ", ", which can leave a
    # dangling separator; strip it so the IN (...) list is well formed.
    card_list = random_card_nums_string.rstrip(", \t\n")

    matching_files = [
        file for file in os.listdir(data_directory)
        if f"transArchive_{yearOfQuery}" in file
    ]
    if not matching_files:
        return

    if tableByTable:
        # The wildcard query and its output file are the same for every
        # matching file, so run it once instead of once per file (each run is
        # billed by BigQuery).
        targets = [(f"transArchive_{yearOfQuery}*", f"owner_data_{yearOfQuery}.txt")]
    else:
        targets = []
        for file in matching_files:
            table_name = file.split('.')[0]  # file name without its extension
            targets.append((table_name, f"owner_data_{table_name}_version2.txt"))

    for table_ref, output_name in targets:
        owner_gbq_query = f"""
        SELECT
          *,
        SAFE_CAST(Scale AS INT64) AS IntScale
        FROM
          `the_wedge_dataset.{table_ref}`
        WHERE card_no in ({card_list})
        """

        temp_df = run_query2(owner_gbq_query)
        if temp_df is None:
            # run_query2 already printed the API error; do not write an empty file.
            print(f"No data returned for {table_ref}; skipping {output_name}")
            continue

        temp_df.to_csv(os.path.join(output_dir, output_name), sep='\t', index=False)


# if results is None or len(results) == 0:
#     print("No results found or query returned None.")
# else:
#     # Process the rows if results exist
#     rows = [dict(row) for row in results]
#     columns = list(rows[0].keys()) if rows else []

#     # Create Polars DataFrame from the rows
#     gbq_query_df = pl.DataFrame(rows)


In [None]:
# Pull the owner transactions one year at a time for 2010 through 2017.
# A failure in one year is reported and the loop moves on to the next year.
for x in range(2010, 2018):
    try:
        save_owner_query_to_file(x, tableByTable=False)
    except Exception as e:
        print(f"Error: {e} with year {x}")


## Manual Queries for 2015 and 2016

In [None]:
# One query over every transArchive_* table for the sampled card numbers,
# saved as a single tab-delimited file.
owner_query_df = pd.DataFrame()

# Remove the dangling ", " left after the last card number. rstrip is used
# instead of [:-2] so that re-running this cell cannot chop digits off the
# last card number.
random_card_nums_string = random_card_nums_string.rstrip(", \t\n")
fileNameForQuery = "transArchive_"

owner_gbq_query = f"""
        SELECT
          *
        FROM
          `the_wedge_dataset.transArchive_*`
        WHERE card_no in ({random_card_nums_string});
        """

temp_df = run_query2(owner_gbq_query)

if temp_df is None:
    # run_query2 already printed the API error; avoid writing an empty file.
    print("Query returned no data; owner data file was not written.")
else:
    owner_query_df = pd.concat([owner_query_df, temp_df], ignore_index=True)
    # Raw f-string: backslashes are literal, only {fileNameForQuery} is substituted.
    owner_query_df.to_csv(rf'E:\College\Fall 2024\ADA\Wedge\Wedge_Project\owner_data_folder_redo\owner_data_{fileNameForQuery}.txt', sep='\t', index=False)


In [14]:
#remove comma from last card number


## Combining the Data into a Single DataFrame and File

In [None]:
columns = ['datetime', 'register_no', 'emp_no', 'trans_no', 'upc', 'description', 'trans_type', 'trans_subtype', 'trans_status', 'department', 'quantity', 'Scale', 'cost', 'unitPrice', 'total', 'regPrice', 'altPrice', 'tax', 'taxexempt', 'foodstamp', 'wicable', 'discount', 'memDiscount', 'discountable', 'discounttype', 'voided', 'percentDiscount', 'ItemQtty', 'volDiscType', 'volume', 'VolSpecial', 'mixMatch', 'matched', 'memType', 'staff', 'numflag', 'itemstatus', 'tenderstatus', 'charflag', 'varflag', 'batchHeaderID', 'local', 'organic', 'display', 'receipt', 'card_no', 'store', 'branch', 'match_id', 'trans_id', 'IntScale']

# Folder holding the per-query owner extracts. The same absolute path is used
# for listing and for reading, so the cell does not depend on the working
# directory.
owner_data_dir = r'E:\College\Fall 2024\ADA\Wedge\Wedge_Project\owner_data_folder_redo'
# The combined file is written into this same folder by the last cell of the
# notebook; it must be skipped here or a re-run would read it back in and
# duplicate every row.
final_file_name = '_final_owner_data.txt'

# Empty pandas DataFrame with the expected columns; it fixes the column order
# and guarantees the columns exist even if no file is read.
final_owner_df = pd.DataFrame({col: [] for col in columns})

# Collect the frames and concatenate once instead of growing the frame inside
# the loop. Files are read in sorted order so the row order is reproducible.
owner_frames = [final_owner_df]
for file in sorted(os.listdir(owner_data_dir)):
    if file == final_file_name:
        continue
    try:
        temp_df = pd.read_csv(os.path.join(owner_data_dir, file), sep='\t')
    except Exception as e:
        # Report the unreadable file and carry on with the rest.
        print(f"Error: {e} with file {file}")
        continue
    owner_frames.append(temp_df)

final_owner_df = pd.concat(owner_frames, axis=0)




In [65]:
# Remove the IntScale helper column before saving. The frame is reassigned
# rather than modified in place, and errors='ignore' lets the cell be re-run
# without a KeyError once the column is already gone.
final_owner_df = final_owner_df.drop(columns='IntScale', errors='ignore')

In [None]:
# Save the combined owner data as a tab-delimited file. A raw string keeps the
# Windows backslashes literal instead of relying on invalid escape sequences.
final_owner_df.to_csv(r'E:\College\Fall 2024\ADA\Wedge\Wedge_Project\owner_data_folder_redo\_final_owner_data.txt', sep='\t', index=False)