In [1]:
from google.cloud import bigquery
from google.api_core.exceptions import GoogleAPIError
import os
import polars as pl
import random

client = bigquery.Client(project="wedge-project-jbangtson")
data_directory = "E:\\College\\Fall 2024\\ADA\\Wedge\\Wedge_Project\\data\\unzipped\\"


In [2]:


# List of columns
columns = ['datetime', 'register_no', 'emp_no', 'trans_no', 'upc', 'description', 'trans_type', 'trans_subtype', 'trans_status', 'department', 'quantity', 'Scale', 'cost', 'unitPrice', 'total', 'regPrice', 'altPrice', 'tax', 'taxexempt', 'foodstamp', 'wicable', 'discount', 'memDiscount', 'discountable', 'discounttype', 'voided', 'percentDiscount', 'ItemQtty', 'volDiscType', 'volume', 'VolSpecial', 'mixMatch', 'matched', 'memType', 'staff', 'numflag', 'itemstatus', 'tenderstatus', 'charflag', 'varflag', 'batchHeaderID', 'local', 'organic', 'display', 'receipt', 'card_no', 'store', 'branch', 'match_id', 'trans_id']

# Create an empty Polars DataFrame with the specified columns
gbq_query_df = pl.DataFrame({col: [] for col in columns})

print(gbq_query_df)

shape: (0, 50)
┌──────────┬─────────────┬────────┬──────────┬───┬───────┬────────┬──────────┬──────────┐
│ datetime ┆ register_no ┆ emp_no ┆ trans_no ┆ … ┆ store ┆ branch ┆ match_id ┆ trans_id │
│ ---      ┆ ---         ┆ ---    ┆ ---      ┆   ┆ ---   ┆ ---    ┆ ---      ┆ ---      │
│ null     ┆ null        ┆ null   ┆ null     ┆   ┆ null  ┆ null   ┆ null     ┆ null     │
╞══════════╪═════════════╪════════╪══════════╪═══╪═══════╪════════╪══════════╪══════════╡
└──────────┴─────────────┴────────┴──────────┴───┴───────┴────────┴──────────┴──────────┘


In [3]:
# Function to run a query
def run_query(query):
    try:
        query_job = client.query(query)
        results = query_job.result()

        bytes_processed = query_job.total_bytes_processed
        mb_processed = bytes_processed / (1024 ** 2)
        cost_per_tb = 5.0


        tb_processed = bytes_processed / (1024 ** 4)  # Convert bytes to terabytes
        estimated_cost = tb_processed * cost_per_tb

        # Display the processed data and estimated cost
        print(f"Data processed: {mb_processed:.2f} MB")
        

        print(f"Estimated bytes processed: {bytes_processed}")
        print(f"Estimated cost: ${estimated_cost:.10f}\n\n")

        print(f"Estimated bytes processed against a full year of data: {bytes_processed*50}")
        print(f"Estimated cost against a full year of data: ${estimated_cost*50:.20f}")
        print(f"Estimated cost against a full year of data every 6 hours: ${(estimated_cost*50)*(4*365):.20f}\n\n---------------")

        

        return results
    except GoogleAPIError as e:
        print(f"Error running query: {e}")
        return None

In [4]:
all_card_nums = []

distinct_card_nums = """
SELECT
  DISTINCT(card_no) as card_no
FROM
  `the_wedge_dataset.transArchive_*`
WHERE card_no != 3
"""

# Run the query and display results
results = run_query(distinct_card_nums)
if results:
    for row in results:
        print(f"{row.card_no}")
        all_card_nums.append(row.card_no)


    





Data processed: 634.10 MB
Estimated bytes processed: 664901896
Estimated cost: $0.0030236238


Estimated bytes processed against a full year of data: 33245094800
Estimated cost against a full year of data: $0.15118118790269363672
Estimated cost against a full year of data every 6 hours: $220.72453433793270960450

---------------
48461.0
48326.0
48325.0
48382.0
48317.0
48392.0
48334.0
48316.0
48365.0
48388.0
48318.0
48306.0
48314.0
48301.0
48331.0
48403.0
48459.0
48398.0
48328.0
48361.0
48390.0
48346.0
48381.0
48300.0
48432.0
48291.0
48394.0
48440.0
48451.0
48303.0
48376.0
48339.0
48310.0
48411.0
48299.0
48419.0
48446.0
48396.0
46303.0
43162.0
40766.0
47421.0
36591.0
40878.0
44285.0
42957.0
42257.0
44357.0
37657.0
44786.0
41110.0
44246.0
47843.0
42559.0
47302.0
47857.0
38836.0
43146.0
40558.0
47907.0
41305.0
42798.0
42977.0
43259.0
47850.0
47347.0
39044.0
43262.0
40892.0
36461.0
44428.0
47473.0
44512.0
47293.0
44665.0
45027.0
37523.0
47267.0
44352.0
44789.0
36224.0
44921.0
41171.0
36189

In [7]:


# Select 20 random numbers from the all_card_nums list
random_card_nums = random.sample(all_card_nums, 20)
print(random_card_nums)

# Write the random_card_nums list to a tab-delimited text file
with open('random_card_nums.txt', 'w') as f:
    for num in random_card_nums:
        f.write(f"{num}\t")

[16601.0, 44838.0, 11383.0, 20295.0, 59479.0, 50348.0, 31183.0, 48710.0, 52359.0, 10836.0, 15114.0, 38452.0, 50773.0, 48787.0, 34301.0, 16438.0, 49802.0, 12426.0, 23391.0, 35180.0]


In [8]:
#201609 is baddd


query_df_list = []




for idx, file in enumerate(os.listdir(data_directory)):
  

  owner_gbq_query = f"""
  SELECT
    *
  FROM
    `the_wedge_dataset.{file.split('.')[0]}`
  WHERE card_no = 48289.0 OR card_no = 48420.0 OR card_no = 56191.0 OR card_no = 20300.0 OR card_no = 48996.0 OR card_no = 56191.0 OR card_no = 16551.0 
  """

  results = run_query(owner_gbq_query)


  
  # Process the rows if results exist
  rows = [dict(row) for row in results]
  columns = list(rows[0].keys()) if rows else []

  # Create Polars DataFrame from the rows
  gbq_query_df_inloop = pl.DataFrame(rows







# if results is None or len(results) == 0:
#     print("No results found or query returned None.")
# else:
#     # Process the rows if results exist
#     rows = [dict(row) for row in results]
#     columns = list(rows[0].keys()) if rows else []

#     # Create Polars DataFrame from the rows
#     gbq_query_df = pl.DataFrame(rows)


Data processed: 0.00 MB
Estimated bytes processed: 0
Estimated cost: $0.0000000000


Estimated bytes processed against a full year of data: 0
Estimated cost against a full year of data: $0.00000000000000000000
Estimated cost against a full year of data every 6 hours: $0.00000000000000000000

---------------
Data processed: 1100.05 MB
Estimated bytes processed: 1153490331
Estimated cost: $0.0052454667


Estimated bytes processed against a full year of data: 57674516550
Estimated cost against a full year of data: $0.26227333614770032000
Estimated cost against a full year of data every 6 hours: $382.91907077564246719703

---------------


ComputeError: could not append value: 0.0 of type: f64 to the builder; make sure that all rows have the same schema or consider increasing `infer_schema_length`

it might also be that a value overflows the data-type's capacity

In [12]:
gbq_query_df.head(10)

datetime,register_no,emp_no,trans_no,upc,description,trans_type,trans_subtype,trans_status,department,quantity,Scale,cost,unitPrice,total,regPrice,altPrice,tax,taxexempt,foodstamp,wicable,discount,memDiscount,discountable,discounttype,voided,percentDiscount,ItemQtty,volDiscType,volume,VolSpecial,mixMatch,matched,memType,staff,numflag,itemstatus,tenderstatus,charflag,varflag,batchHeaderID,local,organic,display,receipt,card_no,store,branch,match_id,trans_id
null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null


In [10]:
# Write the Polars DataFrame to a tab-delimited text file
gbq_query_df.write_csv('E:\College\Fall 2024\ADA\Wedge\Wedge_Project\data\output_data\owner_sample.txt', separator='\t')

  gbq_query_df.write_csv('E:\College\Fall 2024\ADA\Wedge\Wedge_Project\data\output_data\owner_sample.txt', separator='\t')
