# Wedge Task 2
___
The second task of this project draws a random sample of card holders (owners) from the Wedge data and extracts all of their transactions.
### Connect to GBQ
___

In [1]:
import random
from pathlib import Path

import pandas as pd
from google.cloud import bigquery

In [2]:
# Open the BigQuery connection used by every query below. No project or
# credentials are passed here, so the client relies on whatever the library
# picks up from the environment.
client = bigquery.Client()

### Build a List of Owners
___


In [3]:
# Collect every distinct card number in the transactions table except card 3
# (presumably the shared placeholder for non-owner purchases -- confirm).
# Rows with a NULL card_no are dropped as well, since NULL != 3 is not true.
query = """
    SELECT DISTINCT card_no
    FROM `wedge-to-the-cloud.wedge_to_the_dataset.transactions`
    WHERE card_no != 3
"""

# Submit the query to BigQuery
query_job = client.query(query)

# Iterating the job yields one row per card number; keep only the value
card_nos = [record.card_no for record in query_job]

### Take a Sample of the Owners
___

In [4]:
# Sampling parameters: how many owners to draw, and a fixed seed so that
# re-running the notebook selects the same owners every time.
SAMPLE_SIZE = 175
RANDOM_SEED = 42

# Sort before sampling: the query that produced card_nos has no ORDER BY, so
# the list order can differ between runs, and a seeded draw is only repeatable
# when its input order is stable. A dedicated Random instance keeps the draw
# independent of any other use of the global random state.
# random.sample raises ValueError if fewer than SAMPLE_SIZE card numbers exist.
sample_card_nos = random.Random(RANDOM_SEED).sample(sorted(card_nos), SAMPLE_SIZE)

# Convert the sampled card numbers to a comma-separated string
sample_card_nos_str = ', '.join(str(card) for card in sample_card_nos)

sample_card_nos_str

'16384, 35266, 16671, 52518, 15783, 48680, 51795, 50317, 40998, 42428, 18035, 18294, 19902, 49256, 49476, 17724, 42378, 23664, 24567, 17367, 10809, 42393, 19091, 13863, 65324, 30264, 11967, 25863, 14078, 16672, 49214, 18038, 26770, 52625, 21753, 14284, 17288, 65306, 13721, 44276, 50093, 47522, 24222, 25600, 42868, 16575, 38716, 14349, 52693, 23013, 64285, 16662, 19735, 11487, 24197, 48812, 11276, 35705, 22991, 25152, 21532, 22789, 16513, 48015, 15929, 49846, 50560, 19370, 30243, 30341, 11541, 21353, 18168, 37587, 14659, 23050, 52254, 35020, 10137, 24110, 25115, 42269, 48381, 40987, 14042, 19527, 16278, 17153, 38616, 44629, 12898, 14607, 42490, 11602, 49255, 16106, 49443, 18904, 44880, 65739, 25267, 24806, 11596, 18083, 51427, 21621, 10202, 23322, 34307, 15988, 16451, 16038, 49173, 36339, 42464, 40718, 20661, 63433, 10557, 65920, 40237, 46387, 26896, 51472, 13124, 44399, 50308, 20495, 17261, 20748, 21986, 18334, 16101, 18380, 14055, 11010, 12302, 18586, 51759, 16832, 52066, 51034, 25464

### Extract Records and Save Locally
___

In [5]:
# Pull every transaction column for the sampled owners; the comma-separated
# card numbers are spliced directly into the IN (...) list.
query = f"""
    SELECT *
    FROM `wedge-to-the-cloud.wedge_to_the_dataset.transactions`
    WHERE card_no IN ({sample_card_nos_str})
"""

# Submit the query to BigQuery
query_job = client.query(query)

# Iterating the job yields rows; materialize each one as a plain dict
results = list(map(dict, query_job))

In [6]:
# Build a pandas DataFrame from the list of row dicts collected above
# (one DataFrame row per transaction record, one column per dict key).
results_df = pd.DataFrame(results)

In [7]:
# Total footprint of the frame in bytes; deep=True also measures the contents
# of object-dtype columns instead of only the references to them.
memory_bytes = results_df.memory_usage(deep=True).sum()

# Express the footprint in units of 1024 * 1024 bytes
BYTES_PER_MB = 1024 * 1024
memory_mb = memory_bytes / BYTES_PER_MB

print(f"DataFrame size: {memory_mb:.2f} MB")

DataFrame size: 280.37 MB


In [8]:
# Save the sampled transactions as comma-separated text (no index column).
# The target folder is created first so that a fresh checkout without a
# data/ directory does not make to_csv fail.
output_path = Path('data/card_no_sample.txt')
output_path.parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(output_path, index=False, sep=',')