In [1]:
# Install required libraries (if not already installed)
!pip install google-cloud-bigquery pandas --quiet

In [2]:
# Import necessary libraries
import pandas as pd
from google.cloud import bigquery

In [3]:
# Authenticate with a Google account so the BigQuery client can obtain credentials.
from google.colab import auth

try:
    auth.authenticate_user()
except Exception as e:
    # Report the failure and re-raise: every later cell needs valid credentials,
    # so carrying on would only surface a more confusing error downstream.
    print(f"Authentication failed: {e}")
    raise
else:
    print("🔐 Successfully authenticated!")

🔐 Successfully authenticated!


In [4]:
# BigQuery settings: the project the client is bound to and the dataset
# whose tables are referenced by the cells below.
project_id = "meu-projeto-dbt"
dataset = "meu_dataset"

# One client instance, reused by every query in this notebook.
client = bigquery.Client(project=project_id)

In [5]:
# Fully-qualified names (`project.dataset.table`) of the gold tables, keyed by
# the short label used in the rest of the notebook. The gold tables are the
# final outputs after data cleaning and transformations.
gold_tables = {
    label: f"{project_id}.{dataset}.{table_name}"
    for label, table_name in [
        ("investment", "gold_investment"),
        ("kpi", "gold_kpi"),
        ("auxiliary", "gold_auxiliar"),
    ]
}

In [6]:
# Preview the first five rows of every gold table: a quick sanity check of
# the column names and of what the values look like.
for name in gold_tables:
    table = gold_tables[name]
    print(f"\nTable preview: {name}")
    preview_sql = f"SELECT * FROM `{table}` LIMIT 5"
    df = client.query(preview_sql).to_dataframe()
    display(df)


Table preview: investment


Unnamed: 0,date,tv,radio,ooh,meta,google,tiktok,display_video
0,2023-02-14,8759.71,1390.35,2306.92,4421.34,10015.09,6613.61,2463.46
1,2023-03-07,7145.45,978.46,2192.76,5137.27,7862.07,5314.27,2160.86
2,2023-06-07,8126.6,1554.39,806.24,4769.09,8078.9,6677.8,2071.34
3,2023-06-24,6187.79,1638.28,649.09,3725.09,6842.62,4282.0,2002.26
4,2023-07-28,0.0,601.41,1111.68,3704.31,4936.4,4055.44,1973.56



Table preview: kpi


Unnamed: 0,date,sales
0,2023-09-20,6014510.61
1,2023-09-21,5618447.61
2,2023-02-22,10252475.06
3,2023-03-13,11335665.6
4,2023-06-05,18619219.45



Table preview: auxiliary


Unnamed: 0,date,holiday
0,2023-03-08,0.0
1,2023-07-01,0.0
2,2024-07-20,0.0
3,2024-08-09,0.0
4,2024-09-16,0.0


In [7]:
# The KPI table must carry a `sales` value on every row.
kpi_sql = f"SELECT * FROM `{gold_tables['kpi']}`"
df_kpi = client.query(kpi_sql).to_dataframe()

sales_has_nulls = df_kpi['sales'].isna().any()
assert not sales_has_nulls, "There are still null values in the `sales` column!"
print("The 'sales' column contains no null values.")

The 'sales' column contains no null values.


In [8]:
# Validate that all investment columns are fully populated.
df_inv = client.query(f"SELECT * FROM `{gold_tables['investment']}`").to_dataframe()
investment_columns = ['tv', 'radio', 'ooh', 'meta', 'google', 'tiktok', 'display_video']

# Count nulls per column so that a failure names the offending channels
# instead of only reporting that a null exists somewhere.
null_counts = df_inv[investment_columns].isnull().sum()
columns_with_nulls = null_counts[null_counts > 0].to_dict()
assert not columns_with_nulls, (
    f"There are still null values in investment columns! Null counts: {columns_with_nulls}"
)
print("The investment columns contain no null values.")

The investment columns column contains no null values.


In [9]:
# `holiday` may be 0 or 1 but must be present on every row of the auxiliary table.
auxiliary_sql = f"SELECT * FROM `{gold_tables['auxiliary']}`"
df_aux = client.query(auxiliary_sql).to_dataframe()

holiday_has_nulls = df_aux['holiday'].isna().any()
assert not holiday_has_nulls, "There are still null values in the `holiday` column!"
print("The `holiday` column contains no null values.")

The `holiday` column contains no null values.


In [10]:
# Sales must be strictly positive on every row; collect any rows that are not.
non_positive_mask = df_kpi['sales'].le(0)
invalid_sales = df_kpi.loc[non_positive_mask]
assert len(invalid_sales) == 0, f"There are {len(invalid_sales)} rows with `sales` <= 0."
print("All `sales` values are greater than 0.")

All `sales` values are greater than 0.


In [11]:
# Investment may be zero but never negative. Columns are checked one at a time
# so the failure message names the first offending column.
for col in investment_columns:
    invalid_rows = df_inv.loc[df_inv[col].lt(0)]
    assert len(invalid_rows) == 0, f"There are negative values in the `{col}` column."
print("All investment values are greater than or equal to 0 in every column.")

All investment values are greater than or equal to 0 in every column.


In [12]:
# Validate that the `holiday` column contains only binary values (0 or 1).
# The raw values are compared directly: casting to int first would truncate
# fractional values (e.g. 0.5 -> 0, 1.9 -> 1) and let them pass as binary.
unique_values = df_aux['holiday'].dropna().unique()
non_binary_values = [v for v in unique_values.tolist() if v not in (0, 1)]
if non_binary_values:
    raise ValueError(f"The `holiday` column contains non-binary values: {non_binary_values}")
print("The `holiday` column contains only binary values (0 and 1).")

The `holiday` column contains only binary values (0 and 1).


In [13]:
# Join the three gold tables into a single DataFrame.
# The investment table drives the join (LEFT JOIN on `date`), giving a unified
# view for analysis or modeling.
query = f"""
SELECT
    inv.date,
    inv.tv, inv.radio, inv.ooh, inv.meta, inv.google, inv.tiktok, inv.display_video,
    kpi.sales,
    aux.holiday
FROM `{gold_tables['investment']}` inv
LEFT JOIN `{gold_tables['kpi']}` kpi ON inv.date = kpi.date
LEFT JOIN `{gold_tables['auxiliary']}` aux ON inv.date = aux.date
"""

df_joined = client.query(query).to_dataframe().assign(
    # Normalise `date` to datetime; values that cannot be parsed become NaT.
    date=lambda frame: pd.to_datetime(frame['date'], errors='coerce'),
    # Missing `holiday` values (e.g. dates with no auxiliary row) become 0,
    # and the column is stored as integer.
    holiday=lambda frame: frame['holiday'].fillna(0).astype(int),
)

print("Final joined table created successfully.")
df_joined.sort_values('date').head()


Final joined table created successfully.


Unnamed: 0,date,tv,radio,ooh,meta,google,tiktok,display_video,sales,holiday
295,2023-01-01,6588.79,1162.52,990.38,2977.19,8932.64,4556.6,2315.72,35768254.89,1
691,2023-01-02,7834.65,1513.92,1639.37,3026.25,8681.33,3851.36,1966.45,10222134.66,0
131,2023-01-03,7444.58,786.07,1800.9,4366.05,7732.49,3646.35,1977.41,9791019.64,0
645,2023-01-04,6731.67,1431.54,0.0,4129.13,7624.79,3143.86,2372.24,9989567.91,0
594,2023-01-05,5470.03,0.0,0.0,4638.42,6035.7,5192.94,1765.8,6910931.76,0
