# Bronze Layer 🥉

## Imports ⬇️

In [10]:
import pandas as pd # Used as our "Excel tool" to create a dataframe or tables
import datetime
import os # used to communicate with our folder file system 
import numpy as np # used for numerical operations

## Set Variables ➡️

In [11]:
# Show where the notebook is running from; the relative path below is resolved against it.
cwd = os.getcwd()
print('File Path: ' + cwd)

# The source files are expected in a `mario_data` folder one level above the working directory.
folder = '../mario_data'

# The relative folder is used as-is as the data location.
data_path = folder
print('Data Location: ' + data_path)


File Path: c:\Users\Hans\OneDrive\Documents\CSULB\TA Job\IS\Mario ETL\To Students
Data Location: ../mario_data


## Get all folders and files 📂

In [12]:
# Get all folders and files in the data location as a list.
# os.listdir returns entries in arbitrary order, so sort them: the printed
# listing and any positional access (e.g. folder_files[0]) become reproducible.
folder_files = sorted(os.listdir(data_path))
for f in folder_files:
    print(f)

mario_data_20250101.csv
mario_data_20250201.csv
mario_data_20250301.csv
mario_data_20250401.jsonl
mario_data_20250501.jsonl
mario_data_20250601.jsonl


## Take a peek at the data 👀

In [13]:
# Create a dataframe from the first CSV file in the folder.
# The file is picked by extension rather than by position because the folder
# also holds .jsonl files, which read_csv is not meant to parse.
csv_files = [name for name in folder_files if name.endswith('.csv')]
df_peek = pd.read_csv(os.path.join(data_path, csv_files[0]))
# Read first 5 rows
df_peek.head()

Unnamed: 0,Player Name,Team,World,Vehicle Type,Companion,Kart Racing Rank,Platforming Rank,Boss Battle Rank,Power-Ups Used,Kart Role,Team Points,Lives Lost,Participation in Battle Mode,Mushroom Cup Participation,Power-Ups Owned,Coins Spent in Toad Town,Levels Completed,Times Hit by Enemies,Primary Game
0,Yoshi,Toad Brigade,Yoshi's Island,Comet Bike,pOLTERPUP,A,A,A,12,Drifter,-34.0,0.0,Yes,No,1-Up Mushroom,64,26,4.26,Mario Tennis Aces
1,PeachK,GREEN CAPS,Donut Plains,Circuit Special,kOOPA tROOPA,C,A,B,16,Drifter,149.0,4.0,No,No,"Red Shell, Super Star",335,40,5.0,Mario Tennis Aces
2,Waluigi,,Yoshi's Island,Biddybuggy,Goomba,D,A,C,26,Blocker,174.0,1.0,No,Yes,Green Shell,182,57,5.5,Mario Kart 8 Deluxe
3,Yoshi,Toad Brigade,Star World,Pipe Frame,Goomba,C,D,A,23,Drifter,-1.0,5.0,No,Yes,1-Up Mushroom,333,84,6.0,Super Mario Bros.
4,Bowser Jr.,Koopa Clan,Mushroom Kingdom,Pipe Frame,tOAD,C,C,B,10,Blocker,28.0,2.0,Yes,No,"Red Shell, Banana Peel, Fire Flower",461,55,7.0,Super Mario World


## Combine all CSV and JSONL files 🗃️

In [14]:
# Collect one dataframe per source file, then concatenate once at the end.
# Adding the frames to a list and calling pd.concat a single time is the
# efficient way to combine/append many files.
df_list = []

# Sorted so the row order of the combined frame does not depend on the
# arbitrary order os.listdir happens to return.
for file in sorted(os.listdir(data_path)):
    print(file)
    file_path = os.path.join(data_path, file)
    if file.endswith('.jsonl'):
        # NOTE(review): .jsonl normally means one JSON object per line, which
        # needs lines=True. The recorded run loaded these files without it, so
        # they are presumably single JSON documents - confirm before changing.
        temp_df = pd.read_json(file_path)
    elif file.endswith('.csv'):
        temp_df = pd.read_csv(file_path)
    else:
        # Skip anything that is not a known data file. Without this branch the
        # previous file's frame would be appended a second time (or temp_df
        # would be undefined if the unknown file came first).
        print('  skipped (unsupported file type)')
        continue

    df_list.append(temp_df)

# Concatenate all files once iteration is complete.
# We don't want to start from 0 as an index for each file; the combined frame
# gets its own continuous row index.
df = pd.concat(df_list, ignore_index=True)
print('Completed Dataframe Concatenation')

mario_data_20250101.csv
mario_data_20250201.csv
mario_data_20250301.csv
mario_data_20250401.jsonl
mario_data_20250501.jsonl
mario_data_20250601.jsonl
Completed Dataframe Concatenation
Completed Dataframe Concatenation


In [15]:
# Summarise the combined dataframe: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48000 entries, 0 to 47999
Data columns (total 19 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Player Name                   38960 non-null  object 
 1   Team                          45040 non-null  object 
 2   World                         48000 non-null  object 
 3   Vehicle Type                  48000 non-null  object 
 4   Companion                     48000 non-null  object 
 5   Kart Racing Rank              38720 non-null  object 
 6   Platforming Rank              48000 non-null  object 
 7   Boss Battle Rank              48000 non-null  object 
 8   Power-Ups Used                48000 non-null  int64  
 9   Kart Role                     48000 non-null  object 
 10  Team Points                   48000 non-null  float64
 11  Lives Lost                    48000 non-null  float64
 12  Participation in Battle Mode  48000 non-null  object 
 13  M

## Save Bronze Layer ⬇️

In [16]:
# Persist the combined dataframe as the bronze-layer parquet file.
bronze_path = '../Medallion Architecture/bronze/bronze_transactions.parquet'
# Create the target folder first so the write does not fail when the
# directory is missing (e.g. on a fresh copy of the project).
os.makedirs(os.path.dirname(bronze_path), exist_ok=True)
df.to_parquet(bronze_path, index=False)

In [17]:
# --- Data Integrity Checks ---

def get_metrics(df):
    """Summarise a dataframe for before/after integrity comparisons.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to summarise.

    Returns
    -------
    dict
        ``rows`` and ``columns`` (the frame's shape), ``unique_counts``
        (column -> number of distinct non-null values, for object, string
        and categorical columns) and ``sums`` (column -> sum, for numeric
        columns). Columns of any other dtype, including booleans, are
        left out of both mappings.
    """
    metrics = {
        'rows': df.shape[0],
        'columns': df.shape[1],
        'unique_counts': {},
        'sums': {},
    }
    for col in df.columns:
        dtype = df[col].dtype
        # Use the pandas dtype predicates rather than np.issubdtype, which
        # raises TypeError on pandas extension dtypes (nullable Int64,
        # string dtype) and therefore cannot classify those columns.
        is_text_like = (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )
        if is_text_like:
            metrics['unique_counts'][col] = df[col].nunique()
        elif pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype):
            # is_numeric_dtype also accepts booleans; exclude them so that
            # boolean columns stay out of the sums.
            metrics['sums'][col] = df[col].sum()
    return metrics

# Collect metrics for each source file before combining.
# Each file is re-read from disk so the metrics describe the untouched sources.
source_metrics = []
for file in os.listdir(data_path):
    if file.endswith('.jsonl'):
        temp_df = pd.read_json(os.path.join(data_path, file))
    elif file.endswith('.csv'):
        temp_df = pd.read_csv(os.path.join(data_path, file))
    else:
        # Not a data file this notebook knows how to load
        continue
    source_metrics.append({'file': file, 'metrics': get_metrics(temp_df)})

# Metrics for combined dataframe
combined_metrics = get_metrics(df)

print('--- Source File Metrics ---')
for entry in source_metrics:
    print(f"File: {entry['file']}")
    print(entry['metrics'])
    print()

print('--- Combined DataFrame Metrics ---')
print(combined_metrics)

# Actually reconcile the two sides: every source row must be present in the
# combined frame. Done after the printout so the metrics are visible even
# when the check fails.
source_rows = sum(entry['metrics']['rows'] for entry in source_metrics)
assert source_rows == combined_metrics['rows'], (
    f"Row count mismatch: sources have {source_rows} rows, "
    f"combined dataframe has {combined_metrics['rows']}"
)
print(f'Row count check passed: {source_rows} rows')

--- Source File Metrics ---
File: mario_data_20250101.csv
{'rows': 8000, 'columns': 19, 'unique_counts': {'Player Name': 315, 'Team': 16, 'World': 99, 'Vehicle Type': 70, 'Companion': 14, 'Kart Racing Rank': 5, 'Platforming Rank': 5, 'Boss Battle Rank': 5, 'Kart Role': 344, 'Participation in Battle Mode': 2, 'Mushroom Cup Participation': 2, 'Power-Ups Owned': 259, 'Primary Game': 70}, 'sums': {'Power-Ups Used': np.int64(139165), 'Team Points': np.float64(397857.62790900003), 'Lives Lost': np.float64(20591.496549000003), 'Coins Spent in Toad Town': np.int64(1990255), 'Levels Completed': np.int64(500313), 'Times Hit by Enemies': np.float64(40307.642886)}}

File: mario_data_20250201.csv
{'rows': 8000, 'columns': 19, 'unique_counts': {'Player Name': 83, 'Team': 16, 'World': 75, 'Vehicle Type': 70, 'Companion': 14, 'Kart Racing Rank': 5, 'Platforming Rank': 5, 'Boss Battle Rank': 5, 'Kart Role': 863, 'Participation in Battle Mode': 2, 'Mushroom Cup Participation': 2, 'Power-Ups Owned': 259,