In [None]:
import pandas as pd
import json
from tqdm.notebook import tqdm

pd.set_option('display.max_columns', None)



The following code loads the JSON data file containing video game information. It will raise an error if the file is not found.


### Downloading and Loading JSON Data from Kaggle 

This code checks if the specified JSON file exists at the given path. If the file is not found, it raises a `FileNotFoundError`. If the file exists, it opens the file, reads its contents, and loads the JSON data into the `data` variable using `json.load()`.


In [None]:
import os
import platform
import subprocess
import shutil

# Set environment variables for Kaggle API
os.environ['KAGGLE_USERNAME'] = "joshuagilodlsu"
os.environ['KAGGLE_KEY'] = "kc50f1dd8ef3c59262be642a6792343ab"

# Download dataset using Kaggle API (force download if needed)
subprocess.run(["kaggle", "datasets", "download", "-d", "fronkongames/steam-games-dataset", "--force"])

# Unzip the downloaded dataset
if platform.system() == "Windows":
    # Windows-specific command to unzip
    subprocess.run(["powershell", "-Command", "Expand-Archive", "steam-games-dataset.zip", "-DestinationPath", "."])
else:
    # Unix-based command to unzip
    subprocess.run(["unzip", "steam-games-dataset.zip"])

json_file_path = "games.json" 
destination_folder = "data/"
destination_path = os.path.join(destination_folder, "games.json")

if not os.path.exists(destination_folder):
    os.makedirs(destination_folder)

if os.path.exists(json_file_path):
    shutil.move(json_file_path, destination_path)
    print(f"'games.json' moved to {destination_folder}.")
else:
    print(f"Error: '{json_file_path}' not found.")

csv_file_path = "games.csv"
if os.path.exists(csv_file_path):
    os.remove(csv_file_path)
    print("'games.csv' has been deleted.")
else:
    print(f"Error: '{csv_file_path}' not found.")

if os.path.exists("steam-games-dataset.zip"):
    os.remove("steam-games-dataset.zip")
    print("'steam-games-dataset.zip' has been deleted.")


In [2]:
json_file_path = 'data/games.json'
if not os.path.exists(json_file_path):
    raise FileNotFoundError(f"JSON file not found at {json_file_path}")

with open(json_file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

### Converting JSON Data to a DataFrame

This code converts the JSON data into a Pandas DataFrame, assuming the JSON is structured as a dictionary. It does the following:
- Creates a DataFrame from the dictionary, using the keys as row indexes.
- Resets the index to make it a regular column, renaming it to `AppID`.
- Converts the `AppID` column to a numeric type, coercing any errors to `NaN`, which are then filled with 0 and cast to integers.
- Finally, it prints the first few rows of the DataFrame to inspect the initial structure.


In [3]:
df = pd.DataFrame.from_dict(data, orient='index')
df.reset_index(inplace=True)
df.rename(columns={'index': 'AppID'}, inplace=True)

df['AppID'] = pd.to_numeric(df['AppID'], errors='coerce').fillna(0).astype(int)

print("Initial DataFrame:")
df.head()

Initial DataFrame:


Unnamed: 0,AppID,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,header_image,website,support_url,support_email,windows,mac,linux,metacritic_score,metacritic_url,achievements,recommendations,notes,supported_languages,full_audio_languages,packages,developers,publishers,categories,genres,screenshots,movies,user_score,score_rank,positive,negative,estimated_owners,average_playtime_forever,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,peak_ccu,tags
0,20200,Galactic Bowling,"Oct 21, 2008",0,19.99,0,Galactic Bowling is an exaggerated and stylize...,Galactic Bowling is an exaggerated and stylize...,Galactic Bowling is an exaggerated and stylize...,,https://cdn.akamai.steamstatic.com/steam/apps/...,http://www.galacticbowling.net,,,True,False,False,0,,30,0,,[English],[],"[{'title': 'Buy Galactic Bowling', 'descriptio...",[Perpetual FX Creative],[Perpetual FX Creative],"[Single-player, Multi-player, Steam Achievemen...","[Casual, Indie, Sports]",[https://cdn.akamai.steamstatic.com/steam/apps...,[http://cdn.akamai.steamstatic.com/steam/apps/...,0,,6,11,0 - 20000,0,0,0,0,0,"{'Indie': 22, 'Casual': 21, 'Sports': 21, 'Bow..."
1,655370,Train Bandit,"Oct 12, 2017",0,0.99,0,THE LAW!! Looks to be a showdown atop a train....,THE LAW!! Looks to be a showdown atop a train....,THE LAW!! Looks to be a showdown atop a train....,,https://cdn.akamai.steamstatic.com/steam/apps/...,http://trainbandit.com,,support@rustymoyher.com,True,True,False,0,,12,0,,"[English, French, Italian, German, Spanish - S...",[],"[{'title': 'Buy Train Bandit', 'description': ...",[Rusty Moyher],[Wild Rooster],"[Single-player, Steam Achievements, Full contr...","[Action, Indie]",[https://cdn.akamai.steamstatic.com/steam/apps...,[http://cdn.akamai.steamstatic.com/steam/apps/...,0,,53,5,0 - 20000,0,0,0,0,0,"{'Indie': 109, 'Action': 103, 'Pixel Graphics'..."
2,1732930,Jolt Project,"Nov 17, 2021",0,4.99,0,Jolt Project: The army now has a new robotics ...,Jolt Project: The army now has a new robotics ...,"Shoot vehicles, blow enemies with a special at...",,https://cdn.akamai.steamstatic.com/steam/apps/...,,,ramoncampiaof31@gmail.com,True,False,False,0,,0,0,,"[English, Portuguese - Brazil]",[],"[{'title': 'Buy Jolt Project', 'description': ...",[Campião Games],[Campião Games],[Single-player],"[Action, Adventure, Indie, Strategy]",[https://cdn.akamai.steamstatic.com/steam/apps...,[http://cdn.akamai.steamstatic.com/steam/apps/...,0,,0,0,0 - 20000,0,0,0,0,0,[]
3,1355720,Henosis™,"Jul 23, 2020",0,5.99,0,HENOSIS™ is a mysterious 2D Platform Puzzler w...,HENOSIS™ is a mysterious 2D Platform Puzzler w...,HENOSIS™ is a mysterious 2D Platform Puzzler w...,,https://cdn.akamai.steamstatic.com/steam/apps/...,https://henosisgame.com/,https://henosisgame.com/,info@henosisgame.com,True,True,True,0,,0,0,,"[English, French, Italian, German, Spanish - S...",[],"[{'title': 'Buy Henosis™', 'description': '', ...",[Odd Critter Games],[Odd Critter Games],"[Single-player, Full controller support]","[Adventure, Casual, Indie]",[https://cdn.akamai.steamstatic.com/steam/apps...,[http://cdn.akamai.steamstatic.com/steam/apps/...,0,,3,0,0 - 20000,0,0,0,0,0,"{'2D Platformer': 161, 'Atmospheric': 154, 'Su..."
4,1139950,Two Weeks in Painland,"Feb 3, 2020",0,0.0,0,ABOUT THE GAME Play as a hacker who has arrang...,ABOUT THE GAME Play as a hacker who has arrang...,Two Weeks in Painland is a story-driven game a...,,https://cdn.akamai.steamstatic.com/steam/apps/...,https://www.unusual-games.com/home/,https://www.unusual-games.com/contact/,welistentoyou@unusual-games.com,True,True,False,0,,17,0,This Game may contain content not appropriate ...,"[English, Spanish - Spain]",[],[],[Unusual Games],[Unusual Games],"[Single-player, Steam Achievements]","[Adventure, Indie]",[https://cdn.akamai.steamstatic.com/steam/apps...,[http://cdn.akamai.steamstatic.com/steam/apps/...,0,,50,8,0 - 20000,0,0,0,0,0,"{'Indie': 42, 'Adventure': 41, 'Nudity': 22, '..."


### Handling Missing Values and Data Type Conversions

This code performs several operations to clean and prepare the DataFrame for analysis:

- **Missing Values**: It prints the number of missing values for each column.
- **Datetime Conversion**: Converts the `release_date` column to datetime format, coercing invalid entries to `NaT`.
- **Boolean Fields**: Converts the columns `windows`, `mac`, and `linux` into boolean type (`True` or `False`).
- **Numeric Fields**: Iterates over a list of numeric fields (e.g., `required_age`, `price`, `user_score`) and converts them to numeric data types, filling any invalid entries (`NaN`) with 0.

The process ensures that the data types are correctly set for further analysis or modeling.


In [4]:
print("\nMissing values per column:")
missing_values = df.isnull().sum()
print(missing_values)


df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')


boolean_fields = ['windows', 'mac', 'linux']
for col in boolean_fields:
    df[col] = df[col].astype(bool)


numeric_fields = [
    'required_age', 'price', 'dlc_count', 'achievements', 'recommendations',
    'user_score', 'positive', 'negative', 'average_playtime_forever',
    'average_playtime_2weeks', 'median_playtime_forever', 'median_playtime_2weeks',
    'peak_ccu', 'metacritic_score'
]
for col in tqdm(numeric_fields, desc="Converting numeric fields"):
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)



Missing values per column:
AppID                       0
name                        0
release_date                0
required_age                0
price                       0
dlc_count                   0
detailed_description        0
about_the_game              0
short_description           0
reviews                     0
header_image                0
website                     0
support_url                 0
support_email               0
windows                     0
mac                         0
linux                       0
metacritic_score            0
metacritic_url              0
achievements                0
recommendations             0
notes                       0
supported_languages         0
full_audio_languages        0
packages                    0
developers                  0
publishers                  0
categories                  0
genres                      0
screenshots                 0
movies                      0
user_score                  0
score_rank  

Converting numeric fields:   0%|          | 0/14 [00:00<?, ?it/s]

Luckily, the data doesn't contain any missing values.

### Cleaning List and Dictionary Fields

This code processes columns that contain lists or dictionaries to ensure consistent formatting:

- **List Fields**: It iterates over columns such as `developers`, `publishers`, `genres`, and others, checking if the entries are lists. If not, they are replaced with empty lists.
- **Tags Field**: Converts the `tags` column from a dictionary to a list of keys, if it is in dictionary format.
- **Cleaning List Entries**: A helper function `clean_list` is defined to strip any leading/trailing whitespace from strings in lists. The function is applied to clean up all specified list fields and the `tags` field.

This ensures that all list-based columns are standardized and cleaned for further use.


In [5]:

df_game = df[[
    'AppID', 'name', 'release_date', 'required_age', 'about_the_game',
    'website', 'support_url', 'support_email', 'header_image', 'notes'
]].copy()


df_date = df[['release_date']].drop_duplicates().dropna()
df_date['DateID'] = df_date['release_date'].dt.strftime('%Y%m%d').astype(int)
df_date['Day'] = df_date['release_date'].dt.day
df_date['Month'] = df_date['release_date'].dt.month
df_date['Quarter'] = df_date['release_date'].dt.quarter
df_date['Year'] = df_date['release_date'].dt.year
df_date['Weekday'] = df_date['release_date'].dt.day_name()


df_game.rename(columns={
    'name': 'Name',
    'release_date': 'ReleaseDate',
    'required_age': 'RequiredAge',
    'about_the_game': 'AboutGame',
    'website': 'Website',
    'support_url': 'SupportURL',
    'support_email': 'SupportEmail',
    'header_image': 'HeaderImage'
}, inplace=True)


In [6]:

list_fields = ['developers', 'publishers', 'genres', 'categories', 'supported_languages', 'full_audio_languages', 'screenshots', 'movies']
for field in list_fields:
    df[field] = df[field].apply(lambda x: x if isinstance(x, list) else [])


df['tags'] = df['tags'].apply(lambda x: list(x.keys()) if isinstance(x, dict) else [])


def clean_list(lst):
    return [item.strip() for item in lst if isinstance(item, str)]


for field in tqdm(list_fields + ['tags'], desc="Cleaning list fields"):
    df[field] = df[field].apply(clean_list)


Cleaning list fields:   0%|          | 0/9 [00:00<?, ?it/s]

### Parsing and Averaging Estimated Owners

This code processes the `estimated_owners` field, which contains a range of values:

- **Splitting Owner Ranges**: The `estimated_owners` column, which contains strings like "1,000 - 5,000", is cleaned by removing commas and splitting the range into two new columns, `OwnersMin` and `OwnersMax`.
- **Converting to Numeric**: Both `OwnersMin` and `OwnersMax` are converted to numeric data types, with invalid entries replaced by 0.
- **Calculating Average Owners**: A new column `EstimatedOwners` is created, which holds the average of `OwnersMin` and `OwnersMax`.
- **Dropping Temporary Columns**: The intermediate columns `OwnersMin`, `OwnersMax`, and `estimated_owners` are removed from the DataFrame to clean up.

This ensures that the estimated owner range is averaged and stored as a single numeric value for easier analysis.

In [7]:

df[['OwnersMin', 'OwnersMax']] = df['estimated_owners'].str.replace(',', '').str.split(' - ', expand=True)
df['OwnersMin'] = pd.to_numeric(df['OwnersMin'], errors='coerce').fillna(0)
df['OwnersMax'] = pd.to_numeric(df['OwnersMax'], errors='coerce').fillna(0)


df['EstimatedOwners'] = ((df['OwnersMin'] + df['OwnersMax']) / 2).astype(int)


df.drop(['OwnersMin', 'OwnersMax', 'estimated_owners'], axis=1, inplace=True)


### Creating a Game Information DataFrame

This code creates a new DataFrame `df_game` that contains selected columns from the original DataFrame, focusing on key game information:

- **Selected Columns**: The columns such as `AppID`, `name`, `release_date`, `about_the_game`, and others are copied into a new DataFrame.
- **Renaming Columns**: The column names are renamed to a more readable format (e.g., `name` becomes `Name`, `release_date` becomes `ReleaseDate`, and so on).
  
This results in a cleaner and more structured DataFrame containing essential game information for further analysis or display.


In [8]:

df_game = df[[
    'AppID', 'name', 'release_date', 'required_age', 'about_the_game', 'website',
    'support_url', 'support_email', 'header_image', 'notes'
]].copy()


df_game.rename(columns={
    'name': 'Name',
    'release_date': 'ReleaseDate',
    'required_age': 'RequiredAge',
    'about_the_game': 'AboutGame',
    'website': 'Website',
    'support_url': 'SupportURL',
    'support_email': 'SupportEmail',
    'header_image': 'HeaderImage',
    'notes': 'Notes'
}, inplace=True)


### Extracting and Cleaning Entity Data

This code processes several fields that contain lists of entities (e.g., developers, publishers, genres) and creates clean DataFrames for each:

- **Entity Mapping**: A dictionary maps the original field names (e.g., `developers`, `publishers`) to more readable entity names (e.g., `Developer`, `Publisher`).
- **Processing Each Field**: For each field, the following steps are performed:
  - The field is "exploded" (flattened) so each `AppID` is associated with one entity per row.
  - The entities are stripped of any leading or trailing whitespace.
  - A DataFrame of unique entities is created, containing an auto-generated ID and the entity name (e.g., `DeveloperID`, `DeveloperName`).
- **Global DataFrames**: The cleaned entity DataFrames (e.g., `df_developer`, `df_publisher`) are stored in global variables for later use.

This allows the creation of clean, normalized tables for each entity type.


In [9]:

entities = {
    'developers': 'Developer',
    'publishers': 'Publisher',
    'genres': 'Genre',
    'categories': 'Category',
    'supported_languages': 'Language',
    'tags': 'Tag'
}

for field, entity in tqdm(entities.items(), desc="Processing entity tables"):
    df_entity = df[['AppID', field]].explode(field)
    df_entity[field] = df_entity[field].str.strip()
    unique_entities = df_entity[field].dropna().unique()
    df_entity_cleaned = pd.DataFrame(unique_entities, columns=[f'{entity}Name'])
    df_entity_cleaned.reset_index(inplace=True)
    df_entity_cleaned.rename(columns={'index': f'{entity}ID'}, inplace=True)
    globals()[f'df_{entity.lower()}'] = df_entity_cleaned


Processing entity tables:   0%|          | 0/6 [00:00<?, ?it/s]

### Creating Bridge Tables for Many-to-Many Relationships

This code creates bridge tables to handle many-to-many relationships between games and various entities (e.g., developers, publishers, genres):

- **Exploding Fields**: For each entity field (e.g., `developers`, `publishers`), the column is exploded so each `AppID` is associated with a single entity per row.
- **Merging with Entity Tables**: The exploded data is merged with the corresponding entity DataFrame (e.g., `df_developer`, `df_publisher`) to match the entity name with its unique ID.
- **Creating Bridge Tables**: The resulting bridge tables (`bridge_game_developer`, `bridge_game_publisher`, etc.) contain only `AppID` and the corresponding entity ID, with duplicates removed.

These bridge tables facilitate many-to-many relationships between games and their associated entities for use in relational databases.


In [10]:

for field, entity in tqdm(entities.items(), desc="Creating bridge tables"):
    bridge_table = df[['AppID', field]].explode(field)
    bridge_table[field] = bridge_table[field].str.strip()
    bridge_table = bridge_table.merge(globals()[f'df_{entity.lower()}'], left_on=field, right_on=f'{entity}Name', how='left')
    globals()[f'bridge_game_{entity.lower()}'] = bridge_table[['AppID', f'{entity}ID']].drop_duplicates()


Creating bridge tables:   0%|          | 0/6 [00:00<?, ?it/s]

### Creating the Fact Table for Game Metrics

This code creates a fact table `df_fact`, which contains key numerical metrics for each game:

- **Selecting and Renaming Columns**: The relevant columns are selected (e.g., `price`, `peak_ccu`, `average_playtime_forever`) and renamed for clarity (e.g., `price` to `Price`, `peak_ccu` to `PeakCCU`).
- **GameID Assignment**: The `AppID` is duplicated as `GameID` and then removed from the DataFrame to keep the data model consistent.
- **Numeric Conversion**: The selected numeric fields (e.g., `Price`, `EstimatedOwners`, `PositiveReviews`) are converted to numeric types, with invalid entries filled with 0.
- **GameID as Integer**: Ensures the `GameID` column is of integer type for consistency.

This results in a cleaned and properly typed fact table containing the key game metrics for analysis or integration into a data warehouse.


In [11]:

df_fact = df[[
    'AppID', 'price', 'EstimatedOwners', 'peak_ccu',
    'average_playtime_forever', 'average_playtime_2weeks',
    'median_playtime_forever', 'median_playtime_2weeks',
    'positive', 'negative', 'metacritic_score', 'user_score'
]].copy()


df_fact.rename(columns={
    'price': 'Price',
    'peak_ccu': 'PeakCCU',
    'average_playtime_forever': 'AvgPlaytimeForever',
    'average_playtime_2weeks': 'AvgPlaytimeTwoWeeks',
    'median_playtime_forever': 'MedianPlaytimeForever',
    'median_playtime_2weeks': 'MedianPlaytimeTwoWeeks',
    'positive': 'PositiveReviews',
    'negative': 'NegativeReviews',
    'metacritic_score': 'MetacriticScore',
    'user_score': 'UserScore'
}, inplace=True)


df_fact['GameID'] = df_fact['AppID']
df_fact.drop('AppID', axis=1, inplace=True)


numeric_fields_fact = [
    'Price', 'EstimatedOwners', 'PeakCCU', 'AvgPlaytimeForever',
    'AvgPlaytimeTwoWeeks', 'MedianPlaytimeForever', 'MedianPlaytimeTwoWeeks',
    'PositiveReviews', 'NegativeReviews', 'MetacriticScore', 'UserScore'
]

df_fact[numeric_fields_fact] = df_fact[numeric_fields_fact].apply(pd.to_numeric, errors='coerce').fillna(0)


df_fact['GameID'] = df_fact['GameID'].astype(int)


print("\nData types in Fact table:")
print(df_fact.dtypes)



Data types in Fact table:
Price                     float64
EstimatedOwners             int64
PeakCCU                     int64
AvgPlaytimeForever          int64
AvgPlaytimeTwoWeeks         int64
MedianPlaytimeForever       int64
MedianPlaytimeTwoWeeks      int64
PositiveReviews             int64
NegativeReviews             int64
MetacriticScore             int64
UserScore                   int64
GameID                      int64
dtype: object


### Inserting Data into the Database

This code is responsible for populating the database with data from the DataFrame, including games, their associated entities (e.g., developers, publishers), and fact metrics. The following steps are performed:

- **Database Connection and Table Creation**: It connects to the database and creates the necessary tables if they don't already exist.
  
- **Game Insertion**: 
  - The `insert_games()` function collects game data from the `df_game` DataFrame.
  - It handles cases where the `Notes` field may exceed character limits, truncating the value if necessary.
  - It checks for existing entries in the `DimGame` table before inserting new records in batches.

- **Entity Insertion**: 
  - The `insert_entities()` function handles the insertion of related entities (e.g., developers, publishers) and the creation of bridge tables.
  - It checks for duplicates and inserts new records only when necessary, ensuring that relationships between games and entities are properly established.

- **Fact Metrics Insertion**: 
  - The `insert_fact_metrics()` function inserts game metrics (e.g., playtime, reviews, scores) from the `df_fact` DataFrame into the `FactGameMetrics` table.
  - It skips games that already have metrics in the table to avoid duplication.

- **Batch Processing and Error Handling**: 
  - Data is inserted in batches to optimize performance using Peewee's `chunked()` method.
  - Error handling is in place to catch exceptions and handle cases like duplicate entries or missing games.

- **Database Disconnection**: After all data is inserted, the database connection is closed.

This ensures efficient and structured loading of data into the database, handling both dimensional and fact tables with proper relationships.


In [13]:
from peewee import chunked, IntegrityError
from models.models import (
    db, DimGame, DimDeveloper, DimPublisher, DimGenre, DimCategory, DimLanguage, DimTag,
    BridgeGameDeveloper, BridgeGamePublisher, BridgeGameGenre, BridgeGameCategory, BridgeGameLanguage, BridgeGameTag,
    FactGameMetrics
)

if db.is_closed():
    db.connect()

db.create_tables([
    DimGame, DimDeveloper, DimPublisher, DimGenre, DimCategory, DimLanguage, DimTag,
    BridgeGameDeveloper, BridgeGamePublisher, BridgeGameGenre, BridgeGameCategory, BridgeGameLanguage, BridgeGameTag,
    FactGameMetrics
])

def insert_games(df_game, batch_size=100):
    games_to_insert = []
    for _, row in tqdm(df_game.iterrows(), total=len(df_game), desc="Collecting games"):
        try:
                        notes_value = row.get('Notes', None)
            if notes_value is None or pd.isnull(notes_value):
                notes_value = None
            elif len(str(notes_value)) > 65535:                  notes_value = notes_value[:65535]  
                        if not DimGame.get_or_none(DimGame.AppID == row['AppID']):
                games_to_insert.append({
                    'AppID': row['AppID'],
                    'Name': row['Name'],
                    'ReleaseDate': row['ReleaseDate'],
                    'RequiredAge': row['RequiredAge'],
                    'AboutGame': row['AboutGame'],
                    'Website': row['Website'],
                    'SupportURL': row['SupportURL'],
                    'SupportEmail': row['SupportEmail'],
                    'HeaderImage': row['HeaderImage'],
                    'Notes': notes_value
                })
        except Exception as e:
            print(f"Error processing AppID {row['AppID']}: {e}")

        if games_to_insert:
        with db.atomic():
            for batch in chunked(games_to_insert, batch_size):
                DimGame.insert_many(batch).execute()

def insert_entities(df, entity_name, model_class, bridge_class, field_name, batch_size=100):
    bridge_to_insert = []

    for _, row in tqdm(df.iterrows(), total=len(df), desc=f"Collecting {entity_name.lower()}s"):
        entities = row[field_name]
        if entities:
            game = DimGame.get_or_none(DimGame.AppID == row['AppID'])
            if game is None:
                continue

            for entity_value in entities:
                try:
                    entity_obj, created = model_class.get_or_create(**{f"{entity_name}Name": entity_value})

                                        if not bridge_class.get_or_none(
                        (bridge_class.GameID == game.GameID) &
                        (getattr(bridge_class, f"{entity_name}ID") == getattr(entity_obj, f"{entity_name}ID"))
                    ):
                        bridge_to_insert.append({
                            'GameID': game.GameID,
                            f'{entity_name}ID': getattr(entity_obj, f"{entity_name}ID")
                        })
                except Exception as e:
                    print(f"Error processing {entity_name} '{entity_value}' for AppID {row['AppID']}: {e}")

        if bridge_to_insert:
        with db.atomic():
            for batch in chunked(bridge_to_insert, batch_size):
                try:
                    bridge_class.insert_many(batch).execute()
                except IntegrityError as e:
                    print(f"Duplicate entry found, skipping batch: {e}")

insert_games(df_game)

insert_entities(df, 'Developer', DimDeveloper, BridgeGameDeveloper, 'developers')

insert_entities(df, 'Publisher', DimPublisher, BridgeGamePublisher, 'publishers')

insert_entities(df, 'Genre', DimGenre, BridgeGameGenre, 'genres')

insert_entities(df, 'Category', DimCategory, BridgeGameCategory, 'categories')

insert_entities(df, 'Language', DimLanguage, BridgeGameLanguage, 'supported_languages')

insert_entities(df, 'Tag', DimTag, BridgeGameTag, 'tags')

def insert_fact_metrics(df_fact, batch_size=100):
    metrics_to_insert = []
    for _, row in tqdm(df_fact.iterrows(), total=len(df_fact), desc="Collecting fact metrics"):
        try:
            game = DimGame.get_or_none(DimGame.AppID == row['GameID'])
            if game is None:
                continue

                        if FactGameMetrics.get_or_none(FactGameMetrics.GameID == game.GameID):
                continue

            metrics_to_insert.append({
                'GameID': game.GameID,
                'Price': row['Price'],
                'EstimatedOwners': row['EstimatedOwners'],
                'PeakCCU': row['PeakCCU'],
                'AvgPlaytimeForever': row['AvgPlaytimeForever'],
                'AvgPlaytimeTwoWeeks': row['AvgPlaytimeTwoWeeks'],
                'MedianPlaytimeForever': row['MedianPlaytimeForever'],
                'MedianPlaytimeTwoWeeks': row['MedianPlaytimeTwoWeeks'],
                'PositiveReviews': row['PositiveReviews'],
                'NegativeReviews': row['NegativeReviews'],
                'MetacriticScore': row['MetacriticScore'],
                'UserScore': row['UserScore']
            })
        except Exception as e:
            print(f"Error processing metrics for GameID {row['GameID']}: {e}")

    if metrics_to_insert:
        with db.atomic():
            for batch in chunked(metrics_to_insert, batch_size):
                FactGameMetrics.insert_many(batch).execute()

insert_fact_metrics(df_fact)

db.close()


Collecting games:   0%|          | 0/97410 [00:00<?, ?it/s]

Collecting developers:   0%|          | 0/97410 [00:00<?, ?it/s]

Duplicate entry found, skipping batch: (1062, "Duplicate entry '1330-1327' for key 'bridgegamedeveloper_GameID_id_DeveloperID_id'")
Duplicate entry found, skipping batch: (1062, "Duplicate entry '1750-1728' for key 'bridgegamedeveloper_GameID_id_DeveloperID_id'")
Duplicate entry found, skipping batch: (1062, "Duplicate entry '3599-3384' for key 'bridgegamedeveloper_GameID_id_DeveloperID_id'")
Duplicate entry found, skipping batch: (1062, "Duplicate entry '6161-5689' for key 'bridgegamedeveloper_GameID_id_DeveloperID_id'")
Duplicate entry found, skipping batch: (1062, "Duplicate entry '6497-5971' for key 'bridgegamedeveloper_GameID_id_DeveloperID_id'")
Duplicate entry found, skipping batch: (1062, "Duplicate entry '9859-8631' for key 'bridgegamedeveloper_GameID_id_DeveloperID_id'")
Duplicate entry found, skipping batch: (1062, "Duplicate entry '12555-10693' for key 'bridgegamedeveloper_GameID_id_DeveloperID_id'")
Duplicate entry found, skipping batch: (1062, "Duplicate entry '12627-1073

Collecting publishers:   0%|          | 0/97410 [00:00<?, ?it/s]

Duplicate entry found, skipping batch: (1062, "Duplicate entry '6497-4973' for key 'bridgegamepublisher_GameID_id_PublisherID_id'")
Duplicate entry found, skipping batch: (1062, "Duplicate entry '9859-7142' for key 'bridgegamepublisher_GameID_id_PublisherID_id'")
Duplicate entry found, skipping batch: (1062, "Duplicate entry '12555-8762' for key 'bridgegamepublisher_GameID_id_PublisherID_id'")
Duplicate entry found, skipping batch: (1062, "Duplicate entry '21045-13461' for key 'bridgegamepublisher_GameID_id_PublisherID_id'")
Duplicate entry found, skipping batch: (1062, "Duplicate entry '22694-8762' for key 'bridgegamepublisher_GameID_id_PublisherID_id'")
Duplicate entry found, skipping batch: (1062, "Duplicate entry '22924-14412' for key 'bridgegamepublisher_GameID_id_PublisherID_id'")
Duplicate entry found, skipping batch: (1062, "Duplicate entry '26706-16386' for key 'bridgegamepublisher_GameID_id_PublisherID_id'")
Duplicate entry found, skipping batch: (1062, "Duplicate entry '2697

Collecting genres:   0%|          | 0/97410 [00:00<?, ?it/s]

Collecting categorys:   0%|          | 0/97410 [00:00<?, ?it/s]

Duplicate entry found, skipping batch: (1062, "Duplicate entry '472-24' for key 'bridgegamecategory_GameID_id_CategoryID_id'")
Duplicate entry found, skipping batch: (1062, "Duplicate entry '5426-24' for key 'bridgegamecategory_GameID_id_CategoryID_id'")
Duplicate entry found, skipping batch: (1062, "Duplicate entry '18153-24' for key 'bridgegamecategory_GameID_id_CategoryID_id'")
Duplicate entry found, skipping batch: (1062, "Duplicate entry '20896-24' for key 'bridgegamecategory_GameID_id_CategoryID_id'")
Duplicate entry found, skipping batch: (1062, "Duplicate entry '20967-24' for key 'bridgegamecategory_GameID_id_CategoryID_id'")
Duplicate entry found, skipping batch: (1062, "Duplicate entry '22479-24' for key 'bridgegamecategory_GameID_id_CategoryID_id'")
Duplicate entry found, skipping batch: (1062, "Duplicate entry '26555-24' for key 'bridgegamecategory_GameID_id_CategoryID_id'")
Duplicate entry found, skipping batch: (1062, "Duplicate entry '28065-24' for key 'bridgegamecategor

Collecting languages:   0%|          | 0/97410 [00:00<?, ?it/s]

Collecting tags:   0%|          | 0/97410 [00:00<?, ?it/s]

Collecting fact metrics:   0%|          | 0/97410 [00:00<?, ?it/s]

True