The objective of the final data pipeline is parsing each column of the CSV file into the correct data type and save the new data as Parquet file.

# 1. `games_description.csv`

The file wraps all columns as a string. Attempt to parse the data type from the start did not work.

Schema:
```
name: string
short_description: string
long_description: string
genres: object (array[string])
minimum_system_requirement: object (struct[string])
recommend_system_requirement: object (struct[string])
release_date: date
developer: object (array[string])
publisher: object (array[string])
overall_player_rating: categorical
number_of_reviews_from_purchased_people: int32
number_of_english_reviews: int32
link: string
```

In [74]:
import polars as pl
from pathlib import Path
import re

local_dir = Path("/teamspace/studios/this_studio/Steam-RecSys/data_pipeline/data")


def parse_reviews(value):
    if "%" in value:
        # Extract percentage and total number
        match = re.search(r"(\d+)% of ([\d,]+)", value)
        if match:
            percentage = int(match.group(1))
            total = int(match.group(2).replace(",", ""))
            return int((percentage / 100) * total)
    else:
        # Extract the number directly
        match = re.search(r"\(([\d,]+)\)", value)
        if match:
            return int(match.group(1).replace(",", ""))


def parse_system_requirements(requirements_list):
    result = {}
    for item in requirements_list:
        if ":" in item:
            key, value = item.split(":")[:2]
            result[key.strip()] = value.strip()
    return result


df = pl.read_csv(local_dir / "games_description.csv")
df = df.with_columns(
    pl.col("genres").str.replace_many(["]", "'", "["], "").str.split(", "),
    pl.col("number_of_english_reviews").str.replace_all(",", "").cast(pl.Int32),
    pl.col(["minimum_system_requirement", "recommend_system_requirement"])
    .str.replace_many(["]", "'", "["], "")
    .str.split(", ")
    .map_elements(parse_system_requirements, return_dtype=pl.Object),
    pl.col(["developer", "publisher"])
    .str.replace_many(["]", "'", "["], "")
    .str.split(", "),
    pl.col("overall_player_rating").cast(pl.Categorical("lexical")),
    pl.when(pl.col("release_date").str.contains(r"\d{1,2} \w{3}, \d{4}"))
    .then(pl.col("release_date").str.to_date("%d %b, %Y", strict=False))
    .otherwise(pl.col("release_date").str.to_date("%b %Y", strict=False))
    .alias("release_date"),
    pl.col("number_of_reviews_from_purchased_people").map_elements(
        parse_reviews, return_dtype=pl.Int32
    ),
)

df

name,short_description,long_description,genres,minimum_system_requirement,recommend_system_requirement,release_date,developer,publisher,overall_player_rating,number_of_reviews_from_purchased_people,number_of_english_reviews,link
str,str,str,list[str],object,object,date,list[str],list[str],cat,i32,i32,str
"""Black Myth: Wukong""","""Black Myth: Wukong is an actio…","""About This Game  Black M…","[""Mythology"", ""Action RPG"", … ""Violent""]","{'OS': 'Windows 10 64-bit', 'Processor': 'Intel Core i5-8400 / AMD Ryzen 5 1600', 'Memory': '16 GB RAM', 'Graphics': 'NVIDIA GeForce GTX 1060 6GB / AMD Radeon RX 580 8GB', 'DirectX': 'Version 11', 'Storage': '130 GB available space', 'Sound Card': 'Windows Compatible Audio Device', 'Additional Notes': 'HDD Supported'}","{'OS': 'Windows 10 64-bit', 'Processor': 'Intel Core i7-9700 / AMD Ryzen 5 5500', 'Memory': '16 GB RAM', 'Graphics': 'NVIDIA GeForce RTX 2060 / AMD Radeon RX 5700 XT / INTEL Arc A750', 'DirectX': 'Version 12', 'Storage': '130 GB available space', 'Sound Card': 'Windows Compatible Audio Device', 'Additional Notes': 'SSD Required. The above specifications were tested with DLSS/FSR/XeSS enabled.'}",2024-08-19,"[""Game Science""]","[""Game Science""]","""Overwhelmingly Positive""",654820,51931,"""https://store.steampowered.com…"
"""Counter-Strike 2""","""For over two decades, Counter-…","""About This Game  For ove…","[""FPS"", ""Shooter"", … ""Moddable""]","{'OS': 'Windows® 10', 'Processor': '4 hardware CPU threads - Intel® Core™ i5 750 or higher', 'Memory': '8 GB RAM', 'Graphics': 'Video card must be 1 GB or more and should be a DirectX 11-compatible with support for Shader Model 5.0', 'DirectX': 'Version 11', 'Storage': '85 GB available space'}","{'OS': 'Windows® 10', 'Processor': '4 hardware CPU threads - Intel® Core™ i5 750 or higher', 'Memory': '8 GB RAM', 'Graphics': 'Video card must be 1 GB or more and should be a DirectX 11-compatible with support for Shader Model 5.0', 'DirectX': 'Version 11', 'Storage': '85 GB available space'}",2012-08-21,"[""Valve""]","[""Valve""]","""Very Positive""",8313603,2258990,"""https://store.steampowered.com…"
"""Warhammer 40,000: Space Marine…","""Embody the superhuman skill an…","""About This Game Embody the sup…","[""Warhammer 40K"", ""Action"", … ""Futuristic""]","{'OS': 'Windows 10 (1903 min)/11 64-bit', 'Processor': 'AMD Ryzen 5 2600X / Intel Core i5-8600K', 'Memory': '8 GB RAM', 'Graphics': '6 GB VRAM', 'DirectX': 'Version 12', 'Storage': '75 GB available space', 'Additional Notes': 'SSD required. 30 FPS in 1920x1080 with ""Low"" preset.'}","{'OS': 'Windows 10 (1903 min)/11 64-bit', 'Processor': 'AMD Ryzen 7 5800X / Intel Core i7-12700', 'Memory': '16 GB RAM', 'Graphics': '8 GB VRAM', 'DirectX': 'Version 12', 'Storage': '75 GB available space', 'Additional Notes': 'SSD required. 60 FPS in 1920x1080 with the ""Ultra"" preset.'}",2024-09-09,"[""Saber Interactive""]","[""Focus Entertainment""]","""Very Positive""",50860,51920,"""https://store.steampowered.com…"
"""Cyberpunk 2077""","""Cyberpunk 2077 is an open-worl…","""About This Game Cyberpunk 2077…","[""Cyberpunk"", ""Open World"", … ""Immersive Sim""]","{'OS': '64-bit Windows 10', 'Processor': 'Core i7-6700 or Ryzen 5 1600', 'Memory': '12 GB RAM', 'Graphics': 'GeForce GTX 1060 6GB or Radeon RX 580 8GB or Arc A380', 'DirectX': 'Version 12', 'Storage': '70 GB available space', 'Additional Notes': 'SSD required. Attention'}","{'OS': '64-bit Windows 10', 'Processor': 'Core i7-12700 or Ryzen 7 7800X3D', 'Memory': '16 GB RAM', 'Graphics': 'GeForce RTX 2060 SUPER or Radeon RX 5700 XT or Arc A770', 'DirectX': 'Version 12', 'Storage': '70 GB available space', 'Additional Notes': 'SSD required.'}",2020-12-10,"[""CD PROJEKT RED""]","[""CD PROJEKT RED""]","""Very Positive""",680264,324124,"""https://store.steampowered.com…"
"""ELDEN RING""","""THE CRITICALLY ACCLAIMED FANTA…","""About This Game THE CRITICALLY…","[""Souls-like"", ""Dark Fantasy"", … ""Family Friendly""]","{'OS': 'Windows 10', 'Processor': 'INTEL CORE I5-8400 or AMD RYZEN 3 3300X', 'Memory': '12 GB RAM', 'Graphics': 'NVIDIA GEFORCE GTX 1060 3 GB or AMD RADEON RX 580 4 GB', 'DirectX': 'Version 12', 'Storage': '60 GB available space', 'Sound Card': 'Windows Compatible Audio Device', 'Additional Notes': ''}","{'OS': 'Windows 10/11', 'Processor': 'INTEL CORE I7-8700K or AMD RYZEN 5 3600X', 'Memory': '16 GB RAM', 'Graphics': 'NVIDIA GEFORCE GTX 1070 8 GB or AMD RADEON RX VEGA 56 8 GB', 'DirectX': 'Version 12', 'Storage': '60 GB available space', 'Sound Card': 'Windows Compatible Audio Device', 'Additional Notes': ''}",2022-02-25,"[""FromSoftware"", ""Inc.""]","[""FromSoftware"", ""Inc."", ""Bandai Namco Entertainment""]","""Very Positive""",705261,491741,"""https://store.steampowered.com…"
…,…,…,…,…,…,…,…,…,…,…,…,…
"""MXGP PRO""","""Adjust your bike's configurati…","""About This Game  The off…","[""Simulation"", ""Racing"", … ""Offroad""]","{'OS *': 'Windows 7 64-Bit or later', 'Processor': 'Intel Core i5-2500,\\xa0 AMD FX-8100 or equivalent', 'Memory': '8 GB RAM', 'Graphics': 'NVIDIA GeForce GTX 760 with 2 GB VRAM or more / AMD Radeon HD 7950 with 2 GB VRAM or more', 'DirectX': 'Version 11', 'Storage': '15 GB available space', 'Sound Card': 'DirectX compatible', 'Additional Notes': '*Laptop versions of graphics cards may work but are not officially supported.'}","{'OS *': 'Windows 7 64-Bit or later', 'Processor': 'Intel Core\\xa0i7-2600', 'Memory': '16 GB RAM', 'Graphics': 'NVIDIA GeForce GTX 960 with 4 GB VRAM or more | AMD Radeon R9 380 with 4 GB VRAM or more', 'DirectX': 'Version 11', 'Storage': '15 GB available space', 'Sound Card': 'DirectX compatible', 'Additional Notes': '*Laptop versions of graphics cards may work but are not officially supported.'}",2018-06-29,"[""Milestone S.r.l.""]","[""Milestone S.r.l.""]","""Very Positive""",777,433,"""https://store.steampowered.com…"
"""MXGP 2019 - The Official Motoc…","""Experience the excitement of t…","""About This Game  The off…","[""Simulation"", ""Racing"", … ""Multiplayer""]","{'OS *': 'Windows 7 64-bit or later versions', 'Processor': 'Intel Core i5 4460 3.2 GHz / AMD A10-7850K', 'Memory': '8 GB RAM', 'Graphics': 'NVIDIA GeForce GTX 760 2GB / AMD Radeon R7 370X 2GB', 'DirectX': 'Version 11', 'Storage': '15 GB available space', 'Sound Card': 'DirectX compatible', 'Additional Notes': '*Laptop versions of graphics cards may work but are not officially supported.'}","{'OS': 'Windows 10 64-bit', 'Processor': 'Intel Core i7-4820K 3.7 GHz / AMD Ryzen 7 1700x', 'Memory': '16 GB RAM', 'Graphics': 'NVIDIA GeForce GTX 1060 6GB / AMD Radeon RX 480 8GB', 'DirectX': 'Version 11', 'Storage': '15 GB available space', 'Sound Card': 'DirectX compatible', 'Additional Notes': '*Laptop versions of graphics cards may work but are not officially supported.'}",2019-08-27,"[""Milestone S.r.l.""]","[""Milestone S.r.l.""]","""Very Positive""",772,446,"""https://store.steampowered.com…"
"""Ride 2""","""The fastest and most iconic bi…","""About This Game  Welcome…","[""Racing"", ""Sports"", … ""Controller""]","{'OS *': 'Windows® 7 SP1 / Windows® 8 / Windows® 8.1 / Windows® 10', 'Processor': 'Intel i5 2500K 3.3GHz / AMD Phenom II X4 850 or equivalent', 'Memory': '4 GB RAM', 'Graphics': 'GeForce GT 640 / Radeon HD 6670 1GB*', 'DirectX': 'Version 10', 'Storage': '33 GB available space', 'Sound Card': 'DirectX compatible', 'Additional Notes': '*Laptop versions of graphics cards may work but are not officially supported.'}","{'OS *': 'Windows® 7 SP1 64-Bit / Windows® 8 64-Bit / Windows® 8.1 64-Bit / Windows® 10', 'Processor': 'Intel Core i5 4670K 3.4 GHz / AMD FX-9590 4.7 GHz or equivalent', 'Memory': '8 GB RAM', 'Graphics': 'GeForce GTX 970 / AMD Radeon R9 390 4GB*', 'DirectX': 'Version 11', 'Storage': '33 GB available space', 'Additional Notes': '*Laptop versions of graphics cards may work but are not officially supported.'}",2016-10-07,"[""Milestone S.r.l.""]","[""Milestone S.r.l.""]","""Very Positive""",2008,1045,"""https://store.steampowered.com…"
"""Tanuki Sunset""","""Master the longboard as you dr…","""About This Game Grab Your Long…","[""Racing"", ""Action"", … ""Casual""]","{'OS *': 'Windows 7 (32-bit)', 'Processor': 'Dual Core 2.4Ghz', 'Memory': '2048 MB RAM', 'Graphics': 'GeForce 8800 GT / AMD HD 6850 / Intel HD Graphics 4400 or above', 'DirectX': 'Version 11', 'Storage': '500 MB available space', 'Sound Card': 'DirectX Compatible Sound Card', 'Additional Notes': 'Gamepad Recommended'}","{'OS *': 'Windows 7 (32-bit)', 'Processor': 'Dual Core 2.4Ghz', 'Memory': '2048 MB RAM', 'Graphics': 'GeForce 8800 GT / AMD HD 6850 / Intel HD Graphics 4400 or above', 'DirectX': 'Version 11', 'Storage': '500 MB available space', 'Sound Card': 'DirectX Compatible Sound Card', 'Additional Notes': 'Gamepad Recommended'}",2020-12-04,"[""Rewind Games""]","[""Rewind Games""]","""Very Positive""",390,451,"""https://store.steampowered.com…"
