In [1]:
import gzip, ast, pandas as pd
from multiprocessing import Pool, cpu_count
import re

In [2]:
def load_python_dict_gz(path):
    """Load a gzip-compressed text file of Python literals into a DataFrame.

    The file is read as UTF-8 text, one record per line. Each line is
    stripped of surrounding whitespace; lines that are empty after stripping
    are skipped, and every remaining line is parsed with
    ``ast.literal_eval`` (literal parsing only, no code execution).

    Parameters
    ----------
    path : path accepted by ``gzip.open``
        Location of the ``.gz`` file.

    Returns
    -------
    pandas.DataFrame
        One row per parsed line, built with ``pd.DataFrame`` from the list
        of parsed objects.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a line is not a valid
        Python literal.
    """
    with gzip.open(path, mode='rt', encoding='utf-8') as handle:
        stripped_lines = (raw.strip() for raw in handle)
        # The comprehension must be consumed while the file is still open.
        records = [ast.literal_eval(text) for text in stripped_lines if text]
    return pd.DataFrame(records)


In [None]:
# Load the two raw inputs from the working directory, in this order:
# the per-user item libraries first, then the per-user reviews.
items, reviews = (
    pd.read_parquet(parquet_name)
    for parquet_name in ("items.parquet", "reviews.parquet")
)

In [4]:
# Preview the first three rows of the raw items table (one row per user;
# the "items" column holds that user's list of item dictionaries).
items.head(3)

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864385,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,137,76561198007712555,http://steamcommunity.com/id/evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."


In [5]:
# Inspect the nested "items" value of the first row to see the per-item keys
# (item_id, item_name, playtime_2weeks, playtime_forever).
items.iloc[0]['items']

array([{'item_id': '10', 'item_name': 'Counter-Strike', 'playtime_2weeks': 0, 'playtime_forever': 6},
       {'item_id': '20', 'item_name': 'Team Fortress Classic', 'playtime_2weeks': 0, 'playtime_forever': 0},
       {'item_id': '30', 'item_name': 'Day of Defeat', 'playtime_2weeks': 0, 'playtime_forever': 7},
       {'item_id': '40', 'item_name': 'Deathmatch Classic', 'playtime_2weeks': 0, 'playtime_forever': 0},
       {'item_id': '50', 'item_name': 'Half-Life: Opposing Force', 'playtime_2weeks': 0, 'playtime_forever': 0},
       {'item_id': '60', 'item_name': 'Ricochet', 'playtime_2weeks': 0, 'playtime_forever': 0},
       {'item_id': '70', 'item_name': 'Half-Life', 'playtime_2weeks': 0, 'playtime_forever': 0},
       {'item_id': '130', 'item_name': 'Half-Life: Blue Shift', 'playtime_2weeks': 0, 'playtime_forever': 0},
       {'item_id': '300', 'item_name': 'Day of Defeat: Source', 'playtime_2weeks': 0, 'playtime_forever': 4733},
       {'item_id': '240', 'item_name': 'Counter-Strik

In [6]:
# Preview the first three rows of the raw reviews table (one row per user;
# the "reviews" column holds that user's list of review dictionaries).
reviews.head(3)

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'helpful': 'No ratings yet', 'i..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'helpful': '15 of 20 people (75..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'helpful': 'No ratings yet', 'i..."


In [7]:
# Inspect the nested "reviews" value of the second row to see the per-review
# keys and the free-text format of the "helpful" field.
reviews.iloc[1]['reviews']

array([{'funny': '', 'helpful': '15 of 20 people (75%) found this review helpful', 'item_id': '251610', 'last_edited': '', 'posted': 'Posted June 24, 2014.', 'recommend': True, 'review': 'I know what you think when you see this title "Barbie Dreamhouse Party" but do not be intimidated by it\'s title, this is easily one of my GOTYs. You don\'t get any of that cliche game mechanics that all the latest games have, this is simply good core gameplay. Yes, you can\'t 360 noscope your friends, but what you can do is show them up with your bad ♥♥♥ dance moves and put them to shame as you show them what true fashion and color combinations are.I know this game says for kids but, this is easily for any age range and any age will have a blast playing this.8/8'},
       {'funny': '', 'helpful': '0 of 1 people (0%) found this review helpful', 'item_id': '227300', 'last_edited': '', 'posted': 'Posted September 8, 2013.', 'recommend': True, 'review': "For a simple (it's actually not all that simple bu

In [12]:
# Flatten the per-user item lists into one row per (user, item).
items_cop = items.copy()

# Each element of the "items" list-like becomes its own row; the index is
# reset so it lines up positionally with the normalized details below.
items_exp = items_cop.explode("items").reset_index(drop=True)

# Turn each item dictionary into columns (item_id, item_name, playtime_*).
item_details = pd.json_normalize(items_exp["items"])

items_flat = (
    pd.concat([items_exp.drop(columns=["items"]), item_details], axis=1)
    # Missing playtimes are treated as zero before the integer cast.
    .fillna({"playtime_2weeks": 0, "playtime_forever": 0})
    .astype({"playtime_2weeks": int, "playtime_forever": int})
    # Lifetime playtime divided by 60 (minutes -> hours).
    .assign(playtime_hours_lifetime=lambda flat: flat["playtime_forever"] / 60)
)

In [9]:
# Flatten the per-user review lists into one row per (user, review) and parse
# the free-text "helpful" field into numeric columns.
reviews_cop = reviews.copy()

# explode() emits a NaN placeholder row for any user whose review list is
# empty. Those rows are dropped: the casts below would otherwise fabricate
# values for them (NaN -> recommend=True, review -> the literal text "nan").
reviews_exp = (
    reviews_cop.explode("reviews")
    .dropna(subset=["reviews"])
    .reset_index(drop=True)
)
review_details = pd.json_normalize(reviews_exp["reviews"])
reviews_flat = pd.concat([reviews_exp.drop(columns=["reviews"]), review_details], axis=1)

reviews_flat["recommend"] = reviews_flat["recommend"].astype(bool)
reviews_flat["review"] = reviews_flat["review"].astype(str)

# helpful to numeric
reviews_flat["helpful"] = reviews_flat["helpful"].fillna("No ratings yet")
# The counts may carry thousands separators ("1,234 of 1,500 people ...").
# A plain \d+ fails to match those strings, which would silently record the
# most-voted reviews as 0/0/0, so commas are accepted here and stripped below.
pattern = (
    r"(?P<helpful_up>\d[\d,]*)\s+of\s+(?P<helpful_total>\d[\d,]*)\s+people\s+"
    r"\((?P<helpful_pct>\d+)%\)\s+found this review helpful"
)
helpful_parsed = reviews_flat["helpful"].str.extract(pattern)
helpful_parsed = helpful_parsed.apply(
    lambda col: col.str.replace(",", "", regex=False)
)
helpful_parsed = helpful_parsed.astype("float")

# If it's NaN (e.g., "No ratings yet"), set to 0
helpful_parsed = helpful_parsed.fillna(0)

# Store the three parsed columns as integers
helpful_parsed = helpful_parsed.astype(int)

reviews_flat = pd.concat([reviews_flat, helpful_parsed], axis=1)


In [None]:
# Column notes for the flattened items table:
# - playtime_forever: total minutes played
# - playtime_hours_lifetime: playtime_forever / 60, i.e. total hours played
# - playtime_2weeks: presumably minutes played in the last two weeks (Steam API field) -- TODO confirm
# The item info that was stored in dictionaries is expanded into columns for easier feature access.

items_flat.head(3)

Unnamed: 0,user_id,items_count,steam_id,user_url,item_id,item_name,playtime_2weeks,playtime_forever,playtime_hours_lifetime
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,10,Counter-Strike,0,6,0.1
1,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,20,Team Fortress Classic,0,0,0.0
2,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,30,Day of Defeat,0,7,0.116667


In [None]:
# The review dictionaries are expanded into columns as well, and the "helpful"
# text is parsed into three integer columns: helpful_up of helpful_total voters
# found the review helpful, and helpful_pct is the percentage as printed in the
# text (it is extracted, not recomputed). Text that does not match the pattern,
# such as "No ratings yet", yields 0 in all three columns.
reviews_flat.head(5)

Unnamed: 0,user_id,user_url,funny,helpful,item_id,last_edited,posted,recommend,review,helpful_up,helpful_total,helpful_pct
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,No ratings yet,1250,,"Posted November 5, 2011.",True,Simple yet with great replayability. In my opi...,0,0,0
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,No ratings yet,22200,,"Posted July 15, 2011.",True,It's unique and worth a playthrough.,0,0,0
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,No ratings yet,43110,,"Posted April 21, 2011.",True,Great atmosphere. The gunplay can be a bit chu...,0,0,0
3,js41637,http://steamcommunity.com/id/js41637,,15 of 20 people (75%) found this review helpful,251610,,"Posted June 24, 2014.",True,I know what you think when you see this title ...,15,20,75
4,js41637,http://steamcommunity.com/id/js41637,,0 of 1 people (0%) found this review helpful,227300,,"Posted September 8, 2013.",True,For a simple (it's actually not all that simpl...,0,1,0
