In [5]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm.auto import tqdm

# Set up logging
import logging

# INFO level so the progress messages emitted below are shown.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load all parquet files with Dask. The result is a Dask DataFrame built
# from every file matching the glob.
logger.info("Loading parquet files...")
df = dd.read_parquet("parquet_output_2_extras_with_names/*.parquet")

# Keep only the rows whose 'review_language' column equals 'english'.
df = df[df['review_language'] == 'english']

# Check the number of rows before sampling.
# len() on a Dask DataFrame runs the computation and returns a plain int,
# replacing the manual map_partitions(len).compute() + .sum() sequence (and
# the rebinding of total_rows from a per-partition Series to a scalar).
# NOTE(review): because Dask can see what len() needs, it may read fewer
# columns than the opaque map_partitions(len) did — depends on Dask version.
total_rows = len(df)
logger.info(f"Total rows before sampling: {total_rows}")

# Sample 10% of the filtered data and materialise it as a pandas DataFrame.
# A fixed random_state is passed for reproducibility.
logger.info("Sampling data...")
df_sample = df.sample(frac=0.1, random_state=42).compute()

logger.info(f"Sample shape: {df_sample.shape}")
# Last expression of the cell: the notebook renders the first rows.
df_sample.head()

INFO:__main__:Loading parquet files...
INFO:__main__:Total rows before sampling: 17475878
INFO:__main__:Sampling data...
INFO:__main__:Sample shape: (1747112, 29)


Unnamed: 0,name,steam_appid,required_age,is_free,controller_support,detailed_description,about_the_game,short_description,price_overview,metacritic_score,...,author_num_reviews,author_playtime_forever,author_play_time_last_two_weeks,author_playtime_at_review,author_last_played,review,voted_up,votes_up,votes_funny,weighted_vote_score
59466,Counter-Strike,10,0,False,,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...,$9.99,88,...,2,9515.0,0.0,9507.0,1387817000.0,"Would recommend anytime, best competetive FPS ...",True,0,1,0.5
63382,Counter-Strike,10,0,False,,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...,$9.99,88,...,1,1095.0,0.0,637.0,1539682000.0,[table] [tr]  [td]⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀...,True,1,0,0.509804
52936,Counter-Strike,10,0,False,,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...,$9.99,88,...,7,626.0,0.0,371.0,1641059000.0,Getting flamed by 20 year olds is always fun.,True,0,0,0.5
84145,Counter-Strike,10,0,False,,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...,$9.99,88,...,14,471.0,4.0,419.0,1743653000.0,csgo on min specs,True,1,0,0.52381
38855,Counter-Strike,10,0,False,,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...,$9.99,88,...,1,47189.0,0.0,9965.0,1741425000.0,"Лучший шутер,занимающий лидирующее место с 199...",True,0,0,0.5
