Task 1: Data Filtering

In [None]:
import dask.dataframe as dd
from langdetect import detect, LangDetectException
import pandas as pd

# CONFIGURATION
# Input: pre-filtered review CSV; output: cleaned, English-only reviews.
INPUT_FILE = '../data/weighted_score_above_08.csv' 
OUTPUT_FILE = '../data/filtered_reviews.csv'

# STEP 1: LOAD DATA
# Read the pre-filtered CSV (per the file name, reviews with a weighted
# helpfulness score above 0.8). Only the columns listed here are loaded.
cols_to_read = [
    'language', 
    'review', 
    'voted_up', 
    'weighted_vote_score', 
    'votes_funny', 
    'steam_purchase', 
    'received_for_free', 
    'written_during_early_access', 
    'author_playtime_at_review', 
    'author_num_games_owned', 
    'author_num_reviews', 
    'game'
]

# NOTE(review): on_bad_lines='skip' silently drops rows that fail to parse.
# If reviews contain newlines inside quoted fields, dask may split a record
# across blocks and those rows would be lost -- consider blocksize=None if
# row counts look low. TODO confirm against the raw file.
df = dd.read_csv(
    INPUT_FILE, 
    usecols=cols_to_read,
    dtype={
        # Numeric columns are read as float so missing values do not break
        # dask's per-partition dtype checks.
        'votes_funny': 'float',
        'weighted_vote_score': 'float',
        'author_playtime_at_review': 'float',
        'author_num_games_owned': 'float',
        'author_num_reviews': 'float',
        'voted_up': 'object'       # Reads True/False safely
    },
    quotechar='"', 
    doublequote=True,
    on_bad_lines='skip'
)

# STEP 2: FILTER FOR ENGLISH
# Use the dataset's own 'language' metadata column to keep English reviews.
# The column is dropped in STEP 4 because it is constant after this filter.
if 'language' in df.columns:
    print("Column 'language' found. Filtering by metadata...")
    df_english = df[df['language'] == 'english']
else:
    raise ValueError("The 'language' column was not found in the dataset")

# STEP 3: CLEANUP TEXT
# Collapse embedded line breaks to a single space (avoids unusual line
# terminators in the output CSV), then trim surrounding whitespace.
# .assign returns a new frame, so re-running this cell is idempotent.
df_english = df_english.assign(
    review=df_english['review']
    .str.replace(r'[\n\r]+', ' ', regex=True)
    .str.strip()
)

# STEP 4: SELECT COLUMNS
# Keep every loaded column except 'language', which is now redundant.
final_columns = [c for c in cols_to_read if c != 'language']
df_final = df_english[final_columns]

# STEP 5: SAVE
# Drop rows whose review text is missing just before saving.
df_final = df_final.dropna(subset=['review'])

print(f"Saving filtered English reviews to {OUTPUT_FILE}...")

# to_csv triggers the actual (lazy) dask computation and writes the result.
# Write df_final -- not df_english -- so the column selection and the NaN
# drop above actually take effect in the saved file.
df_final.to_csv(OUTPUT_FILE, index=False, single_file=True)

print("Filtering Complete :D")

Column 'language' found. Filtering by metadata...
Saving filtered English reviews to ../data/filtered_reviews.csv...


  df = reader(bio, **kwargs)
  df = reader(bio, **kwargs)


ValueError: Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.

+----------------------+--------+----------+
| Column               | Found  | Expected |
+----------------------+--------+----------+
| steam_china_location | object | float64  |
+----------------------+--------+----------+

The following columns also raised exceptions on conversion:

- steam_china_location
  ValueError("could not convert string to float: '广东'")

Usually this is due to dask's dtype inference failing, and
*may* be fixed by specifying dtypes manually by adding:

dtype={'steam_china_location': 'object'}

to the call to `read_csv`/`read_table`.