In [3]:
# 01_extract_english_only.py
import pandas as pd
from pathlib import Path

# Mixed-language review dump (input) and the English-only extract this cell writes.
INPUT  = r"D:\SteamProject\steam_reviews.csv"
OUTPUT = r"D:\SteamProject\shared\steam_reviews_english_only.csv"

# Only these columns are parsed from INPUT (passed to read_csv as usecols).
cols = [
    'app_id', 'app_name', 'review_id', 'language', 'review',
    'timestamp_created', 'recommended', 'votes_helpful', 'votes_funny',
    'weighted_vote_score', 'steam_purchase', 'received_for_free',
    'written_during_early_access'
]

print("Extracting English reviews from 21 M mixed...")
chunks = []

# Parse INPUT 100,000 rows at a time and keep only the rows whose `language`
# is exactly 'english'; the unfiltered file is never held in memory as a whole.
for chunk in pd.read_csv(INPUT, chunksize=100_000, usecols=cols):
    # Boolean-mask indexing already returns a new frame, so an extra .copy()
    # would only duplicate the filtered rows once more.
    english = chunk[chunk['language'] == 'english']
    if not english.empty:
        chunks.append(english)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# with the expected columns when no chunk contained an English review.
# NOTE(review): all filtered rows are still held in memory here; if that ever
# becomes a problem, append each chunk to OUTPUT instead of collecting them.
if chunks:
    final = pd.concat(chunks, ignore_index=True)
else:
    final = pd.DataFrame(columns=cols)

# to_csv does not create missing directories, so make sure the target exists.
Path(OUTPUT).parent.mkdir(parents=True, exist_ok=True)
final.to_csv(OUTPUT, index=False)
print(f"EXTRACTED: {len(final):,} English reviews → {OUTPUT}")
print(f"Size: {Path(OUTPUT).stat().st_size / (1024**3):.2f} GB")

Extracting English reviews from 21 M mixed...
EXTRACTED: 9,635,437 English reviews → D:\SteamProject\shared\steam_reviews_english_only.csv
Size: 2.77 GB


In [4]:
# 02_merge_final.py
import pandas as pd

FILE1 = r"D:\SteamProject\shared\steam_reviews_english_only.csv"     # ~9.6 M rows (extraction step reported 9,635,437)
FILE2 = r"D:\SteamProject\Steam reviews only english.csv"           # 6.4 M
OUTPUT = r"D:\SteamProject\shared\steam_reviews_combined_english.csv"

df1 = pd.read_csv(FILE1)
df2 = pd.read_csv(FILE2)

# Shared schema both inputs are aligned to before concatenation.
common_cols = [
    'app_id', 'app_name', 'review_id', 'language', 'review',
    'timestamp_created', 'recommended', 'votes_helpful', 'votes_funny',
    'weighted_vote_score', 'steam_purchase', 'received_for_free',
    'written_during_early_access'
]

# reindex() silently creates an all-missing column for every name FILE2 does
# not have, so report those columns instead of hiding the schema mismatch.
missing_cols = [c for c in common_cols if c not in df2.columns]
if missing_cols:
    print(f"WARNING: columns not found in FILE2 (left blank): {missing_cols}")

# Missing values become "" so they are written as blank CSV fields.
# NOTE(review): this also turns any numeric column containing gaps into a
# mixed object column in memory — convert before doing arithmetic on it.
df2 = df2.reindex(columns=common_cols).fillna("")
df2['language'] = 'english'

combined = pd.concat([df1, df2], ignore_index=True)

# De-duplicate on review_id, but only among rows that actually have one.
# duplicated() treats all missing values (and all "" placeholders) as equal,
# so a plain drop_duplicates would collapse every ID-less row into a single
# row and silently discard the rest. The recorded run ended with fewer rows
# than FILE1 alone, which may indicate exactly that — TODO confirm against
# FILE2's header.
ids = combined['review_id']
lacks_id = ids.isna() | ids.eq("")
repeated_id = combined.duplicated(subset=['review_id'], keep='first')
combined = combined[lacks_id | ~repeated_id]

if lacks_id.any():
    print(f"WARNING: {int(lacks_id.sum()):,} rows have no review_id and were kept without de-duplication")

combined.to_csv(OUTPUT, index=False)
print(f"FINAL: {len(combined):,} English reviews → {OUTPUT}")

FINAL: 9,580,669 English reviews → D:\SteamProject\shared\steam_reviews_combined_english.csv
