# Steam Reviews Data Collection

**Dying Light 2: The Beast - Review Scraper**

This notebook demonstrates how to scrape Steam game reviews for training data collection.

---

## Setup

First, let's import the required libraries and modules.

In [None]:
# Auto-reload modules (useful during development)
%load_ext autoreload
%autoreload 2

import sys
from pathlib import Path

# Add src to path (for Google Colab compatibility)
if 'google.colab' in sys.modules:
    # In Colab, mount drive and navigate to project
    from google.colab import drive
    drive.mount('/content/drive')
    # Update this path to your project location in Google Drive
    project_path = Path('/content/drive/MyDrive/szkolenie_techland')
    sys.path.insert(0, str(project_path))
else:
    # Local execution
    project_path = Path('.').parent
    sys.path.insert(0, str(project_path))

print(f"Project path: {project_path}")

In [None]:
# Import our custom modules
from src.scraper.steam_api import SteamReviewScraper, quick_scrape
from src.scraper.utils import (
    save_to_formats,
    get_review_statistics,
)
from src.utils.config import (
    DYING_LIGHT_BEAST_APP_ID,
    RAW_DATA_DIR,
)

# Standard libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Configure plotting
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline

print("✅ All modules imported successfully!")

---

## 1. Explore Steam API

Let's first test the API connection and see what a single page of reviews looks like.

In [None]:
# Build the scraper for the app id stored in DYING_LIGHT_BEAST_APP_ID
scraper = SteamReviewScraper(app_id=DYING_LIGHT_BEAST_APP_ID)

# Echo the endpoint and app id the scraper holds, as a quick sanity check
for label, attribute in (("Steam API Endpoint", "endpoint"), ("App ID", "app_id")):
    print(f"{label}: {getattr(scraper, attribute)}")

In [None]:
# Pull one small page of reviews so the response structure can be inspected
print("Fetching first page of negative reviews...\n")

page_request = {
    "cursor": "*",
    "review_type": "negative",
    "language": "english",
    "num_per_page": 10,  # Small sample
}
batch = scraper.fetch_review_page(**page_request)

print(f"✅ Successfully fetched {len(batch)} reviews")

# Totals carried in the batch's query summary
summary = batch.query_summary
print("\nQuery Summary:")
print(f"  Total reviews in database: {summary.total_reviews:,}")
print(f"  Positive: {summary.total_positive:,}")
print(f"  Negative: {summary.total_negative:,}")
print(f"  Review score: {summary.review_score_desc}")

In [None]:
# Show the first review of the fetched page field by field
# (nothing is printed when the page holds no reviews)
if batch.reviews:
    review = batch.reviews[0]
    heavy_rule = "=" * 70
    light_rule = "-" * 70

    detail_lines = [
        "Sample Review:",
        heavy_rule,
        f"Review ID: {review.recommendationid}",
        f"Sentiment: {review.sentiment} (voted_up={review.voted_up})",
        f"Language: {review.language}",
        f"Playtime: {review.playtime_hours} hours",
        f"Votes Up: {review.votes_up}",
        f"Created: {review.created_date}",
        "\nReview Text:",
        light_rule,
        review.review,
        heavy_rule,
    ]
    # print() stringifies each entry itself, exactly as separate print calls would
    print(*detail_lines, sep="\n")

---

## 2. Scrape Reviews

Now let's scrape a larger set of reviews. The configuration below starts with a small sample (`MAX_REVIEWS = 1000`) for testing; raise the limit for a full scrape.

In [None]:
# Scraping parameters: edit these before running the cells below
MAX_REVIEWS = 1000  # 1k is enough for a test run; change to 100000 for the full scrape
REVIEW_TYPE = "negative"  # Focus on negative reviews
LANGUAGE = "english"

# Echo the active settings in one block
print(
    "Configuration:\n"
    f"  Target reviews: {MAX_REVIEWS:,}\n"
    f"  Review type: {REVIEW_TYPE}\n"
    f"  Language: {LANGUAGE}"
)

In [None]:
# Run the scrape with MAX_REVIEWS / REVIEW_TYPE / LANGUAGE from the configuration cell
print("Starting review scraping...\n")

scrape_options = dict(
    max_reviews=MAX_REVIEWS,
    review_type=REVIEW_TYPE,
    language=LANGUAGE,
    save_checkpoints=True,
    checkpoint_interval=5000,
)
reviews = scraper.scrape_reviews(**scrape_options)

print(f"\n✅ Scraped {len(reviews):,} reviews!")

---

## 3. Quick Statistics

Let's analyze the scraped data.

In [None]:
# Collect both summaries first, then print them with one shared layout
stats = get_review_statistics(reviews)
scraper_stats = scraper.get_stats_summary()

report_sections = (
    ("Review Statistics:", stats),
    ("\nScraper Performance:", scraper_stats),
)
for heading, entries in report_sections:
    print(heading)
    print("=" * 50)
    for key, value in entries.items():
        print(f"  {key}: {value}")

In [None]:
# Turn the review objects into a DataFrame: one simplified dict (row) per review
review_rows = [r.to_dict_simplified() for r in reviews]
df = pd.DataFrame(review_rows)

print(
    f"DataFrame shape: {df.shape}",
    f"\nColumns: {df.columns.tolist()}",
    "\nFirst few rows:",
    sep="\n",
)
# Keep this as the last expression so the notebook renders the preview table
df.head()

In [None]:
# Data info: print a per-column overview (names, non-null counts, dtypes) of df
df.info()

### Visualizations

In [None]:
# Sentiment distribution (left) and helpful-vote histogram (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Sentiment counts.
# value_counts() orders the bars by frequency, so a positional colour list would
# paint whichever sentiment is most common red. Choose the colour from the label
# instead; labels that are not recognised keep the original positional colours.
# NOTE(review): assumes the labels are 'negative' / 'positive' - confirm against
# to_dict_simplified().
sentiment_counts = df['sentiment'].value_counts()
label_colors = {'negative': '#e74c3c', 'positive': '#2ecc71'}
positional_colors = ['#e74c3c', '#2ecc71']
bar_colors = [
    label_colors.get(str(label).lower(), positional_colors[pos % len(positional_colors)])
    for pos, label in enumerate(sentiment_counts.index)
]
sentiment_counts.plot(
    kind='bar',
    ax=axes[0],
    color=bar_colors
)
axes[0].set_title('Review Sentiment Distribution', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Sentiment')
axes[0].set_ylabel('Count')
axes[0].tick_params(axis='x', rotation=0)

# Votes distribution
df['votes_up'].hist(bins=30, ax=axes[1], color='#3498db', edgecolor='black')
axes[1].set_title('Distribution of Helpful Votes', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Votes Up')
axes[1].set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Playtime analysis: histogram (left) and per-sentiment box plot (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, box_ax = axes

# Playtime distribution
under_200h = df['playtime_hours'] < 200  # Filter outliers for better viz
playtime_filtered = df[under_200h]
playtime_filtered['playtime_hours'].hist(
    bins=40, ax=hist_ax, color='#9b59b6', edgecolor='black'
)
hist_ax.set_title('Playtime Distribution (< 200h)', fontsize=14, fontweight='bold')
hist_ax.set(xlabel='Playtime (hours)', ylabel='Frequency')

# Playtime by sentiment (uses the unfiltered frame)
df.boxplot(column='playtime_hours', by='sentiment', ax=box_ax, patch_artist=True)
box_ax.set_title('Playtime by Sentiment', fontsize=14, fontweight='bold')
box_ax.set(xlabel='Sentiment', ylabel='Playtime (hours)')
fig.suptitle('')  # Remove default title

fig.tight_layout()
plt.show()

In [None]:
# Review length analysis (stores the character count in a 'review_length' column of df)
df['review_length'] = df['review_text'].str.len()

fig, ax = plt.subplots(figsize=(12, 6))

# Filter very long reviews for better visualization
is_short_enough = df['review_length'] < 2000
df_filtered = df.loc[is_short_enough]

df_filtered.boxplot(column='review_length', by='sentiment', ax=ax, patch_artist=True)
ax.set_title('Review Length by Sentiment (< 2000 chars)', fontsize=14, fontweight='bold')
ax.set(xlabel='Sentiment', ylabel='Review Length (characters)')
fig.suptitle('')

fig.tight_layout()
plt.show()

# Summary statistics are computed on the full frame, not the filtered one
length_summary = df.groupby('sentiment')['review_length'].describe()
print("Average review length:")
print(length_summary)

In [None]:
# Top voted reviews (most helpful)
PREVIEW_CHARS = 300  # maximum number of review characters shown per entry

print("Top 5 Most Helpful Negative Reviews:")
print("=" * 70)

top_reviews = df.nlargest(5, 'votes_up')

for idx, row in top_reviews.iterrows():
    text = row['review_text']
    # Missing text (None / NaN) cannot be sliced; show an empty preview instead.
    if not isinstance(text, str):
        text = ""
    # Append the ellipsis only when the review was actually cut off.
    preview = text if len(text) <= PREVIEW_CHARS else f"{text[:PREVIEW_CHARS]}..."

    print(f"\n[{row['votes_up']} votes | {row['playtime_hours']}h played]")
    print(preview)
    print("-" * 70)

In [None]:
# Timeline of reviews: number of reviews per calendar day
# (converts 'created_date' to datetime in place and adds a 'date' column to df)
created = pd.to_datetime(df['created_date'])
df['created_date'] = created
df['date'] = created.dt.date

reviews_by_date = df.groupby('date').size()

fig, ax = plt.subplots(figsize=(14, 6))
reviews_by_date.plot.line(ax=ax, color='#e67e22', linewidth=2)
ax.set_title('Reviews Over Time', fontsize=14, fontweight='bold')
ax.set(xlabel='Date', ylabel='Number of Reviews')
ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

---

## 4. Export Data

Save the scraped reviews in multiple formats (JSON, CSV, Parquet) for future use.

In [None]:
# Build a timestamped base path so repeated exports get distinct names
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"
output_name = f"dying_light_beast_{REVIEW_TYPE}_{timestamp}"
output_base = RAW_DATA_DIR / output_name

print(f"Saving reviews to: {output_base}")

# Write the review objects (not the DataFrame) once per requested format
export_formats = ['json', 'csv', 'parquet']
saved_files = save_to_formats(reviews, output_base, formats=export_formats)

print("\n✅ Data exported successfully!\n\nSaved files:")
for fmt, path in saved_files.items():
    print(f"  {fmt.upper()}: {path}")

---

## 5. Sample Reviews for Manual Inspection

In [None]:
# Random sample of reviews for manual inspection
SAMPLE_SIZE = 10

# DataFrame.sample() raises ValueError when asked for more rows than exist
# (it samples without replacement by default), so cap the request at the
# number of available rows. The fixed seed keeps the sample reproducible.
sample = df.sample(min(SAMPLE_SIZE, len(df)), random_state=42)

# Report the number of rows actually drawn, which may be below SAMPLE_SIZE.
print(f"Random Sample of {len(sample)} Reviews:")
print("=" * 70)

for idx, row in sample.iterrows():
    print(f"\n{row['sentiment'].upper()} | {row['playtime_hours']}h | {row['votes_up']} votes")
    # Show at most the first 400 characters of each review
    print(f"{row['review_text'][:400]}")
    print("-" * 70)

---

## Summary

### What We Accomplished:

1. ✅ Connected to Steam API and fetched review data
2. ✅ Scraped reviews with rate limiting and error handling
3. ✅ Analyzed review statistics and patterns
4. ✅ Visualized data distributions
5. ✅ Exported data in multiple formats (JSON, CSV, Parquet)

### Next Steps:

1. **Data Cleaning** (`02_data_cleaning.ipynb`): Remove duplicates, filter low-quality reviews, balance dataset
2. **Classification Setup**: Define categories and prepare evaluation dataset
3. **LLM Classification**: Start with iteration 1 - basic prompting

---

### Notes:

- To scrape more reviews, increase `MAX_REVIEWS` in the configuration cell
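- Checkpoints are saved automatically every 5,000 reviews (`checkpoint_interval=5000` in the scraping cell), so a run that collects fewer reviews than that, such as the default `MAX_REVIEWS = 1000`, may never reach a checkpoint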
- You can resume scraping by using the `resume_from_checkpoint=True` parameter