# Steam Games Analysis

## 1. Ingest
Merge the `games_part_*.csv` and `games_json_part_*.json` chunks into the consolidated files `games.csv` and `games.json`.

In [None]:
import pandas as pd, glob, json

# Merge CSV parts into a single games.csv.
# NOTE: sorted() orders file names lexicographically ('part_10' sorts before
# 'part_2'); this affects only the row order of the merged file.
csv_files = sorted(glob.glob('games_part_*.csv'))
if csv_files:
    # A generator keeps only one part's raw frame alive at a time while
    # concat assembles the result; ignore_index renumbers rows 0..n-1.
    df = pd.concat((pd.read_csv(f) for f in csv_files), ignore_index=True)
    df.to_csv('games.csv', index=False)
    print('games.csv created with', len(df), 'rows')
else:
    print('No CSV parts found')

# Merge JSON parts into a single games.json.
# Assumes each part holds one JSON object (dict.update needs a mapping);
# when a key appears in several parts, the part merged last wins.
json_files = sorted(glob.glob('games_json_part_*.json'))
merged = {}
for fp in json_files:
    # JSON text is UTF-8; do not rely on the platform's default encoding,
    # which can fail on non-ASCII content.
    with open(fp, encoding='utf-8') as f:
        merged.update(json.load(f))
if merged:
    with open('games.json', 'w', encoding='utf-8') as f:
        json.dump(merged, f)
    print('games.json created with', len(merged), 'records')
else:
    print('No JSON parts found')


## 2. Cleaning
Parse release dates, convert prices to numbers, and reduce owner ranges to their midpoint. The result is saved as `games_clean.csv`.

In [None]:
import pandas as pd

# Load the consolidated table written by the ingest step.
df = pd.read_csv('games.csv')

# Convert release_date to datetime; values that cannot be parsed become NaT.
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')

# Convert price from string to numeric.
# read_csv already infers a numeric column when no value carries a '$', and
# the .str accessor raises AttributeError on non-string data, so the symbol
# is stripped only when the column is not numeric yet.
if not pd.api.types.is_numeric_dtype(df['price']):
    df['price'] = df['price'].str.replace('$', '', regex=False).astype(float)

# Parse owner ranges like '20,000 .. 50,000' to midpoint
def parse_owners(x):
    """Return the integer midpoint of an owner-range string.

    Args:
        x: A range such as ``'20,000 .. 50,000'``. Thousands separators are
            removed and whitespace around the ``..`` separator is optional.

    Returns:
        ``(low + high) // 2`` as an ``int``, or ``pd.NA`` when ``x`` is not a
        string (e.g. a missing value) or does not contain exactly two
        integer bounds.
    """
    try:
        # Split on the bare '..' so spacing variations are accepted; int()
        # ignores the whitespace left around each bound.
        low, high = x.split('..')
        low, high = low.replace(',', ''), high.replace(',', '')
        return (int(low) + int(high)) // 2
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: x is not a string (NaN, None, number).
        # ValueError: wrong number of bounds or a non-integer bound.
        return pd.NA

# Reduce each owner-range string to its midpoint via parse_owners.
owner_midpoints = df['estimated_owners'].apply(parse_owners)
df = df.assign(estimated_owners_mid=owner_midpoints)

# Persist the cleaned frame (without the index) for the exploration step.
df.to_csv('games_clean.csv', index=False)
print('games_clean.csv saved')


## 3. Exploration
Display basic statistics and visualizations.

In [None]:
import pandas as pd, matplotlib.pyplot as plt

# Load the cleaned data. CSV stores dates as text, so release_date has to be
# parsed again after loading; otherwise the .dt accessor below raises
# AttributeError.
df = pd.read_csv('games_clean.csv')
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')

# Summary statistics for the two numeric columns of interest.
print(df[['price', 'estimated_owners_mid']].describe())

df['release_year'] = df['release_date'].dt.year

# groupby drops rows whose release_year is missing by default.
yearly_mean = df.groupby('release_year')['estimated_owners_mid'].mean()
# The year column is float when any date is missing; cast the group labels
# so the bars are labelled 2015 rather than 2015.0.
yearly_mean.index = yearly_mean.index.astype(int)

yearly_mean.plot(kind='bar')
plt.ylabel('Average owners (midpoint)')
plt.title('Average Estimated Owners by Release Year')
plt.tight_layout()
plt.show()
