# Steam Games Analysis

## 1. Ingest
Merge CSV and JSON chunks into consolidated files.

In [None]:
import pandas as pd, glob, json

# Combine every games_part_*.csv chunk (in sorted filename order) into a
# single games.csv with a fresh 0..n-1 index.
csv_files = sorted(glob.glob('games_part_*.csv'))
if not csv_files:
    print('No CSV parts found')
else:
    frames = [pd.read_csv(path) for path in csv_files]
    df = pd.concat(frames, ignore_index=True)
    df.to_csv('games.csv', index=False)
    print('games.csv created with', len(df), 'rows')

# Merge JSON parts
# Every games_json_part_*.json chunk is folded into one dict and written to
# games.json. Assumes each part's top-level value is a JSON object (it is
# passed to dict.update); on duplicate keys the later file, in sorted
# filename order, wins.
json_files = sorted(glob.glob('games_json_part_*.json'))
merged = {}
for fp in json_files:
    # Read as UTF-8 explicitly: open() otherwise falls back to the platform's
    # locale encoding, which can fail or mis-decode non-ASCII text.
    with open(fp, encoding='utf-8') as f:
        merged.update(json.load(f))
if merged:
    with open('games.json', 'w', encoding='utf-8') as f:
        json.dump(merged, f)
    print('games.json created with', len(merged), 'records')
else:
    # Also reached when parts exist but contribute no records.
    print('No JSON parts found')


## 2. Cleaning
Parse dates, prices, and owner ranges.

In [None]:
import pandas as pd, re

# Read the consolidated CSV.
df = pd.read_csv("games.csv")

# Normalise headers: trim surrounding whitespace, lowercase, and turn
# spaces into underscores.
df.columns = df.columns.map(lambda label: label.strip().lower().replace(" ", "_"))

# Release dates become datetimes; unparseable values turn into NaT.
parsed_dates = pd.to_datetime(df["release_date"], errors="coerce")
df["release_date"] = parsed_dates

# Prices: drop the dollar sign, map the exact values 'Free' and '' to '0',
# then convert to numbers (anything still non-numeric becomes NaN).
price_text = df["price"].astype(str)
price_text = price_text.str.replace("$", "", regex=False)
price_text = price_text.replace({"Free": "0", "": "0"})
df["price"] = pd.to_numeric(price_text, errors="coerce")

# Parse owner ranges like '20,000 - 50,000'

def parse_owner_range(text):
    """Return the integer midpoint of an owner range like '20,000 - 50,000'.

    Commas are removed, then the text must start with a run of digits,
    followed by at least one non-digit character and a second run of digits.
    The result is the floor of the average of the two numbers.

    Returns ``pd.NA`` when *text* is not a string or does not match.
    """
    if not isinstance(text, str):
        return pd.NA

    without_commas = text.replace(",", "")
    found = re.match(r"(\d+)[^\d]+(\d+)", without_commas)
    if found is None:
        return pd.NA

    lower, upper = (int(part) for part in found.groups())
    return (lower + upper) // 2

# Midpoint of each owner range; rows that fail to parse get pd.NA.
owner_midpoints = df["estimated_owners"].apply(parse_owner_range)
df["estimated_owners_mid"] = owner_midpoints

# Keep only rows where every critical field was parsed successfully.
required_columns = ["release_date", "price", "estimated_owners_mid"]
df = df.dropna(subset=required_columns)

# Persist the cleaned table (without the index) and report its size.
df.to_csv("games_clean.csv", index=False)
print("games_clean.csv saved with", len(df), "rows")


## 3. Exploration
Display basic statistics and visualizations.

In [None]:
import pandas as pd, matplotlib.pyplot as plt

# Load the cleaned dataset. A CSV stores dates as plain text, so release_date
# has to be parsed back into datetimes here; without parse_dates the column
# is loaded as strings and the .dt accessor below raises AttributeError.
df = pd.read_csv('games_clean.csv', parse_dates=['release_date'])

# Summary statistics for the two numeric fields of interest.
print(df[['price', 'estimated_owners_mid']].describe())

# Calendar year of release, used as the grouping key for the chart.
df['release_year'] = df['release_date'].dt.year

# Bar chart: mean owner midpoint per release year.
df.groupby('release_year')['estimated_owners_mid'].mean().plot(kind='bar')
plt.ylabel('Average owners (midpoint)')
plt.title('Average Estimated Owners by Release Year')
plt.tight_layout()
plt.show()
