# Steam Games Analysis

## 1. Ingest
Merge CSV and JSON chunks into consolidated files.

In [None]:
import pandas as pd, glob, json

# Merge CSV parts: concatenate every games_part_*.csv (in sorted filename
# order) into a single games.csv with a fresh 0..n-1 index.
csv_files = sorted(glob.glob('games_part_*.csv'))
if not csv_files:
    print('No CSV parts found')
else:
    part_frames = (pd.read_csv(part_path) for part_path in csv_files)
    df = pd.concat(part_frames, ignore_index=True)
    df.to_csv('games.csv', index=False)
    print('games.csv created with', len(df), 'rows')

# Merge JSON parts: fold every games_json_part_*.json into one dict and
# write it to games.json.
# Files are processed in sorted (lexicographic) filename order, and
# dict.update means a key present in several parts keeps the value from the
# last part processed.
# NOTE(review): each part is assumed to hold a JSON object (dict) at the top
# level -- confirm against the chunking script.
json_files = sorted(glob.glob('games_json_part_*.json'))
merged = {}
for fp in json_files:
    # Read as UTF-8 explicitly; the platform default encoding is
    # locale-dependent and can fail on non-ASCII text.
    with open(fp, encoding='utf-8') as f:
        merged.update(json.load(f))
if merged:
    with open('games.json', 'w', encoding='utf-8') as f:
        json.dump(merged, f)
    print('games.json created with', len(merged), 'records')
else:
    print('No JSON parts found')


## 2. Cleaning
Parse dates, prices, and owner ranges.

In [None]:
import pandas as pd, re

# Read the consolidated CSV.
df = pd.read_csv("games.csv")

# Normalise headers: trim surrounding whitespace, lower-case, and swap
# spaces for underscores.
normalized_columns = []
for column in df.columns:
    normalized_columns.append(column.strip().lower().replace(" ", "_"))
df.columns = normalized_columns

# Release dates that cannot be parsed become NaT instead of raising.
df["release_date"] = pd.to_datetime(df["release_date"], errors="coerce")

# Price: work on the string form, remove the dollar sign, and turn values
# that are exactly 'Free' or empty into "0". Whatever is still non-numeric
# after that is coerced to NaN by to_numeric.
price_text = df["price"].astype(str).str.replace("$", "", regex=False)
price_text = price_text.replace({"Free": "0", "": "0"})
df["price"] = pd.to_numeric(price_text, errors="coerce")

# Parse owner ranges like '20,000 - 50,000'

def parse_owner_range(text):
    """Return the integer midpoint of an owner range such as '20,000 - 50,000'.

    Commas are removed first. The remaining string must start with a run of
    digits, followed by at least one non-digit character and a second run of
    digits; the result is the floor of the average of the two numbers.

    Returns pd.NA when ``text`` is not a string or does not match that shape.
    """
    if not isinstance(text, str):
        return pd.NA

    bounds = re.match(r"(\d+)[^\d]+(\d+)", text.replace(",", ""))
    if bounds is None:
        return pd.NA

    lower, upper = (int(part) for part in bounds.groups())
    return (lower + upper) // 2

# Midpoint of each owner range; rows the parser cannot handle get pd.NA.
owner_midpoints = df["estimated_owners"].apply(parse_owner_range)
df["estimated_owners_mid"] = owner_midpoints

# A row is only usable if all three critical fields survived cleaning.
required_fields = ["release_date", "price", "estimated_owners_mid"]
df.dropna(subset=required_fields, inplace=True)

# Persist the cleaned table and report how many rows remain.
df.to_csv("games_clean.csv", index=False)
remaining_rows = len(df)
print("games_clean.csv saved with", remaining_rows, "rows")


## 3. Exploration
Display basic statistics and visualizations.

In [None]:
import pandas as pd, matplotlib.pyplot as plt

# Load the cleaned dataset and show summary statistics for the two numeric
# columns of interest.
df = pd.read_csv('games_clean.csv')
print(df[['price', 'estimated_owners_mid']].describe())

# read_csv loads release_date as plain strings, so it must be converted to
# datetimes before the .dt accessor can be used (otherwise .dt raises
# AttributeError). Unparseable values become NaT.
release_dt = pd.to_datetime(df['release_date'], errors='coerce')
df['release_year'] = release_dt.dt.year

# Bar chart of the mean owner midpoint for each release year.
df.groupby('release_year')['estimated_owners_mid'].mean().plot(kind='bar')
plt.ylabel('Average owners (midpoint)')
plt.title('Average Estimated Owners by Release Year')
plt.tight_layout()
plt.show()


## 4. Exploratory Analysis
Summarize genres, price ranges, and release timing. Visualize correlations and inspect outliers.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

_df = pd.read_csv('games_clean.csv')

# Genre distribution: split the comma-separated genres field, give every
# genre its own row, trim whitespace, and count occurrences.
genre_counts = _df['genres'].dropna().str.split(',').explode().str.strip().value_counts()
# The label needs an escaped newline; a literal line break inside a
# single-quoted string is a syntax error.
print('Top genres:\n', genre_counts.head(10))

# Price range distribution.
# The last bin is labelled '$30+', so its upper edge is open-ended; a finite
# cap (previously 100) silently turned higher prices into NaN.
# include_lowest=True keeps a price of exactly 0 inside the first bin.
price_bins = pd.cut(
    _df['price'],
    bins=[0, 5, 10, 30, float('inf')],
    include_lowest=True,
    labels=['$0-5', '$5-10', '$10-30', '$30+'],
)
# Escaped newline: a literal line break inside the quotes is a syntax error.
print('Price range counts:\n', price_bins.value_counts().sort_index())

# Release timing.
# release_date comes back from read_csv as strings, so parse it before using
# the .dt accessor (which only works on datetime-like values). Unparseable
# values become NaT.
_release_dt = pd.to_datetime(_df['release_date'], errors='coerce')
_df['release_year'] = _release_dt.dt.year
_df['release_quarter'] = _release_dt.dt.to_period('Q')
# Show the five most recent years; the newline must be escaped inside the
# string literal.
print('Releases by year:\n', _df['release_year'].value_counts().sort_index().tail())

# 2x2 overview figure: genres, price bins, owners over time, price vs owners.
fig, axs = plt.subplots(2, 2, figsize=(12, 10))
(ax_genres, ax_prices), (ax_owners, ax_scatter) = axs

# Top-left: the ten most frequent genres.
top_genres = genre_counts.head(10)
top_genres.plot(kind='bar', ax=ax_genres)
ax_genres.set_title('Top Genres')
ax_genres.set_xlabel('')

# Top-right: number of games per price bin, in bin order.
bin_counts = price_bins.value_counts().sort_index()
bin_counts.plot(kind='bar', ax=ax_prices)
ax_prices.set_title('Price Ranges')
ax_prices.set_xlabel('Price Bin')

# Bottom-left: mean owner midpoint per release year (line plot).
owners_by_year = _df.groupby('release_year')['estimated_owners_mid'].mean()
owners_by_year.plot(ax=ax_owners)
ax_owners.set_title('Avg Owners by Year')
ax_owners.set_xlabel('Year')
ax_owners.set_ylabel('Avg Owners')

# Bottom-right: one point per game.
sns.scatterplot(data=_df, x='price', y='estimated_owners_mid', ax=ax_scatter)
ax_scatter.set_title('Price vs Owners')

plt.tight_layout()
plt.show()

# Correlation between price, owner midpoint, and price-bin indicator columns.
bin_indicators = pd.get_dummies(price_bins)
numeric_cols = _df[['price', 'estimated_owners_mid']]
corr = numeric_cols.join(bin_indicators).corr()

sns.heatmap(corr, cmap='coolwarm', annot=True)
plt.title('Correlation Matrix')
plt.show()

# Outliers: games whose owner midpoint exceeds the 99th percentile.
owner_cutoff = _df['estimated_owners_mid'].quantile(0.99)
outliers = _df[_df['estimated_owners_mid'] > owner_cutoff]
# Escaped newline: a literal line break inside the quotes is a syntax error.
print('Sample outliers:\n', outliers[['name', 'price', 'estimated_owners_mid']].head())


## 5. Feature Engineering
Derive release year/month, price tiers, and basic genre encoding.

In [None]:

import pandas as pd

df = pd.read_csv('games_clean.csv')

# Release year and month, taken from the parsed release date
# (values that fail to parse are coerced to NaT).
release_dt = pd.to_datetime(df['release_date'], errors='coerce')
df = df.assign(
    release_year=release_dt.dt.year,
    release_month=release_dt.dt.month,
)

# Price tiers. Bins are right-closed, so the first edge sits just below
# zero to make a price of exactly 0 land in the 'free' tier.
labels = ['free', '<$10', '$10-30', '>$30']
bins = [-0.01, 0, 10, 30, float('inf')]
df['price_tier'] = pd.cut(df['price'], bins=bins, labels=labels)

# Simplify genres and one-hot encode for modeling:
# take the first genre listed as the game's main genre.
# Columns shown in the preview below; 'main_genre' is added only when it was
# actually created, so the preview does not raise KeyError on a dataset
# without a 'genres' column.
preview_columns = ['price', 'price_tier', 'release_year', 'release_month']
if 'genres' in df.columns:
    df['main_genre'] = df['genres'].str.split(',').str[0]
    genre_dummies = pd.get_dummies(df['main_genre'], prefix='genre')
    df = pd.concat([df, genre_dummies], axis=1)
    preview_columns.append('main_genre')

print(df[preview_columns].head())


## 6. Modeling
Predict owner estimates using price, genre, and release timing.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_absolute_error
from sklearn.cluster import KMeans

# Load cleaned data
model_df = pd.read_csv("games_clean.csv")

# Ensure engineered features exist
rel = pd.to_datetime(model_df["release_date"], errors="coerce")
model_df["release_year"] = rel.dt.year
model_df["release_month"] = rel.dt.month

# DataFrame.get("genres", "") hands back the plain string "" when the column
# is missing, and a str has no .str accessor. Fall back to a Series of empty
# strings aligned to the frame so the split below works either way.
if "genres" in model_df.columns:
    genres = model_df["genres"]
else:
    genres = pd.Series("", index=model_df.index, dtype="object")
model_df["main_genre"] = genres.str.split(',').str[0]

# One-hot encode genre
genre_dummies = pd.get_dummies(model_df["main_genre"], prefix="genre")

# Feature matrix: numeric columns plus genre indicators; any remaining
# missing values are filled with 0. Target: the owner-range midpoint.
X = pd.concat([model_df[["price", "release_year", "release_month"]], genre_dummies], axis=1).fillna(0)
y = model_df["estimated_owners_mid"]

# --- Regression ---
# Hold out 20% of the rows (fixed seed) and fit ordinary least squares on
# the remainder, then report mean absolute error on the held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
reg = LinearRegression()
reg.fit(X_train, y_train)
pred = reg.predict(X_test)
linear_mae = mean_absolute_error(y_test, pred)
print("Linear MAE", linear_mae)

# --- Classification: high vs low owners ---
# Label 1 when the owner midpoint is at or above the median, else 0.
owner_median = y.median()
y_class = (y >= owner_median).astype(int)

# Same 80/20 split parameters as the regression; this rebinds the
# train/test variables to the binary target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_class,
    test_size=0.2,
    random_state=42,
)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
print("Classification accuracy", accuracy)

# --- Clustering ---
# Partition all rows of the feature matrix into three k-means clusters
# (fixed seed) and store each row's cluster label on model_df.
km = KMeans(n_clusters=3, random_state=42)
clusters = km.fit_predict(X)
model_df["cluster"] = clusters

# Number of games assigned to each cluster.
cluster_sizes = model_df["cluster"].value_counts()
print(cluster_sizes)
