# In [1]:
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Set locale for consistent date formatting
import locale

# Month names ("%B") depend on LC_TIME, so pin it to an English locale.
# "en_US.UTF-8" is not installed on every system (e.g. minimal containers);
# the always-available "C" locale also uses English month names, so fall back
# to it instead of aborting the whole script with locale.Error.
try:
    locale.setlocale(locale.LC_TIME, "en_US.UTF-8")
except locale.Error:
    locale.setlocale(locale.LC_TIME, "C")

# Load datasets
# Both CSV files are looked up relative to the current working directory.
vgsales_union, vgsales_intersect = (
    pd.read_csv(csv_name)
    for csv_name in ("VGSales_Union.csv", "VGSales_Intersect.csv")
)

# Group by Publisher and Summarize
def summarize_publisher_data(df, top_n=10, output_path="publisher_sales.csv"):
    """Aggregate sales and review scores per publisher.

    Rows are grouped by ``publisher``: the sales columns are summed, the
    review scores are averaged and ``number_of_games`` counts the rows of
    each group. The placeholder publisher ``'Unknown'`` is removed from the
    result.

    As a side effect, the ``top_n`` publishers by ``total_sales`` are written
    to ``output_path`` as CSV (without the index column).

    Args:
        df: DataFrame with ``publisher``, ``total_sales``, ``na_sales``,
            ``user_score`` and ``meta_score`` columns.
        top_n: Number of best-selling publishers to export. Defaults to 10.
        output_path: Destination of the CSV export; pass ``None`` to skip
            writing a file. Defaults to ``"publisher_sales.csv"``.

    Returns:
        DataFrame with one row per publisher (all of them, not only the
        exported top ones) and the columns ``publisher``, ``total_sales``,
        ``na_sales``, ``user_score``, ``meta_score``, ``number_of_games``.
    """
    grouped = (
        df.groupby("publisher")
        .agg(
            total_sales=("total_sales", "sum"),
            na_sales=("na_sales", "sum"),
            user_score=("user_score", "mean"),
            meta_score=("meta_score", "mean"),
            number_of_games=("publisher", "count"),
        )
        .reset_index()
    )

    # groupby already skips rows whose publisher is missing, so only the
    # explicit 'Unknown' placeholder has to be filtered out here.
    grouped = grouped[grouped["publisher"] != "Unknown"]

    if output_path is not None:
        grouped.nlargest(top_n, "total_sales").to_csv(output_path, index=False)
    return grouped

publisher_df = summarize_publisher_data(vgsales_union)

# Correlation and Visualization
# savefig() does not create missing directories, so make sure the output
# folder used by every plot in this script exists before the first save.
os.makedirs("./images", exist_ok=True)

# Does releasing more games go along with higher total sales per publisher?
print(publisher_df["number_of_games"].corr(publisher_df["total_sales"]))
sns.scatterplot(data=publisher_df, x="number_of_games", y="total_sales")
plt.title("Number of Games Released vs. Total Sales")
plt.xlabel("Number of Games")
plt.ylabel("Total Sales")
plt.savefig("./images/publisher_games_sales.png")
# Clear the current figure so the next plot starts from an empty canvas.
plt.clf()

# Mean of the per-publisher average meta scores, then its correlation with
# the number of games each publisher released.
print(publisher_df.loc[:, "meta_score"].mean())
print(publisher_df.loc[:, "number_of_games"].corr(publisher_df.loc[:, "meta_score"]))
sns.scatterplot(data=publisher_df, x="number_of_games", y="meta_score").set(
    title="Comparing Meta Score and Number of Games Released",
    xlabel="Number of Games",
    ylabel="Meta Score",
)
plt.savefig("./images/Publisher_meta.png")
# Wipe the current figure before the next section draws on it.
plt.clf()

# Correlation Comparisons in Intersect Dataset
# Correlation of every numeric column with total_sales.
print(vgsales_intersect.corr(numeric_only=True).loc[:, "total_sales"])

# Comparing Review Scores and Sales
def calculate_ratios(df):
    """Return the NA sales share and the meta/user score gap for ``df``.

    Args:
        df: DataFrame with ``na_sales``, ``total_sales``, ``meta_score`` and
            ``user_score`` columns.

    Returns:
        A ``(na_ratio, meta_user_diff)`` tuple: summed ``na_sales`` divided
        by summed ``total_sales``, and mean ``meta_score`` minus ten times
        the mean ``user_score``.
    """
    north_america_total = df["na_sales"].sum()
    overall_total = df["total_sales"].sum()

    critic_mean = df["meta_score"].mean()
    # User scores are multiplied by 10 before being compared to meta scores.
    user_mean_rescaled = df["user_score"].mean() * 10

    return north_america_total / overall_total, critic_mean - user_mean_rescaled

# Dataset-wide NA sales share and meta/user score gap.
na_ratio, meta_user_diff = calculate_ratios(vgsales_union)
print(na_ratio, meta_user_diff)

# Stack both score columns into one long table: non-missing meta scores
# first, then non-missing user scores multiplied by 10, each row tagged with
# the column it came from.
review_scores = pd.DataFrame({
    "values": np.concatenate([
        vgsales_union["meta_score"].dropna().to_numpy(),
        vgsales_union["user_score"].dropna().to_numpy() * 10,
    ]),
    "group": np.repeat(
        ["meta_score", "user_score"],
        [vgsales_union["meta_score"].count(), vgsales_union["user_score"].count()],
    ).tolist(),
})
sns.histplot(
    data=review_scores, x="values", hue="group", element="step", alpha=0.2
).set(title="Review Scores Distribution")
plt.savefig("./images/review_scores_histogram.png")
# Empty the current figure for the next plot.
plt.clf()

# Boxplots for Scores
# Meta and user score distributions side by side on a dedicated, wider figure.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
sns.boxplot(data=vgsales_union, y="meta_score", ax=axes[0])
axes[0].set_title("Meta Score Boxplot")
sns.boxplot(data=vgsales_union, y="user_score", ax=axes[1])
axes[1].set_title("User Score Boxplot")
fig.savefig("./images/Scores_Boxplot.png")
# Close (rather than clear) this figure: plt.clf() would leave the 12x6
# canvas as pyplot's current figure, so every later plot in the script would
# be drawn on it and silently inherit the wider size.
plt.close(fig)

# Sales without Wii Sports
# Keep every row whose title is not exactly "Wii Sports", then plot NA sales
# against total sales.
filtered_data = vgsales_intersect[vgsales_intersect["game_title"] != "Wii Sports"]
sns.scatterplot(data=filtered_data, x="na_sales", y="total_sales").set(
    title="NA Sales vs Total Sales"
)
plt.savefig("./images/na_sales_vs_total_sales.png")
plt.clf()

# Top Sales Correlation
# Meta score vs. total sales among the 500 best-selling rows.
top_sales = vgsales_intersect.nlargest(500, "total_sales")
print(top_sales.loc[:, "meta_score"].corr(top_sales.loc[:, "total_sales"]))

# Grouping by Release Date
# Parse "<Month> <Year>" strings into first-of-month timestamps; values that
# do not match the format become NaT (errors="coerce") and are therefore left
# out of the grouping below.
release_month = pd.to_datetime(
    "01 " + vgsales_union["release_date"].astype(str),
    format="%d %B %Y",
    errors="coerce",
)
# Group on a working copy instead of overwriting vgsales_union["release_date"].
# Overwriting the column and later rebuilding it with strftime turned every
# unparseable value into NaN; leaving the original strings untouched keeps
# them and makes the "reset" step unnecessary.
by_month = (
    vgsales_union.assign(release_date=release_month)
    .groupby("release_date")
    .agg(
        total_sales=pd.NamedAgg(column="total_sales", aggfunc="sum"),
        na_sales=pd.NamedAgg(column="na_sales", aggfunc="sum"),
        user_score=pd.NamedAgg(column="user_score", aggfunc="mean"),
        meta_score=pd.NamedAgg(column="meta_score", aggfunc="mean"),
        number_of_games=pd.NamedAgg(column="release_date", aggfunc="count"),
    )
    .reset_index()
)

# Sales and Games by Month
p1 = sns.scatterplot(data=by_month, x="release_date", y="number_of_games")
p1.set(title="Number of Games Released", xlabel="Release Date", ylabel="Number of Games")
p1.figure.savefig("./images/games_by_month.png")
p1.figure.clf()

p2 = sns.scatterplot(data=by_month, x="release_date", y="total_sales")
p2.set(title="Total Sales by Month", xlabel="Release Date", ylabel="Total Sales")
p2.figure.savefig("./images/total_sales_by_month.png")
p2.figure.clf()

# Review Scores over Time
# Both series share one axes; the labels feed the legend.
sns.scatterplot(data=by_month, x="release_date", y="meta_score", label="Meta Score", color="red")
sns.scatterplot(data=by_month, x="release_date", y="user_score", label="User Score", color="blue")
plt.title("Ratings of Video Games over Time")
plt.xlabel("Release Date")
plt.ylabel("Scores")
plt.legend()
plt.savefig("./images/review_scores_by_month.png")
plt.clf()

# Release Date Format
# vgsales_union["release_date"] was never modified above, so it still holds
# its original strings and nothing has to be reset here.



# Output: FileNotFoundError: [Errno 2] No such file or directory: 'VGSales_Union.csv'