In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Read the two CSV files used for the comparison. Per the original notes the
# first is the full (~90k row) dataset and the second the filtered (~7k row)
# subset.
full_dataset, filtered_dataset = (
    pd.read_csv(csv_name)
    for csv_name in ("games_march2025_cleaned.csv", "Proyecto_ML.csv")
)

# Columns whose distributions are to be compared between the two datasets.
metrics = [
    "Copies Sold",
    "Wishlists",
    "bayesian_score",
    "Price",
]


In [5]:
# Read the source table and the filtered subset (per the original notes,
# roughly 90k and 7k rows respectively).
original_df = pd.read_csv("games_march2025_cleaned.csv")
filtered_df = pd.read_csv("Proyecto_ML.csv")

# Add an integer flag column: 1 when the row's appid also occurs in the
# filtered file, 0 otherwise.
original_df = original_df.assign(
    in_filtered_set=original_df['appid'].isin(filtered_df['appid']).astype(int)
)

In [None]:



# Read the source table and the filtered subset (per the original notes,
# roughly 90k and 7k rows respectively).
original_df = pd.read_csv("Cleancsv/gamalytics_data.csv")
filtered_df = pd.read_csv("Proyecto_ML.csv")

# Add an integer flag column: 1 when the row's 'Steam Id' occurs among the
# filtered file's 'appid' values, 0 otherwise.
original_df = original_df.assign(
    in_filtered_set=original_df['Steam Id'].isin(filtered_df['appid']).astype(int)
)

# Summarise how much of the source table survived the filter.
total_count = len(original_df)
filtered_count = original_df['in_filtered_set'].sum()
percentage = (filtered_count / total_count) * 100

summary_line = f"Out of {total_count} games in the original dataset, {filtered_count} ({percentage:.2f}%) made it to the filtered dataset"
print(summary_line)

# Columns analysed by the comparisons that follow.
metrics = [
    "Copies Sold",
    "Wishlists",
    "Price",
    "Followers",
]

# Compare each metric between games kept by the filter (in_filtered_set == 1)
# and games dropped by it (in_filtered_set == 0). For every metric this prints
# a percentile table and draws one row of histograms: linear y-axis on the
# left, log y-axis on the right.
#
# squeeze=False keeps `axs` two-dimensional even when `metrics` has a single
# entry, so the axs[i, col] indexing below always works.
fig, axs = plt.subplots(len(metrics), 2, figsize=(15, 5*len(metrics)), squeeze=False)

# Percentiles reported for every metric; the median's position is looked up
# once so the annotation below does not rely on a hard-coded index.
percentiles = [10, 25, 50, 75, 90, 95, 99]
median_idx = percentiles.index(50)

for i, metric in enumerate(metrics):
    if metric not in original_df.columns:
        # Metric is absent from this dataset; its row of axes stays empty.
        continue

    # Split the metric by filter membership, ignoring missing values
    in_filtered = original_df.loc[original_df['in_filtered_set'] == 1, metric].dropna()
    not_filtered = original_df.loc[original_df['in_filtered_set'] == 0, metric].dropna()

    # Print basic statistics
    print(f"\n--- {metric} ---")
    print(f"Games in filtered set: {len(in_filtered)} non-null values")
    print(f"Games not in filtered set: {len(not_filtered)} non-null values")

    # Percentiles and histograms are undefined for an empty group.
    if in_filtered.empty or not_filtered.empty:
        print("Skipping percentile comparison and plots: one of the groups has no data")
        continue

    # Calculate percentiles
    in_filtered_percentiles = np.percentile(in_filtered, percentiles)
    not_filtered_percentiles = np.percentile(not_filtered, percentiles)

    print("\nPercentile comparison:")
    print(f"{'Percentile':10} {'Not Filtered':15} {'Filtered':15} {'Ratio':10}")
    for p, kept_val, dropped_val in zip(percentiles, in_filtered_percentiles, not_filtered_percentiles):
        # A zero denominator is reported as an infinite ratio rather than raising
        ratio = kept_val / dropped_val if dropped_val != 0 else float('inf')
        print(f"{p:10}th {dropped_val:15.2f} {kept_val:15.2f} {ratio:10.2f}x")

    # Ratio of the medians, computed explicitly. Reusing `ratio` from the loop
    # above would report the ratio of the last percentile (the 99th) instead.
    median_kept = in_filtered_percentiles[median_idx]
    median_dropped = not_filtered_percentiles[median_idx]
    median_ratio = median_kept / median_dropped if median_dropped != 0 else float('inf')

    # Same pair of overlaid histograms on both panels; only the right-hand
    # panel switches its y-axis to log scale to make the tails visible.
    panels = (
        (axs[i, 0], "", False),
        (axs[i, 1], " (Log Y Scale)", True),
    )
    for ax, title_suffix, log_y in panels:
        sns.histplot(not_filtered, color='blue', alpha=0.5, label='Not Filtered', ax=ax)
        sns.histplot(in_filtered, color='red', alpha=0.5, label='Filtered', ax=ax)
        ax.set_title(f"{metric} Distribution{title_suffix}")
        if log_y:
            ax.set_yscale('log')
        ax.legend()

    # Annotate the linear-scale panel with both medians and their ratio
    axs[i, 0].text(0.05, 0.95,
                  f"Median (Not Filtered): {median_dropped:.2f}\nMedian (Filtered): {median_kept:.2f}\nRatio: {median_ratio:.2f}x",
                  transform=axs[i, 0].transAxes,
                  verticalalignment='top',
                  bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

# Tidy up the layout, write the figure to disk, then display it
fig.tight_layout()
fig.savefig("filtered_vs_unfiltered_comparison.png", dpi=300)
plt.show()

# Additional analysis: where does the low end of the filtered set (its minimum
# and its 25th percentile) rank within the overall distribution of each metric?
print("\n--- Where does the filtered set fall in the overall distribution? ---")
for metric in metrics:
    if metric not in original_df.columns:
        continue

    # Drop missing values before ranking. A NaN compares False against any
    # threshold, so leaving NaN rows in would keep them in the denominator of
    # the mean below and bias every reported percentile rank downward.
    overall_values = original_df[metric].dropna()
    filtered_values = original_df.loc[original_df['in_filtered_set'] == 1, metric].dropna()

    # Neither a minimum nor a percentile is defined without data.
    if filtered_values.empty or overall_values.empty:
        print(f"No non-null {metric} values available for the filtered set comparison")
        continue

    # Share of all non-null values strictly below the filtered set's minimum
    min_val_filtered = filtered_values.min()
    percentile_rank = (overall_values < min_val_filtered).mean() * 100

    print(f"Minimum {metric} in filtered set ({min_val_filtered:.2f}) is at the {percentile_rank:.2f}th percentile of the overall dataset")

    # Share of all non-null values strictly below the filtered set's 25th percentile
    p25_filtered = np.percentile(filtered_values, 25)
    p25_rank = (overall_values < p25_filtered).mean() * 100

    print(f"The 25th percentile of {metric} in filtered set ({p25_filtered:.2f}) is at the {p25_rank:.2f}th percentile of the overall dataset")

# Additional visualization: empirical CDF comparison, saved as one PNG per
# metric. Only the first two metrics are plotted to limit the number of files.
#
# Each figure is created inside the loop and closed after saving. No figure is
# opened up front, since an unused one would stay open and be left blank.
for metric in metrics[:2]:
    if metric not in original_df.columns:
        continue

    # Values of the metric for kept / dropped games, ignoring missing values
    in_filtered = original_df.loc[original_df['in_filtered_set'] == 1, metric].dropna()
    not_filtered = original_df.loc[original_df['in_filtered_set'] == 0, metric].dropna()

    # Sort the data
    in_filtered_sorted = np.sort(in_filtered)
    not_filtered_sorted = np.sort(not_filtered)

    # Empirical CDF: the k-th smallest of n values gets cumulative probability k / n
    in_filtered_cdf = np.arange(1, len(in_filtered_sorted) + 1) / len(in_filtered_sorted)
    not_filtered_cdf = np.arange(1, len(not_filtered_sorted) + 1) / len(not_filtered_sorted)

    # Dedicated names so any `fig` / `axs` from other cells are not overwritten
    cdf_fig, cdf_ax = plt.subplots(figsize=(10, 6))
    cdf_ax.plot(not_filtered_sorted, not_filtered_cdf, label='Not Filtered', color='blue')
    cdf_ax.plot(in_filtered_sorted, in_filtered_cdf, label='Filtered', color='red')
    cdf_ax.set_xscale('log')  # Log scale for x-axis to better see distribution
    cdf_ax.set_title(f"Cumulative Distribution Function: {metric}")
    cdf_ax.set_xlabel(metric)
    cdf_ax.set_ylabel("Cumulative Probability")
    cdf_ax.grid(True, alpha=0.3)
    cdf_ax.legend()

    cdf_fig.savefig(f"cdf_{metric}.png", dpi=300)
    # Close explicitly so figures do not accumulate in memory across iterations
    plt.close(cdf_fig)

  original_df = pd.read_csv("Cleancsv/gamalytics_data.csv")  # Original 90k dataset


Out of 98350 games in the original dataset, 7029 (7.15%) made it to the filtered dataset

--- Copies Sold ---
Games in filtered set: 7029 non-null values
Games not in filtered set: 91321 non-null values

Percentile comparison:
Percentile Not Filtered    Filtered        Ratio     
        10th           15.00         4841.00     322.73x
        25th           55.00        11085.00     201.55x
        50th          390.00        32976.00      84.55x
        75th         2607.00       133840.00      51.34x
        90th        16591.00       632856.00      38.14x
        95th        55813.00      1559837.00      27.95x
        99th       639865.60      7136285.28      11.15x

--- Wishlists ---
Games in filtered set: 7029 non-null values
Games not in filtered set: 91243 non-null values

Percentile comparison:
Percentile Not Filtered    Filtered        Ratio     
        10th          204.00         2200.00      10.78x
        25th          732.00         5500.00       7.51x
        50th    