In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl

# Set style for presentation-quality graphs
plt.style.use('seaborn-v0_8-whitegrid')
mpl.rcParams['font.family'] = 'sans-serif'
mpl.rcParams['font.size'] = 12
mpl.rcParams['axes.titlesize'] = 16
mpl.rcParams['axes.labelsize'] = 14
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12
mpl.rcParams['legend.fontsize'] = 12

# Load datasets
original_df = pd.read_csv("Cleancsv/gamalytics_data.csv")  # Original 90k dataset
filtered_df = pd.read_csv("Proyecto_ML.csv")              # Filtered 7k dataset

# Create a flag for games that made it to the filtered dataset
original_df['in_filtered_set'] = original_df['Steam Id'].isin(filtered_df['appid']).astype(int)

# Distribution Comparison (Copies Sold)
plt.figure(figsize=(12, 8))

# Get data for plotting
in_filtered = original_df[original_df['in_filtered_set'] == 1]['Copies Sold'].dropna()
not_filtered = original_df[original_df['in_filtered_set'] == 0]['Copies Sold'].dropna()

# Cap at 99th percentile for better visualization
filtered_max = np.percentile(in_filtered, 99)
unfiltered_max = np.percentile(not_filtered, 99)
plot_max = max(filtered_max, unfiltered_max)

filtered_plot = in_filtered[in_filtered <= plot_max]
unfiltered_plot = not_filtered[not_filtered <= plot_max]

# Create log-spaced bins
bins = np.logspace(np.log10(max(1, min(unfiltered_plot.min(), filtered_plot.min()))), 
                 np.log10(plot_max), 
                 50)

# Plot histograms
plt.hist(unfiltered_plot, bins=bins, color='#4472C4', alpha=0.7, label='Not in Filtered Dataset')
plt.hist(filtered_plot, bins=bins, color='#ED7D31', alpha=0.7, label='In Filtered Dataset')

plt.xscale('log')
plt.title('Distribution of Copies Sold (Capped at 99th Percentile)', fontweight='bold')
plt.xlabel('Copies Sold (Log Scale)')
plt.ylabel('Number of Games')
plt.legend(frameon=True)
plt.tight_layout()
plt.savefig("copies_sold_distribution.png", dpi=300, bbox_inches='tight')
plt.close()

# CDF Plot
plt.figure(figsize=(12, 8))

# Sort the data
filtered_sorted = np.sort(in_filtered)
unfiltered_sorted = np.sort(not_filtered)

# Compute CDFs
filtered_cdf = np.arange(1, len(filtered_sorted) + 1) / len(filtered_sorted)
unfiltered_cdf = np.arange(1, len(unfiltered_sorted) + 1) / len(unfiltered_sorted)

# Plot CDFs
plt.plot(unfiltered_sorted, unfiltered_cdf, label='Not in Filtered Dataset', 
         color='#4472C4', linewidth=3)
plt.plot(filtered_sorted, filtered_cdf, label='In Filtered Dataset', 
         color='#ED7D31', linewidth=3)

plt.xscale('log')
plt.grid(True, alpha=0.3)
plt.title('Cumulative Distribution: Copies Sold', fontweight='bold')
plt.xlabel('Copies Sold (Log Scale)')
plt.ylabel('Cumulative Probability')
plt.legend(loc='lower right', frameon=True)

# Add vertical lines at key percentiles
for p, label in [(0.25, '25th'), (0.5, '50th'), (0.75, '75th')]:
    filtered_val = np.percentile(filtered_sorted, p*100)
    unfiltered_val = np.percentile(unfiltered_sorted, p*100)
    
    plt.axvline(x=filtered_val, color='#ED7D31', linestyle='--', alpha=0.7)
    plt.axvline(x=unfiltered_val, color='#4472C4', linestyle='--', alpha=0.7)
    
    # Add text annotations
    plt.text(filtered_val*1.1, p-0.05, f"{label} percentile: {filtered_val:,.0f}", 
             color='#ED7D31', fontweight='bold')
    plt.text(unfiltered_val*1.1, p+0.05, f"{label} percentile: {unfiltered_val:,.0f}", 
             color='#4472C4', fontweight='bold')

plt.tight_layout()
plt.savefig("copies_sold_cdf.png", dpi=300, bbox_inches='tight')
plt.close()

  original_df = pd.read_csv("Cleancsv/gamalytics_data.csv")  # Original 90k dataset


In [3]:
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np

# Create a directed graph
G = nx.DiGraph()

# Add nodes
nodes = [
    "Original Dataset\n(98,350 games)",
    "Filter for\nSingle-player games",
    "Filter for\nmin 50 reviews",
    "Merge with\nSteam API data",
    "Handle missing\nengagement metrics",
    "Filtered Dataset\n(7,029 games)",
    "Feature Engineering",
    "Train/Test Split",
    "Model Training",
    "Evaluation"
]

# Define positions manually for better control
pos = {
    nodes[0]: (0, 0),
    nodes[1]: (0, -1),
    nodes[2]: (0, -2),
    nodes[3]: (0, -3),
    nodes[4]: (0, -4),
    nodes[5]: (0, -5),
    nodes[6]: (1, -5),
    nodes[7]: (2, -5),
    nodes[8]: (3, -5),
    nodes[9]: (4, -5),
}

# Add nodes with positions
for node in nodes:
    G.add_node(node)

# Add edges
edges = [
    (nodes[0], nodes[1]),
    (nodes[1], nodes[2]),
    (nodes[2], nodes[3]),
    (nodes[3], nodes[4]),
    (nodes[4], nodes[5]),
    (nodes[5], nodes[6]),
    (nodes[6], nodes[7]),
    (nodes[7], nodes[8]),
    (nodes[8], nodes[9]),
]

G.add_edges_from(edges)

# Draw the graph
plt.figure(figsize=(15, 10))
nx.draw_networkx(
    G, pos,
    node_color="#C5E0B4",
    node_size=5000,
    font_size=13,
    font_weight="bold",
    arrowsize=20,
    width=2,
    edge_color="#70AD47",
    arrows=True,
)

plt.axis('off')
plt.tight_layout()
plt.savefig("data_processing_flow.png", dpi=300, bbox_inches='tight')
plt.close()

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset
df = pd.read_csv("Proyecto_ML.csv")

# Correlation Heatmap
# Select relevant columns for correlation
correlation_cols = [
    'time_to_beat', 'Followers', 'engagement_ratio', 'Price', 
    'Wishlists', 'bayesian_score', 'Copies Sold'
]

# Create correlation matrix
corr_matrix = df[correlation_cols].corr()

# Create heatmap
plt.figure(figsize=(12, 10))
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)

sns.heatmap(
    corr_matrix, 
    mask=mask,
    cmap=cmap, 
    vmax=1, 
    vmin=-1, 
    center=0,
    square=True, 
    linewidths=.5, 
    annot=True, 
    fmt=".2f",
    annot_kws={'size': 12},
    cbar_kws={"shrink": .8}
)

plt.title('Feature Correlation Heatmap', fontsize=18, fontweight='bold')
plt.tight_layout()
plt.savefig("correlation_heatmap.png", dpi=300, bbox_inches='tight')
plt.close()

# Engagement Metrics Scatter Plot
plt.figure(figsize=(12, 8))

# Since Publishers Class is already one-hot encoded, we need to create a column that indicates the class
publisher_columns = ['Publishers Class_AA', 'Publishers Class_AAA', 
                    'Publishers Class_Hobbyist', 'Publishers Class_Indie']

# Create a temporary column that indicates the publisher class
df['temp_publisher_class'] = 'Unknown'
for col in publisher_columns:
    class_name = col.replace('Publishers Class_', '')
    df.loc[df[col] == 1, 'temp_publisher_class'] = class_name

# Create a publisher class color map
color_map = {
    'AAA': '#FF6B6B', 
    'AA': '#4ECDC4', 
    'Indie': '#45B7D1',
    'Hobbyist': '#98D560',
    'Unknown': '#999999'
}

# Create scatter plot
for pub_class in df['temp_publisher_class'].unique():
    subset = df[df['temp_publisher_class'] == pub_class]
    plt.scatter(
        subset['time_to_beat'], 
        subset['average_playtime_forever'] / 60 if 'average_playtime_forever' in df.columns else subset['time_to_beat'],  # Fallback if column doesn't exist
        alpha=0.7,
        label=pub_class,
        color=color_map.get(pub_class, '#999999'),
        s=60,
        edgecolor='w',
        linewidth=0.5
    )

# Add reference lines for engagement ratios
max_time = df['time_to_beat'].max() * 1.1
x = np.linspace(0, max_time, 100)

for ratio, style in [(0.5, '--'), (1.0, '-'), (2.0, '--'), (5.0, ':')]:
    plt.plot(x, ratio * x, linestyle=style, color='#888888', alpha=0.7, 
             label=f'Engagement Ratio = {ratio}')

plt.xlabel('Time to Beat (hours)')
plt.ylabel('Average Playtime (hours)')
plt.title('Relationship Between Game Length and Player Engagement', fontweight='bold')
plt.grid(True, alpha=0.3)
plt.legend(frameon=True)

# Set reasonable axis limits
plt.xlim(0, min(100, df['time_to_beat'].quantile(0.95)))

# Make sure we have the average_playtime_forever column, otherwise use time_to_beat
if 'average_playtime_forever' in df.columns:
    plt.ylim(0, min(200, (df['average_playtime_forever'] / 60).quantile(0.95)))
else:
    # Fallback to a reasonable multiple of time_to_beat
    plt.ylim(0, min(200, df['time_to_beat'].quantile(0.95) * 5))

plt.tight_layout()
plt.savefig("engagement_scatter.png", dpi=300, bbox_inches='tight')
plt.close()

# Clean up the temporary column
df = df.drop('temp_publisher_class', axis=1)

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# Load the trained model
model = joblib.load("catboost_model_Copies Sold.pkl")

# Feature importance visualization
# Get feature importance
feature_importance = pd.DataFrame({
    'Feature': model.feature_names_,
    'Importance': model.get_feature_importance()
})

# Sort by importance
feature_importance = feature_importance.sort_values('Importance', ascending=False).head(15)

# Define color map by feature type
def get_feature_color(feature_name):
    if feature_name.startswith('Tags_'):
        return '#5B9BD5'  # Blue for Tags
    elif feature_name.startswith('genres_'):
        return '#ED7D31'  # Orange for Genres
    elif feature_name.startswith('categories_'):
        return '#A5A5A5'  # Gray for Categories
    elif feature_name.startswith('Publishers Class_'):
        return '#FFC000'  # Yellow for Publisher Class
    else:
        return '#70AD47'  # Green for other features

# Map colors to features
feature_colors = [get_feature_color(feature) for feature in feature_importance['Feature']]

# Create horizontal bar chart
plt.figure(figsize=(12, 10))
bars = plt.barh(
    feature_importance['Feature'],
    feature_importance['Importance'],
    color=feature_colors,
    edgecolor='white',
    linewidth=1
)

# Add value labels to the right of each bar
for i, (importance, feature) in enumerate(zip(feature_importance['Importance'], feature_importance['Feature'])):
    plt.text(
        importance + importance * 0.01,
        i,
        f"{importance:.4f}",
        va='center',
        fontsize=10
    )

plt.xlabel('Importance')
plt.title('Feature Importance for Copies Sold Prediction', fontweight='bold')
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()

# Add a legend for feature types
from matplotlib.patches import Patch
legend_elements = [
    Patch(facecolor='#5B9BD5', label='Tags'),
    Patch(facecolor='#ED7D31', label='Genres'),
    Patch(facecolor='#A5A5A5', label='Categories'),
    Patch(facecolor='#FFC000', label='Publisher Class'),
    Patch(facecolor='#70AD47', label='Other Features')
]
plt.legend(handles=legend_elements, loc='lower right')

plt.savefig("feature_importance.png", dpi=300, bbox_inches='tight')
plt.close()

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Create a visual representation of percentile differences
plt.figure(figsize=(14, 8))

# Define the percentiles to visualize
percentiles = [25, 50, 75]

# Data for Copies Sold - from your analysis
not_filtered_values = [55, 390, 2607]
filtered_values = [11085, 32976, 133840]
ratios = [filtered_values[i]/not_filtered_values[i] for i in range(len(percentiles))]

# Create positions on x-axis
x = np.arange(len(percentiles))
width = 0.35

# Create the grouped bar chart
ax = plt.subplot(1, 1, 1)

# Plot bars
bars1 = ax.bar(x - width/2, not_filtered_values, width, label='Not Filtered', color='#4472C4')
bars2 = ax.bar(x + width/2, filtered_values, width, label='Filtered', color='#ED7D31')

# Add labels
ax.set_ylabel('Copies Sold', fontsize=14)
ax.set_title('Comparison of Copies Sold Percentiles', fontweight='bold', fontsize=16)
ax.set_xticks(x)
ax.set_xticklabels([f'{p}th Percentile' for p in percentiles])
ax.legend()

# Use log scale for y-axis
ax.set_yscale('log')

# Add value annotations and ratio text
for i, (bar1, bar2, ratio) in enumerate(zip(bars1, bars2, ratios)):
    # Format values differently based on magnitude
    if bar1.get_height() < 1000:
        height1_text = f"{bar1.get_height():.0f}"
    else:
        height1_text = f"{bar1.get_height():,.0f}"
        
    if bar2.get_height() < 1000:
        height2_text = f"{bar2.get_height():.0f}"
    else:
        height2_text = f"{bar2.get_height():,.0f}"
    
    # Add text for non-filtered dataset
    ax.text(bar1.get_x() + bar1.get_width()/2, bar1.get_height()*1.1,
            height1_text, ha='center', va='bottom', color='#4472C4', fontweight='bold')
    
    # Add text for filtered dataset
    ax.text(bar2.get_x() + bar2.get_width()/2, bar2.get_height()*1.1,
            height2_text, ha='center', va='bottom', color='#ED7D31', fontweight='bold')
    
    # Add ratio text between bars
    y_pos = np.sqrt(bar1.get_height() * bar2.get_height())  # Geometric mean for positioning
    ax.text(i, y_pos, f"{ratio:.1f}x", ha='center', va='center', 
           bbox=dict(boxstyle="round,pad=0.3", fc='#FFF2CC', ec='#FFD966'),
           fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig("percentile_comparison.png", dpi=300, bbox_inches='tight')
plt.close()

In [9]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.path as path

# Create a visual representation of the two-stage model architecture
fig, ax = plt.subplots(figsize=(14, 10))

# Define colors
color_stage1 = '#5B9BD5'  # Blue
color_stage2 = '#ED7D31'  # Orange
color_input = '#70AD47'   # Green
color_output = '#FFC000'  # Yellow
color_arrow = '#A5A5A5'   # Gray

# Function to create boxes with rounded corners
def create_box(x, y, width, height, color, label, fontsize=12, fontweight='normal'):
    rect = patches.FancyBboxPatch(
        (x, y), width, height, 
        linewidth=2, 
        edgecolor='black',
        facecolor=color, 
        alpha=0.7, 
        zorder=2,
        boxstyle=patches.BoxStyle.Round(pad=0.3)
    )
    ax.add_patch(rect)
    ax.text(x + width/2, y + height/2, label, ha='center', va='center', 
            fontsize=fontsize, fontweight=fontweight, zorder=3)
    return rect

# Function to create arrows
def create_arrow(start_x, start_y, end_x, end_y, color, label=None, label_pos=0.5):
    arrow = patches.FancyArrowPatch((start_x, start_y), (end_x, end_y),
                                  connectionstyle="arc3,rad=0", 
                                  arrowstyle='-|>', linewidth=2,
                                  color=color, zorder=1)
    ax.add_patch(arrow)
    
    if label:
        # Position the label along the arrow
        mid_x = start_x + label_pos * (end_x - start_x)
        mid_y = start_y + label_pos * (end_y - start_y)
        
        ax.text(mid_x, mid_y, label, ha='center', va='center',
                fontsize=10, backgroundcolor='white', zorder=3)

# Create input box
create_box(1, 9, 3, 1, color_input, "Game Features\n(500+ characteristics)",
          fontsize=12, fontweight='bold')

# Create Stage 1 boxes
create_box(1, 6, 3, 1.5, color_stage1, "STAGE 1\nClassification Model", 
          fontsize=14, fontweight='bold')
create_box(1, 4.5, 3, 1, color_stage1, "Will game be in top 7%?",
          fontsize=12)

# Create conditional branching
create_box(0, 3, 1.5, 1, color_stage1, "No", fontsize=12)
create_box(3.5, 3, 1.5, 1, color_stage1, "Yes", fontsize=12)

# Create Stage 2 box
create_box(3, 1.5, 3, 1, color_stage2, "STAGE 2\nRegression Model",
          fontsize=14, fontweight='bold')

# Create output boxes
create_box(0, 0.5, 2, 1, color_output, "General Market\nPrediction", fontsize=12)
create_box(3, 0, 3, 1, color_output, "Precise Metrics:\nCopies Sold, Wishlists, Score", fontsize=12)

# Create connections
create_arrow(2.5, 9, 2.5, 7.5, color_arrow)
create_arrow(2.5, 6, 2.5, 5.5, color_arrow)
create_arrow(2.5, 4.5, 2.5, 4, color_arrow)

# Branching arrows
create_arrow(2.5, 4, 0.75, 3, color_arrow)
create_arrow(2.5, 4, 4.25, 3, color_arrow)

# Final arrows
create_arrow(0.75, 3, 1, 1.5, color_arrow)
create_arrow(4.25, 3, 4.5, 2.5, color_arrow)

# Output arrows
create_arrow(1, 1.5, 1, 0.5, color_arrow)
create_arrow(4.5, 2.5, 4.5, 1, color_arrow)

# Title
plt.text(3.5, 10.5, 'Two-Stage Model Architecture', ha='center', fontsize=18, fontweight='bold')

# Explanation text
explanation = (
    "This approach addresses dataset bias by first classifying if a game\n"
    "will be successful enough to have engagement metrics (Stage 1),\n"
    "then predicting specific performance metrics only for likely\n"
    "successful games (Stage 2)."
)
plt.text(7, 5, explanation, ha='left', va='center', fontsize=12,
        bbox=dict(facecolor='white', alpha=0.8, boxstyle='round,pad=0.5'))

# Set limits and turn off axis
ax.set_xlim(0, 10)
ax.set_ylim(0, 11)
ax.axis('off')

plt.tight_layout()
plt.savefig("two_stage_model.png", dpi=300, bbox_inches='tight')
plt.close()

In [12]:


# Load datasets
original_df = pd.read_csv("Cleancsv/gamalytics_data.csv")  # Original 90k dataset 
filtered_df = pd.read_csv("Proyecto_ML.csv")               # Filtered 7k dataset

# Create a flag for games that made it to the filtered dataset
original_df['in_filtered_set'] = original_df['Steam Id'].isin(filtered_df['appid']).astype(int)

# Define the metrics to visualize
metrics = ["Copies Sold", "Wishlists", "Review Score"]  # Using Review Score instead of bayesian_score if it's available

# Create a dataframe to store min/max values
results = []

for metric in metrics:
    if metric in original_df.columns:
        # Calculate statistics for filtered set
        filtered_data = original_df[original_df['in_filtered_set'] == 1][metric].dropna()
        filtered_min = filtered_data.min()
        filtered_max = filtered_data.max()
        filtered_median = filtered_data.median()
        
        # Calculate statistics for unfiltered set
        unfiltered_data = original_df[original_df['in_filtered_set'] == 0][metric].dropna()
        unfiltered_min = unfiltered_data.min()
        unfiltered_max = unfiltered_data.max()
        unfiltered_median = unfiltered_data.median()
        
        # Add to results
        results.append({
            'Metric': metric,
            'Dataset': 'Filtered',
            'Minimum': filtered_min,
            'Median': filtered_median,
            'Maximum': filtered_max
        })
        
        results.append({
            'Metric': metric,
            'Dataset': 'Unfiltered',
            'Minimum': unfiltered_min,
            'Median': unfiltered_median,
            'Maximum': unfiltered_max
        })

# Convert to DataFrame
results_df = pd.DataFrame(results)

# Print the results table
print(results_df)

# Create separate visualizations for each metric to handle different scales better
for metric in metrics:
    if metric not in original_df.columns:
        continue
    
    plt.figure(figsize=(12, 8))
    
    # Extract data for this metric
    metric_data = results_df[results_df['Metric'] == metric]
    
    # Create positions for the bars
    x = np.arange(3)  # min, median, max
    width = 0.35
    
    # Get filtered and unfiltered data
    filtered = metric_data[metric_data['Dataset'] == 'Filtered']
    unfiltered = metric_data[metric_data['Dataset'] == 'Unfiltered']
    
    # Extract the values
    filtered_values = [filtered['Minimum'].values[0], filtered['Median'].values[0], filtered['Maximum'].values[0]]
    unfiltered_values = [unfiltered['Minimum'].values[0], unfiltered['Median'].values[0], unfiltered['Maximum'].values[0]]
    
    # Check if we need log scale
    use_log_scale = (metric in ["Copies Sold", "Wishlists"])
    
    # For Review Score, calculate min values as percentage of max
    if metric == "Review Score":
        # Use linear scale for Review Score
        use_log_scale = False
        # Ensure minimum values are clearly visible with a linear scale
        bars1 = plt.bar(x - width/2, filtered_values, width, label='Filtered Dataset (7k games)', color='#ED7D31')
        bars2 = plt.bar(x + width/2, unfiltered_values, width, label='Unfiltered Dataset (90k games)', color='#4472C4')
    else:
        # For metrics with large ranges, use log scale
        if use_log_scale:
            # Add a small offset to zero values for log scale
            filtered_values = [max(0.1, val) for val in filtered_values]
            unfiltered_values = [max(0.1, val) for val in unfiltered_values]
            plt.yscale('log')
        
        bars1 = plt.bar(x - width/2, filtered_values, width, label='Filtered Dataset (7k games)', color='#ED7D31')
        bars2 = plt.bar(x + width/2, unfiltered_values, width, label='Unfiltered Dataset (90k games)', color='#4472C4')
    
    # Add labels and title
    plt.title(f'{metric} Range Comparison', fontsize=16, fontweight='bold')
    plt.xticks(x, ['Minimum', 'Median', 'Maximum'], fontsize=14)
    plt.ylabel(metric, fontsize=14)
    plt.grid(axis='y', alpha=0.3)
    plt.legend(fontsize=12)
    
    # Add value labels to the bars
    def add_value_labels(bars, values):
        for bar, value in zip(bars, values):
            height = bar.get_height()
            if metric in ["Copies Sold", "Wishlists"]:
                if value >= 1000000:
                    label = f'{value/1000000:.1f}M'
                elif value >= 1000:
                    label = f'{value/1000:.1f}K'
                else:
                    label = f'{value:.1f}'
            else:
                label = f'{value:.1f}'
                
            plt.text(bar.get_x() + bar.get_width()/2., height * 1.05,
                    label, ha='center', va='bottom', fontweight='bold')
    
    # Add value labels for both bar sets
    add_value_labels(bars1, filtered_values)  # Filtered dataset
    add_value_labels(bars2, unfiltered_values)  # Unfiltered dataset
    
    # Add a subtitle with the ratio info
    min_ratio = filtered['Minimum'].values[0] / max(1, unfiltered['Minimum'].values[0])
    median_ratio = filtered['Median'].values[0] / max(1, unfiltered['Median'].values[0])
    max_ratio = filtered['Maximum'].values[0] / max(1, unfiltered['Maximum'].values[0])
    
    subtitle = f"Ratios (Filtered/Unfiltered): Minimum: {min_ratio:.1f}x  |  Median: {median_ratio:.1f}x  |  Maximum: {max_ratio:.1f}x"
    plt.figtext(0.5, 0.01, subtitle, ha='center', fontsize=12, fontweight='bold')
    
    # Adjust layout
    plt.tight_layout()
    plt.subplots_adjust(bottom=0.15)  # Make room for the subtitle
    
    # Save the figure
    plt.savefig(f'{metric.replace(" ", "_")}_range_comparison.png', dpi=300, bbox_inches='tight')
    plt.close()

# Create a combined visualization with all metrics in separate subplots
plt.figure(figsize=(18, 15))

for i, metric in enumerate(metrics):
    if metric not in original_df.columns:
        continue
        
    plt.subplot(len(metrics), 1, i+1)
    
    # Extract data for this metric
    metric_data = results_df[results_df['Metric'] == metric]
    
    # Create positions for the bars
    x = np.arange(3)  # min, median, max
    width = 0.35
    
    # Get filtered and unfiltered data
    filtered = metric_data[metric_data['Dataset'] == 'Filtered']
    unfiltered = metric_data[metric_data['Dataset'] == 'Unfiltered']
    
    # Extract the values
    filtered_values = [filtered['Minimum'].values[0], filtered['Median'].values[0], filtered['Maximum'].values[0]]
    unfiltered_values = [unfiltered['Minimum'].values[0], unfiltered['Median'].values[0], unfiltered['Maximum'].values[0]]
    
    # For metrics with large ranges, use log scale
    if metric in ["Copies Sold", "Wishlists"]:
        # Add a small offset to zero values for log scale
        filtered_values = [max(0.1, val) for val in filtered_values]
        unfiltered_values = [max(0.1, val) for val in unfiltered_values]
        plt.yscale('log')
    
    # Create the bars
    bars1 = plt.bar(x - width/2, filtered_values, width, label='Filtered Dataset (7k games)', color='#ED7D31')
    bars2 = plt.bar(x + width/2, unfiltered_values, width, label='Unfiltered Dataset (90k games)', color='#4472C4')
    
    # Add labels and title
    plt.title(f'{metric} Range Comparison', fontsize=16, fontweight='bold')
    plt.xticks(x, ['Minimum', 'Median', 'Maximum'], fontsize=14)
    plt.ylabel(metric, fontsize=14)
    plt.grid(axis='y', alpha=0.3)
    
    # Only add legend to the first subplot
    if i == 0:
        plt.legend(fontsize=12)
    
    # Add value labels to the bars
    def add_value_labels(bars, values):
        for bar, value in zip(bars, values):
            height = bar.get_height()
            if metric in ["Copies Sold", "Wishlists"]:
                if value >= 1000000:
                    label = f'{value/1000000:.1f}M'
                elif value >= 1000:
                    label = f'{value/1000:.1f}K'
                else:
                    label = f'{value:.1f}'
            else:
                label = f'{value:.1f}'
                
            plt.text(bar.get_x() + bar.get_width()/2., height * 1.05,
                    label, ha='center', va='bottom', fontweight='bold')
    
    # Add value labels for both bar sets
    add_value_labels(bars1, filtered_values)  # Filtered dataset
    add_value_labels(bars2, unfiltered_values)  # Unfiltered dataset

# Add overall title
plt.suptitle('Dataset Range Comparison: Filtered vs. Unfiltered', 
             fontsize=20, fontweight='bold', y=0.98)

# Add explanation text
explanation = (
    "This visualization illustrates the extreme difference in value ranges between the filtered dataset (7,029 games) "
    "and the unfiltered dataset (90,000+ games). Note that even the minimum values in the filtered dataset are "
    "substantially higher than typical values in the unfiltered dataset, demonstrating the need for a two-model "
    "approach to accurately handle the full spectrum of games."
)

plt.figtext(0.5, 0.01, explanation, wrap=True, horizontalalignment='center', fontsize=14)

# Adjust layout
plt.tight_layout()
plt.subplots_adjust(hspace=0.4, bottom=0.1)

# Save the combined figure
plt.savefig('dataset_range_comparison_combined.png', dpi=300, bbox_inches='tight')
plt.close()

# Create a table with the actual values
fig, ax = plt.figure(figsize=(12, 6), constrained_layout=True)
ax = plt.subplot(111)
ax.axis('off')

# Create the table data
table_data = []
headers = ['Metric', 'Dataset', 'Minimum', 'Median', 'Maximum', 'Min Ratio', 'Med Ratio', 'Max Ratio']
for metric in metrics:
    if metric not in original_df.columns:
        continue
        
    # Extract data for this metric
    metric_data = results_df[results_df['Metric'] == metric]
    
    # Get filtered and unfiltered data
    filtered = metric_data[metric_data['Dataset'] == 'Filtered']
    unfiltered = metric_data[metric_data['Dataset'] == 'Unfiltered']
    
    # Calculate ratios
    min_ratio = filtered['Minimum'].values[0] / max(1, unfiltered['Minimum'].values[0])
    median_ratio = filtered['Median'].values[0] / max(1, unfiltered['Median'].values[0])
    max_ratio = filtered['Maximum'].values[0] / max(1, unfiltered['Maximum'].values[0])
    
    # Format the values
    if metric in ["Copies Sold", "Wishlists"]:
        f_min = f"{filtered['Minimum'].values[0]:,.0f}"
        f_med = f"{filtered['Median'].values[0]:,.0f}" 
        f_max = f"{filtered['Maximum'].values[0]:,.0f}"
        u_min = f"{unfiltered['Minimum'].values[0]:,.0f}"
        u_med = f"{unfiltered['Median'].values[0]:,.0f}"
        u_max = f"{unfiltered['Maximum'].values[0]:,.0f}"
    else:
        f_min = f"{filtered['Minimum'].values[0]:.1f}"
        f_med = f"{filtered['Median'].values[0]:.1f}"
        f_max = f"{filtered['Maximum'].values[0]:.1f}"
        u_min = f"{unfiltered['Minimum'].values[0]:.1f}"
        u_med = f"{unfiltered['Median'].values[0]:.1f}"
        u_max = f"{unfiltered['Maximum'].values[0]:.1f}"
    
    # Add to table data
    table_data.append([metric, 'Filtered', f_min, f_med, f_max, '', '', ''])
    table_data.append([metric, 'Unfiltered', u_min, u_med, u_max, f"{min_ratio:.1f}x", f"{median_ratio:.1f}x", f"{max_ratio:.1f}x"])

# Create the table
table = plt.table(cellText=table_data,
          colLabels=headers,
          loc='center',
          cellLoc='center',
          colWidths=[0.15, 0.12, 0.12, 0.12, 0.12, 0.12, 0.12, 0.12])

# Style the table
table.auto_set_font_size(False)
table.set_fontsize(12)
table.scale(1, 1.5)

# Color the header row
for i, key in enumerate(headers):
    table[(0, i)].set_facecolor('#4472C4')
    table[(0, i)].set_text_props(color='white', fontweight='bold')

# Color the ratio cells
for row in range(len(table_data)):
    if row % 2 == 1:  # Unfiltered rows
        for col in range(5, 8):  # Ratio columns
            table[(row+1, col)].set_facecolor('#FFC000')
            table[(row+1, col)].set_text_props(fontweight='bold')

plt.title('Dataset Value Comparison: Filtered vs. Unfiltered', fontsize=16, fontweight='bold')
plt.savefig('dataset_value_table.png', dpi=300, bbox_inches='tight')
plt.close()

  original_df = pd.read_csv("Cleancsv/gamalytics_data.csv")  # Original 90k dataset


         Metric     Dataset  Minimum   Median      Maximum
0   Copies Sold    Filtered    563.0  32976.0   52847649.0
1   Copies Sold  Unfiltered      0.0    390.0  302975930.0
2     Wishlists    Filtered    204.0  16500.0    4673900.0
3     Wishlists  Unfiltered     12.0   1100.0    5926800.0
4  Review Score    Filtered      0.0     84.0        100.0
5  Review Score  Unfiltered      0.0     80.0        100.0


TypeError: cannot unpack non-iterable Figure object

<Figure size 1200x600 with 0 Axes>