In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from catboost import CatBoostRegressor
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("Proyecto_ML.csv")

# Define targets
targets = ["Copies Sold", "Wishlists", "bayesian_score"]

# Define feature set (exclude target variables)
drop_cols = targets + ["appid", "name", "release_date", "developers", "publishers"]
X = df.drop(columns=drop_cols, errors='ignore')
X = X.select_dtypes(include='number')  # Keep only numeric features

In [3]:
plt.figure(figsize=(18, 6))

for i, target in enumerate(targets[:2]):  # Only plot Copies Sold and Wishlists (skip bayesian_score)
    plt.subplot(1, 2, i+1)
    
    # Use log scale to visualize the extreme skew
    sns.histplot(df[target], bins=50, kde=True)
    plt.title(f'Original Distribution: {target}')
    plt.yscale('log')
    plt.xscale('log')
    
plt.tight_layout()
plt.savefig('target_distribution_before.png', dpi=300)
plt.close()

In [4]:
# Define log transformation functions
def log_transform(y):
    return np.log1p(y)  # log1p = log(1+y) to handle zeros

def inverse_log_transform(y):
    return np.expm1(y)  # expm1 = exp(y)-1, inverse of log1p

# Define BoxCox transformations - alternative approach
from scipy import stats

def boxcox_transform(y):
    # Add small constant to ensure all values are positive
    y_positive = y + 1e-10  
    y_transformed, lambda_value = stats.boxcox(y_positive)
    # Store lambda for inverse transform
    boxcox_transform.lambda_value = lambda_value
    return y_transformed

def inverse_boxcox_transform(y):
    # Use stored lambda for inverse transform
    lambda_value = boxcox_transform.lambda_value
    if lambda_value == 0:
        return np.exp(y) - 1e-10
    else:
        return (lambda_value * y + 1) ** (1/lambda_value) - 1e-10

In [5]:
def train_transformed_model(X, y, target_name, transform='log', test_size=0.2, random_state=42):
    """
    Train a CatBoost model with transformed target.
    
    Parameters:
    -----------
    X : DataFrame
        Feature matrix
    y : Series
        Target variable
    target_name : str
        Name of the target variable
    transform : str, default='log'
        Transformation to apply ('log' or 'boxcox')
    test_size : float, default=0.2
        Proportion of the dataset to include in the test split
    random_state : int, default=42
        Controls the shuffling in train/test split
        
    Returns:
    --------
    model : TransformedTargetRegressor
        Trained model with target transformation
    X_test : DataFrame
        Test features
    y_test : Series
        Test target values
    metrics : dict
        Performance metrics
    """
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    
    # Choose transformation
    if transform == 'log':
        transformer = FunctionTransformer(log_transform, inverse_func=inverse_log_transform)
    elif transform == 'boxcox':
        transformer = FunctionTransformer(boxcox_transform, inverse_func=inverse_boxcox_transform)
    else:
        raise ValueError("Transform must be 'log' or 'boxcox'")
    
    # Create CatBoost regressor
    regressor = CatBoostRegressor(
        iterations=500,
        learning_rate=0.05,
        depth=6,
        loss_function='RMSE',
        random_seed=random_state,
        verbose=100
    )
    
    # Create TransformedTargetRegressor
    model = TransformedTargetRegressor(
        regressor=regressor,
        transformer=transformer
    )
    
    # Train model
    model.fit(X_train, y_train)
    
    # Make predictions (automatically un-transforms the target)
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    metrics = {
        'r2': r2,
        'mae': mae,
        'rmse': rmse
    }
    
    # Print results
    print(f"\nResults for {target_name} with {transform} transformation:")
    print(f"R² Score: {r2:.4f}")
    print(f"MAE: {mae:.2f}")
    print(f"RMSE: {rmse:.2f}")
    
    return model, X_test, y_test, y_pred, metrics

In [6]:
results = {}

# Train models for each target
for target in targets:
    y = df[target]
    
    # Skip targets with non-positive values for BoxCox transformation
    if (y <= 0).any() and target != "bayesian_score":
        print(f"Warning: {target} contains non-positive values. Using log transform only.")
        transforms = ['log']
    else:
        transforms = ['log', 'boxcox']
    
    for transform in transforms:
        # Skip BoxCox for bayesian_score (tends to be more normal already)
        if target == "bayesian_score" and transform == "boxcox":
            continue
            
        model, X_test, y_test, y_pred, metrics = train_transformed_model(
            X, y, target, transform=transform
        )
        
        key = f"{target}_{transform}"
        results[key] = {
            'model': model,
            'X_test': X_test,
            'y_test': y_test,
            'y_pred': y_pred,
            'metrics': metrics
        }
        
        # Save the model
        import joblib
        model_filename = f"transformed_{transform}_{target.replace(' ', '_')}.pkl"
        joblib.dump(model, model_filename)
        print(f"Model saved as {model_filename}\n")

0:	learn: 1.8026347	total: 154ms	remaining: 1m 16s
100:	learn: 0.5951611	total: 612ms	remaining: 2.42s
200:	learn: 0.5282901	total: 1.06s	remaining: 1.57s
300:	learn: 0.4922613	total: 1.52s	remaining: 1.01s
400:	learn: 0.4666989	total: 1.95s	remaining: 481ms
499:	learn: 0.4458664	total: 2.37s	remaining: 0us

Results for Copies Sold with log transformation:
R² Score: 0.7720
MAE: 160349.26
RMSE: 914071.62
Model saved as transformed_log_Copies_Sold.pkl



ValueError: Data must be 1-dimensional.

In [7]:
import numpy as np
import pandas as pd
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from catboost import CatBoostRegressor
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# Load your dataset
df = pd.read_csv("Proyecto_ML.csv")

# Define targets
targets = ["Copies Sold", "Wishlists", "bayesian_score"]

# Define feature set (exclude target variables)
drop_cols = targets + ["appid", "name", "release_date", "developers", "publishers"]
X = df.drop(columns=drop_cols, errors='ignore')
X = X.select_dtypes(include='number')  # Keep only numeric features

# Plot original target distribution
plt.figure(figsize=(18, 6))

for i, target in enumerate(targets[:2]):  # Only plot Copies Sold and Wishlists (skip bayesian_score)
    plt.subplot(1, 2, i+1)
    
    # Use log scale to visualize the extreme skew
    sns.histplot(df[target], bins=50, kde=True)
    plt.title(f'Original Distribution: {target}')
    plt.yscale('log')
    plt.xscale('log')
    
plt.tight_layout()
plt.savefig('target_distribution_before.png', dpi=300)
plt.close()

# Define log transformation functions
def log_transform(y):
    return np.log1p(y)  # log1p = log(1+y) to handle zeros

def inverse_log_transform(y):
    return np.expm1(y)  # expm1 = exp(y)-1, inverse of log1p

# Create a function to train models with log-transformed targets
def train_log_model(X, y, target_name, test_size=0.2, random_state=42):
    """Train a CatBoost model with log-transformed target."""
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    
    # Create log transformer
    transformer = FunctionTransformer(log_transform, inverse_func=inverse_log_transform)
    
    # Create CatBoost regressor
    regressor = CatBoostRegressor(
        iterations=500,
        learning_rate=0.05,
        depth=6,
        loss_function='RMSE',
        random_seed=random_state,
        verbose=100
    )
    
    # Create TransformedTargetRegressor
    model = TransformedTargetRegressor(
        regressor=regressor,
        transformer=transformer
    )
    
    # Train model
    model.fit(X_train, y_train)
    
    # Make predictions (automatically un-transforms the target)
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    metrics = {
        'r2': r2,
        'mae': mae,
        'rmse': rmse
    }
    
    # Print results
    print(f"\nResults for {target_name} with log transformation:")
    print(f"R² Score: {r2:.4f}")
    print(f"MAE: {mae:.2f}")
    print(f"RMSE: {rmse:.2f}")
    
    return model, X_test, y_test, y_pred, metrics

# Store results
results = {}

# Train log-transformed models for each target
for target in targets:
    y = df[target]
    
    print(f"\nTraining log-transformed model for {target}...")
    model, X_test, y_test, y_pred, metrics = train_log_model(X, y, target)
    
    key = f"{target}_log"
    results[key] = {
        'model': model,
        'X_test': X_test,
        'y_test': y_test,
        'y_pred': y_pred,
        'metrics': metrics
    }
    
    # Save the model
    model_filename = f"log_transformed_{target.replace(' ', '_')}.pkl"
    joblib.dump(model, model_filename)
    print(f"Model saved as {model_filename}")

# Visualize predictions vs actual values
plt.figure(figsize=(15, 15))

for i, target in enumerate(targets):
    key = f"{target}_log"
    
    plt.subplot(len(targets), 1, i+1)
    
    y_test = results[key]['y_test']
    y_pred = results[key]['y_pred']
    r2 = results[key]['metrics']['r2']
    
    # Plot original scale
    plt.scatter(y_test, y_pred, alpha=0.5, edgecolor='white', s=40)
    
    # Add reference line
    max_val = max(y_test.max(), y_pred.max())
    min_val = min(y_test.min(), y_pred.min())
    plt.plot([min_val, max_val], [min_val, max_val], 'r--', alpha=0.8)
    
    # Use log scale for Copies Sold and Wishlists
    if target in ["Copies Sold", "Wishlists"]:
        plt.xscale('log')
        plt.yscale('log')
    
    plt.title(f'{target} - Log Transform (R² = {r2:.4f})')
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('log_transformed_predictions.png', dpi=300)
plt.close()

# Try to compare with original models if available
improvement_summary = []

for target in targets:
    try:
        # Load original model (you may need to adjust the filename)
        original_model_path = f"catboost_model_{target.replace(' ', '_')}.pkl"
        original_model = joblib.load(original_model_path)
        
        # Make predictions with original model
        original_preds = original_model.predict(results[f"{target}_log"]['X_test'])
        original_r2 = r2_score(results[f"{target}_log"]['y_test'], original_preds)
        
        # Get transformed model results
        transformed_r2 = results[f"{target}_log"]['metrics']['r2']
        
        # Calculate improvement
        improvement = transformed_r2 - original_r2
        improvement_pct = (improvement / original_r2) * 100 if original_r2 > 0 else np.nan
        
        improvement_summary.append({
            'Target': target,
            'Original R²': original_r2,
            'Log-Transformed R²': transformed_r2,
            'Improvement': improvement,
            'Improvement %': improvement_pct
        })
        
        # Plot comparison of residuals
        plt.figure(figsize=(12, 6))
        
        # Original model residuals
        plt.subplot(1, 2, 1)
        residuals_orig = original_preds - results[f"{target}_log"]['y_test']
        plt.scatter(original_preds, residuals_orig, alpha=0.5)
        plt.axhline(y=0, color='r', linestyle='-')
        plt.title(f'Original Model Residuals - {target}')
        plt.xlabel('Predicted')
        plt.ylabel('Residuals')
        
        # Transformed model residuals
        plt.subplot(1, 2, 2)
        residuals_trans = results[f"{target}_log"]['y_pred'] - results[f"{target}_log"]['y_test']
        plt.scatter(results[f"{target}_log"]['y_pred'], residuals_trans, alpha=0.5)
        plt.axhline(y=0, color='r', linestyle='-')
        plt.title(f'Log-Transformed Model Residuals - {target}')
        plt.xlabel('Predicted')
        plt.ylabel('Residuals')
        
        plt.tight_layout()
        plt.savefig(f'residual_comparison_{target}.png', dpi=300)
        plt.close()
        
    except (FileNotFoundError, KeyError) as e:
        print(f"Could not compare with original model for {target}: {e}")

# Display improvement summary
if improvement_summary:
    improvement_df = pd.DataFrame(improvement_summary)
    print("\nImprovement Summary:")
    print(improvement_df.to_string(index=False, float_format=lambda x: f"{x:.4f}"))
    
    # Save summary to CSV
    improvement_df.to_csv('log_transformation_improvements.csv', index=False)
else:
    print("\nNo comparison with original models was possible.")


Training log-transformed model for Copies Sold...
0:	learn: 1.8026347	total: 3.78ms	remaining: 1.89s
100:	learn: 0.5951611	total: 537ms	remaining: 2.12s
200:	learn: 0.5282901	total: 1.03s	remaining: 1.53s
300:	learn: 0.4922613	total: 1.54s	remaining: 1.02s
400:	learn: 0.4666989	total: 2.04s	remaining: 503ms
499:	learn: 0.4458664	total: 2.53s	remaining: 0us

Results for Copies Sold with log transformation:
R² Score: 0.7720
MAE: 160349.26
RMSE: 914071.62
Model saved as log_transformed_Copies_Sold.pkl

Training log-transformed model for Wishlists...
0:	learn: 1.6173342	total: 4.21ms	remaining: 2.1s
100:	learn: 0.5128806	total: 484ms	remaining: 1.91s
200:	learn: 0.4744640	total: 981ms	remaining: 1.46s
300:	learn: 0.4478259	total: 1.44s	remaining: 953ms
400:	learn: 0.4273571	total: 1.92s	remaining: 473ms
499:	learn: 0.4124890	total: 2.38s	remaining: 0us

Results for Wishlists with log transformation:
R² Score: 0.7867
MAE: 29436.44
RMSE: 118690.82
Model saved as log_transformed_Wishlists.pk