# Machine Learning Regression Model

This notebook implements an ensemble regression model using XGBoost, Gradient Boosting, and Random Forest.

In [None]:
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, VotingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import datetime

# One shared seed, reused by every estimator and by the train/validation split
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Hide all warnings so the notebook output stays readable
warnings.filterwarnings('ignore')

## Loading Data

In [None]:
print("Loading datasets...")
# Read the training set, the test set and the submission template in one pass
train_df, test_df, submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

print(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")

## Feature Engineering Functions

In [None]:
# Function to preprocess date features
def process_date_features(df):
    """Derive calendar features from the ``publication_timestamp`` column.

    The column is parsed with the ``%d-%m-%Y`` format; values that do not
    match become ``NaT`` and yield missing values in every derived column.
    ``df`` is modified in place and also returned.

    Columns written: ``publication_timestamp`` (now datetime),
    ``release_year``, ``release_month``, ``release_day``,
    ``days_since_2000``, ``month_sin`` and ``month_cos``.
    """
    parsed = pd.to_datetime(
        df['publication_timestamp'], format='%d-%m-%Y', errors='coerce'
    )
    df['publication_timestamp'] = parsed

    # Plain calendar components
    df['release_year'] = parsed.dt.year
    df['release_month'] = parsed.dt.month
    df['release_day'] = parsed.dt.day

    # Whole days elapsed since 1 January 2000
    df['days_since_2000'] = (parsed - pd.Timestamp('2000-01-01')).dt.days

    # Map the month onto a circle so that December and January are neighbours
    month_angle = 2 * np.pi * df['release_month'] / 12
    df['month_sin'] = np.sin(month_angle)
    df['month_cos'] = np.cos(month_angle)

    return df

# Function to create interaction features
def create_interaction_features(df):
    """Add interaction, duration, ratio and per-row aggregate columns.

    ``df`` is modified in place and also returned. It must contain the
    columns ``intensity_index_{0,1,2}``, ``rhythmic_cohesion_{0,1,2}``,
    ``duration_ms_{0,1,2}``, ``organic_texture_{0,1,2}`` and
    ``emotional_resonance_{0,1,2}``.

    Columns written, in order:
      * ``energy_dance_product``, ``energy_dance_product_1``,
        ``energy_dance_product_2`` - intensity times rhythmic cohesion per
        track (missing values of tracks 1 and 2 count as 0; track 0 is not
        filled, so a missing value there stays missing).
      * ``avg_duration``, ``total_duration``.
      * ``organic_to_energy_0``.
      * ``<metric>_mean|std|max|min`` for each of the four per-track metrics,
        computed over the non-missing tracks of each row. ``std`` is the
        population standard deviation (``ddof=0``); a row with no values at
        all gets NaN.
    """
    # Per-track products
    df['energy_dance_product'] = df['intensity_index_0'] * df['rhythmic_cohesion_0']
    for track in (1, 2):
        df[f'energy_dance_product_{track}'] = (
            df[f'intensity_index_{track}'].fillna(0)
            * df[f'rhythmic_cohesion_{track}'].fillna(0)
        )

    # Song length features
    # NOTE(review): avg_duration always divides by 3, even when track 1 or 2
    # is missing, so it is exactly total_duration / 3 - confirm this is intended.
    total_duration = (
        df['duration_ms_0']
        + df['duration_ms_1'].fillna(0)
        + df['duration_ms_2'].fillna(0)
    )
    df['avg_duration'] = total_duration / 3
    df['total_duration'] = total_duration

    # Ratio feature; the small constant avoids division by zero
    df['organic_to_energy_0'] = df['organic_texture_0'] / (df['intensity_index_0'] + 0.001)

    # Aggregate each metric across the three tracks with vectorized row
    # reductions. These skip NaN by default, which matches reducing the
    # NaN-dropped row, without calling a Python function once per row.
    for metric in ('rhythmic_cohesion', 'intensity_index', 'organic_texture', 'emotional_resonance'):
        per_track = df[[f"{metric}_{i}" for i in range(3)]]
        df[f"{metric}_mean"] = per_track.mean(axis=1)
        # ddof=0 keeps the population standard deviation that np.std yields
        df[f"{metric}_std"] = per_track.std(axis=1, ddof=0)
        df[f"{metric}_max"] = per_track.max(axis=1)
        df[f"{metric}_min"] = per_track.min(axis=1)

    return df

## Data Preprocessing

In [None]:
# Process both datasets
print("Processing features...")
# Bind each result back to its own name. Assigning to a loop variable would
# only rebind that variable and discard the returned frame, so the features
# would survive only as long as the helpers keep mutating their input.
train_df = create_interaction_features(process_date_features(train_df))
test_df = create_interaction_features(process_date_features(test_df))

In [None]:
# Start from the known categorical columns, then add every label/collective column
categorical_cols = ['weekday_of_release', 'season_of_release', 'lunar_phase']
composition_cols = [
    name for name in train_df.columns
    if 'composition_label' in name or 'creator_collective' in name
]
categorical_cols += composition_cols

# Sweep the training frame so that any remaining text column is also treated
# as categorical (the id and timestamp columns are deliberately left out)
for col in train_df.columns:
    is_text = train_df[col].dtype == 'object' or pd.api.types.is_string_dtype(train_df[col])
    if not is_text:
        continue
    if col in categorical_cols or col == 'id' or 'timestamp' in col:
        continue
    categorical_cols.append(col)
    print(f"Added column to categorical: {col}")

In [None]:
# Find categorical columns with too many distinct values to one-hot encode
high_cardinality_threshold = 50  # Adjust this threshold as needed
high_cardinality_cols = [
    name for name in categorical_cols
    if train_df[name].nunique() > high_cardinality_threshold
]
for col in high_cardinality_cols:
    print(f"High cardinality column: {col} with {train_df[col].nunique()} unique values")

# Keep the most frequent training categories and fold every other value
# (including missing values, which are never among the top categories)
# into a single 'Other' bucket, for both train and test
for col in high_cardinality_cols:
    top_categories = train_df[col].value_counts().nlargest(high_cardinality_threshold).index
    for frame in (train_df, test_df):
        frame[col] = frame[col].where(frame[col].isin(top_categories), 'Other')
    print(f"Limited {col} to top {high_cardinality_threshold} categories plus 'Other'")

In [None]:
# Everything that is not categorical, the target, the id or a timestamp
# column is treated as numerical
non_numerical = set(categorical_cols) | {'target', 'id'}
numerical_cols = [
    name for name in train_df.columns
    if name not in non_numerical and 'timestamp' not in name
]

# Report the size of each column group for debugging
print(f"Number of numerical columns: {len(numerical_cols)}")
print(f"Number of categorical columns: {len(categorical_cols)}")

## Model Building

In [None]:
# Assemble the preprocessing stage of the model pipeline
print("Building preprocessing pipeline...")

# Numerical branch: fill gaps with the median, then standardize
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored instead of raising
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its branch; any column in neither list is dropped
column_branches = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(
    transformers=column_branches,
    remainder='drop',
    n_jobs=1,
)

In [None]:
# Separate the target from the remaining columns
y = train_df['target']
X = train_df.drop(columns='target')

# Hold out 20% of the rows for validation, using the shared seed
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

In [None]:
print("Training models...")

# Base learners of the ensemble, each seeded with RANDOM_STATE
xgb_regressor = XGBRegressor(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
gbr_regressor = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    random_state=RANDOM_STATE,
)
rf_regressor = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

# Voting ensemble over the three regressors
ensemble_model = VotingRegressor([
    ('xgb', xgb_regressor),
    ('gbr', gbr_regressor),
    ('rf', rf_regressor),
])

# Full pipeline: preprocessing followed by the ensemble
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', ensemble_model),
]
model_pipeline = Pipeline(steps=pipeline_steps)

In [None]:
# Fit the full pipeline on the training split: the preprocessor is fitted on
# X_train, then the voting ensemble is trained on the transformed features.
model_pipeline.fit(X_train, y_train)

## Model Evaluation

In [None]:
# Score the held-out validation split with root-mean-squared error
val_predictions = model_pipeline.predict(X_val)
val_mse = mean_squared_error(y_val, val_predictions)
val_rmse = np.sqrt(val_mse)
print(f"Validation RMSE: {val_rmse:.4f}")

## Generate Predictions

In [None]:
# Prediction clipping function to enforce bounds
def clip_predictions(preds, lower=1, upper=100):
    """Clamp predictions into the closed interval ``[lower, upper]``.

    Args:
        preds: Array-like of predicted values.
        lower: Smallest value allowed in the result (default 1).
        upper: Largest value allowed in the result (default 100).

    Returns:
        A NumPy array shaped like ``preds`` with every value below ``lower``
        raised to ``lower`` and every value above ``upper`` lowered to
        ``upper``.

    Raises:
        ValueError: If ``lower`` is greater than ``upper``.
    """
    # np.clip does not reject inverted bounds, so check them explicitly
    if lower > upper:
        raise ValueError(f"lower ({lower}) must not exceed upper ({upper})")
    return np.clip(preds, lower, upper)

# Predict on the test set, then force the predictions into the allowed range
print("Generating predictions...")
raw_test_predictions = model_pipeline.predict(test_df)
test_predictions = clip_predictions(raw_test_predictions)

In [None]:
# Put the predictions into the submission template and write it to disk
# NOTE(review): assumes sample_submission.csv rows are in the same order as test.csv - confirm
submission = submission.assign(target=test_predictions)
submission.to_csv("submission.csv", index=False)
print("Predictions saved to submission.csv")

## Feature Importance Analysis

In [None]:
# Feature importance of the XGBoost member of the fitted voting ensemble.
# This is a best-effort diagnostic: any failure is reported and skipped.
try:
    fitted_preprocessor = model_pipeline.named_steps['preprocessor']
    fitted_ensemble = model_pipeline.named_steps['model']

    # Look the estimator up by name so that reordering the ensemble members
    # cannot silently select a different model.
    xgb_model = fitted_ensemble.named_estimators_['xgb']

    # Ask the fitted ColumnTransformer for its output column names instead of
    # rebuilding them by hand: a hand-built list gets out of sync as soon as a
    # transformer drops or reorders a column. The names carry the branch
    # prefix ("num__" / "cat__").
    feature_names = fitted_preprocessor.get_feature_names_out()

    # One importance value per transformed feature
    importances = xgb_model.feature_importances_

    # Keep the 20 most important features for plotting
    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importances
    }).sort_values('Importance', ascending=False).head(20)

    # Plot feature importances
    import matplotlib.pyplot as plt
    plt.figure(figsize=(12, 8))
    plt.barh(importance_df['Feature'], importance_df['Importance'])
    # barh draws the first row at the bottom; flip the axis so the most
    # important feature is at the top of the chart
    plt.gca().invert_yaxis()
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.title('Top 20 Feature Importances')
    plt.tight_layout()
    plt.show()
except Exception as e:
    print(f"Could not extract feature importances: {e}")