In [None]:
"""
predict_blends_lgbm.py
Score‑oriented pipeline for BlendProperty prediction.
▶ Requires: lightgbm, pandas, numpy, scikit‑learn
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
!pip install lightgbm
!pip install --upgrade lightgbm
import lightgbm as lgb
from lightgbm import LGBMRegressor

# -------------------------------
# 1. Load data
# -------------------------------
TRAIN_PATH = "/bin/testing1/train.csv"
TEST_PATH = "/bin/testing1/test.csv"
OUT_PATH = "/bin/testing1/predicted_solution.csv"


train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)

target_cols  = [c for c in train.columns if c.startswith("BlendProperty")]
feature_cols = [c for c in train.columns if c not in target_cols + ["ID"]]

X = train[feature_cols]
X_test = test[feature_cols]

# -------------------------------
# 2. Basic feature engineering
#    (fast but effective)
# -------------------------------
def add_interactions(df):
    frac_cols     = [c for c in df.columns if "fraction"  in c]
    prop1_cols    = [c for c in df.columns if "Property1" in c]
    # Pairwise products of fractions
    for i, c1 in enumerate(frac_cols):
        for c2 in frac_cols[i+1:]:
            df[f"{c1}*{c2}"] = df[c1] * df[c2]
    # Product of fraction and its Property1
    for fc, pc in zip(frac_cols, prop1_cols):
        df[f"{fc}x{pc}"] = df[fc] * df[pc]
    return df

X       = add_interactions(X.copy())
X_test  = add_interactions(X_test.copy())
feature_cols = X.columns  # update to include new features

# -------------------------------
# 3. Cross‑validated training
# -------------------------------
N_SPLITS   = 5
SEED_LIST  = [0, 1, 2]      # ensembling seeds
oof_preds  = np.zeros((len(train), len(target_cols)))
test_preds = np.zeros((len(test),  len(target_cols)))

for seed in SEED_LIST:
    kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=seed)
    for t_idx, target in enumerate(target_cols):
        fold_test_preds = np.zeros(len(test))
        for fold, (trn_idx, val_idx) in enumerate(kf.split(X)):
            model = LGBMRegressor(
                n_estimators=4000,
                learning_rate=0.015,
                num_leaves=255,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=seed + fold,
                objective="regression",
                metric="rmse",
                n_jobs=-1
            )

            model.fit(
                    X.iloc[trn_idx], train[target].iloc[trn_idx],
                    eval_set=[(X.iloc[val_idx], train[target].iloc[val_idx])],
                    eval_metric="rmse"
                )

            # Out‑of‑fold
            oof_preds[val_idx, t_idx] = model.predict(X.iloc[val_idx])
            # Accumulate test
            fold_test_preds += model.predict(X_test) / N_SPLITS
        test_preds[:, t_idx] += fold_test_preds / len(SEED_LIST)

# -------------------------------
# 4. Save submission
# -------------------------------
subm = pd.DataFrame(test_preds, columns=target_cols)
subm.insert(0, "ID", test["ID"])
subm.to_csv(OUT_PATH, index=False)
print(f"Submission saved to {OUT_PATH}")

# Optional: print OOF CV RMSE per target
from sklearn.metrics import mean_squared_error
for i, tgt in enumerate(target_cols):
    rmse = mean_squared_error(train[tgt], oof_preds[:, i], squared=False)
    print(f"{tgt:<15}: CV RMSE = {rmse:.4f}")


Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightgbm
  Attempting uninstall: lightgbm
    Found existing installation: lightgbm 4.5.0
    Uninstalling lightgbm-4.5.0:
      Successfully uninstalled lightgbm-4.5.0
Successfully installed lightgbm-4.6.0


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001525 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16759
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 70
[LightGBM] [Info] Start training from score 0.003103


Exception ignored on calling ctypes callback function: <function _log_callback at 0x79b5361adda0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/lightgbm/basic.py", line 257, in _log_callback
    def _normalize_native_string(func: Callable[[str], None]) -> Callable[[str], None]:
    
KeyboardInterrupt: 


No further splits with positive gain, best gain: -inf


Exception ignored on calling ctypes callback function: <function _log_callback at 0x79b5361adda0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/lightgbm/basic.py", line 257, in _log_callback
    def _normalize_native_string(func: Callable[[str], None]) -> Callable[[str], None]:
    
KeyboardInterrupt: 


No further splits with positive gain, best gain: -inf
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000666 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16752
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 70
[LightGBM] [Info] Start training from score -0.026405


Exception ignored on calling ctypes callback function: <function _log_callback at 0x79b5361adda0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/lightgbm/basic.py", line 257, in _log_callback
    def _normalize_native_string(func: Callable[[str], None]) -> Callable[[str], None]:
    
KeyboardInterrupt: 


No further splits with positive gain, best gain: -inf


In [None]:
"""
predict_blends_lgbm.py
Score‑oriented pipeline for BlendProperty prediction.
▶ Requires: lightgbm, pandas, numpy, scikit‑learn
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
!pip install lightgbm
!pip install --upgrade lightgbm
import lightgbm as lgb
from lightgbm import LGBMRegressor

# -------------------------------
# 1. Load data
# -------------------------------
TRAIN_PATH = "/bin/testing1/train.csv"
TEST_PATH = "/bin/testing1/test.csv"
OUT_PATH = "/bin/testing1/predicted_solution.csv"


train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)

target_cols  = [c for c in train.columns if c.startswith("BlendProperty")]
feature_cols = [c for c in train.columns if c not in target_cols + ["ID"]]

X = train[feature_cols]
X_test = test[feature_cols]

# -------------------------------
# 2. Basic feature engineering
#    (fast but effective)
# -------------------------------
def add_interactions(df):
    frac_cols     = [c for c in df.columns if "fraction"  in c]
    prop1_cols    = [c for c in df.columns if "Property1" in c]
    # Pairwise products of fractions
    for i, c1 in enumerate(frac_cols):
        for c2 in frac_cols[i+1:]:
            df[f"{c1}*{c2}"] = df[c1] * df[c2]
    # Product of fraction and its Property1
    for fc, pc in zip(frac_cols, prop1_cols):
        df[f"{fc}x{pc}"] = df[fc] * df[pc]
    return df

X       = add_interactions(X.copy())
X_test  = add_interactions(X_test.copy())
feature_cols = X.columns  # update to include new features

# -------------------------------
# 3. Cross‑validated training
# -------------------------------
N_SPLITS   = 5
SEED_LIST  = [0, 1, 2]      # ensembling seeds
oof_preds  = np.zeros((len(train), len(target_cols)))
test_preds = np.zeros((len(test),  len(target_cols)))

for seed in SEED_LIST:
    kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=seed)
    for t_idx, target in enumerate(target_cols):
        fold_test_preds = np.zeros(len(test))
        for fold, (trn_idx, val_idx) in enumerate(kf.split(X)):
            model = LGBMRegressor(
                n_estimators=4000,
                learning_rate=0.015,
                num_leaves=255,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=seed + fold,
                objective="regression",
                metric="rmse",
                n_jobs=-1
            )

            model.fit(
                    X.iloc[trn_idx], train[target].iloc[trn_idx],
                    eval_set=[(X.iloc[val_idx], train[target].iloc[val_idx])],
                    eval_metric="rmse"
                )

            # Out‑of‑fold
            oof_preds[val_idx, t_idx] = model.predict(X.iloc[val_idx])
            # Accumulate test
            fold_test_preds += model.predict(X_test) / N_SPLITS
        test_preds[:, t_idx] += fold_test_preds / len(SEED_LIST)

# -------------------------------
# 4. Save submission
# -------------------------------
subm = pd.DataFrame(test_preds, columns=target_cols)
subm.insert(0, "ID", test["ID"])
subm.to_csv(OUT_PATH, index=False)
print(f"Submission saved to {OUT_PATH}")

# Optional: print OOF CV RMSE per target
from sklearn.metrics import mean_squared_error
for i, tgt in enumerate(target_cols):
    rmse = mean_squared_error(train[tgt], oof_preds[:, i], squared=False)
    print(f"{tgt:<15}: CV RMSE = {rmse:.4f}")


Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightgbm
  Attempting uninstall: lightgbm
    Found existing installation: lightgbm 4.5.0
    Uninstalling lightgbm-4.5.0:
      Successfully uninstalled lightgbm-4.5.0
Successfully installed lightgbm-4.6.0


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001548 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16759
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 70
[LightGBM] [Info] Start training from score -0.021847


In [1]:
"""
predict_blends_lgbm.py
Score‑oriented pipeline for BlendProperty prediction.
▶ Requires: lightgbm, pandas, numpy, scikit‑learn
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# Install LightGBM if not available
try:
    import lightgbm as lgb
    from lightgbm import LGBMRegressor
except ImportError:
    import subprocess
    import sys
    subprocess.check_call([sys.executable, "-m", "pip", "install", "lightgbm"])
    import lightgbm as lgb
    from lightgbm import LGBMRegressor

# -------------------------------
# 1. Load data
# -------------------------------
TRAIN_PATH = "/bin/testing1/train.csv"
TEST_PATH = "/bin/testing1/test.csv"
OUT_PATH = "/bin/testing1/predicted_solution.csv"

train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

target_cols = [c for c in train.columns if c.startswith("BlendProperty")]
feature_cols = [c for c in train.columns if c not in target_cols + ["ID"]]

X = train[feature_cols]
X_test = test[feature_cols]

print(f"Training data shape: {train.shape}")
print(f"Test data shape: {test.shape}")
print(f"Target columns: {len(target_cols)}")
print(f"Feature columns: {len(feature_cols)}")

# -------------------------------
# 2. Enhanced feature engineering
#    (optimized for better performance)
# -------------------------------
def add_interactions(df):
    """Enhanced feature engineering with optimized interactions"""
    df = df.copy()

    # Identify key column groups
    frac_cols = [c for c in df.columns if "fraction" in c.lower()]
    prop1_cols = [c for c in df.columns if "Property1" in c]
    prop2_cols = [c for c in df.columns if "Property2" in c]

    print(f"Creating features from {len(frac_cols)} fractions, {len(prop1_cols)} Property1 cols")

    # 1. Fraction interactions (limited to avoid explosion)
    for i, c1 in enumerate(frac_cols[:15]):  # Limit to top fractions
        for c2 in frac_cols[i+1:min(i+8, len(frac_cols))]:  # Selective pairing
            df[f"{c1}*{c2}"] = df[c1] * df[c2]

    # 2. Fraction-Property interactions
    for fc, pc in zip(frac_cols, prop1_cols):
        df[f"{fc}x{pc}"] = df[fc] * df[pc]

    # 3. Statistical features by group
    if len(frac_cols) > 1:
        df['frac_sum'] = df[frac_cols].sum(axis=1)
        df['frac_mean'] = df[frac_cols].mean(axis=1)
        df['frac_std'] = df[frac_cols].std(axis=1)
        df['frac_max'] = df[frac_cols].max(axis=1)
        df['frac_min'] = df[frac_cols].min(axis=1)
        df['frac_range'] = df['frac_max'] - df['frac_min']

    if len(prop1_cols) > 1:
        df['prop1_mean'] = df[prop1_cols].mean(axis=1)
        df['prop1_std'] = df[prop1_cols].std(axis=1)
        df['prop1_max'] = df[prop1_cols].max(axis=1)
        df['prop1_min'] = df[prop1_cols].min(axis=1)
        df['prop1_range'] = df['prop1_max'] - df['prop1_min']

    # 4. Ratio features (selective to avoid too many)
    for i, c1 in enumerate(frac_cols[:10]):
        for c2 in frac_cols[i+1:min(i+5, len(frac_cols))]:
            df[f"{c1}_ratio_{c2}"] = df[c1] / (df[c2] + 1e-8)

    # 5. Weighted combinations
    if len(frac_cols) >= 2 and len(prop1_cols) >= 2:
        # Weighted average of properties by fractions
        total_frac = df[frac_cols].sum(axis=1) + 1e-8
        weighted_prop = sum(df[fc] * df[pc] for fc, pc in zip(frac_cols, prop1_cols)) / total_frac
        df['weighted_prop_avg'] = weighted_prop

    return df

def preprocess_data(df):
    """Handle missing values and outliers"""
    df = df.copy()

    # Handle missing values
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if df[col].isnull().any():
            median_val = df[col].median()
            df[col].fillna(median_val, inplace=True)

    # Handle infinite values
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    for col in numeric_cols:
        if df[col].isnull().any():
            df[col].fillna(df[col].median(), inplace=True)

    # Light outlier clipping (preserves data while removing extreme outliers)
    for col in numeric_cols:
        Q1 = df[col].quantile(0.01)
        Q99 = df[col].quantile(0.99)
        df[col] = df[col].clip(Q1, Q99)

    return df

# Apply preprocessing
X = preprocess_data(X)
X_test = preprocess_data(X_test)

# Apply feature engineering
X = add_interactions(X)
X_test = add_interactions(X_test)

# Update feature columns
feature_cols = X.columns
print(f"Total features after engineering: {len(feature_cols)}")

# -------------------------------
# 3. Optimized cross-validated training
# -------------------------------
N_SPLITS = 5
SEED_LIST = [0, 1, 2]  # ensembling seeds
oof_preds = np.zeros((len(train), len(target_cols)))
test_preds = np.zeros((len(test), len(target_cols)))

# Optimized LightGBM parameters
lgbm_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 255,
    'learning_rate': 0.015,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'min_child_samples': 20,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'random_state': 42,
    'n_jobs': -1,
    'verbose': -1
}

print("Starting cross-validation training...")

for seed in SEED_LIST:
    print(f"\nTraining with seed {seed}...")
    kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=seed)

    for t_idx, target in enumerate(target_cols):
        print(f"  Training {target}...")
        fold_test_preds = np.zeros(len(test))
        fold_scores = []

        for fold, (trn_idx, val_idx) in enumerate(kf.split(X)):
            # Prepare data
            X_trn, X_val = X.iloc[trn_idx], X.iloc[val_idx]
            y_trn, y_val = train[target].iloc[trn_idx], train[target].iloc[val_idx]

            # Create model with optimized parameters
            model = LGBMRegressor(
                n_estimators=4000,
                early_stopping_rounds=100,
                **lgbm_params
            )

            # Fit model
            model.fit(
                X_trn, y_trn,
                eval_set=[(X_val, y_val)],
                eval_metric="rmse",
                callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)]
            )

            # Predictions
            val_pred = model.predict(X_val)
            test_pred = model.predict(X_test)

            # Store out-of-fold predictions (only for first seed)
            if seed == SEED_LIST[0]:
                oof_preds[val_idx, t_idx] = val_pred

            # Accumulate test predictions
            fold_test_preds += test_pred / N_SPLITS

            # Track fold score
            fold_score = mean_squared_error(y_val, val_pred, squared=False)
            fold_scores.append(fold_score)

        # Average across folds for this seed
        test_preds[:, t_idx] += fold_test_preds / len(SEED_LIST)

        # Print seed performance
        avg_score = np.mean(fold_scores)
        print(f"    Seed {seed} {target}: {avg_score:.4f} RMSE")

# -------------------------------
# 4. Enhanced results and validation
# -------------------------------
print("\n" + "="*60)
print("FINAL VALIDATION RESULTS")
print("="*60)

# Calculate and display OOF scores
for i, tgt in enumerate(target_cols):
    oof_rmse = mean_squared_error(train[tgt], oof_preds[:, i], squared=False)

    # Calculate R² score
    ss_res = np.sum((train[tgt] - oof_preds[:, i]) ** 2)
    ss_tot = np.sum((train[tgt] - np.mean(train[tgt])) ** 2)
    r2_score = 1 - (ss_res / ss_tot)

    print(f"{tgt:<20}: RMSE = {oof_rmse:.4f}, R² = {r2_score:.4f}")

# -------------------------------
# 5. Save submission with validation
# -------------------------------
subm = pd.DataFrame(test_preds, columns=target_cols)
subm.insert(0, "ID", test["ID"])

# Basic validation of predictions
print(f"\nPrediction validation:")
print(f"Predictions shape: {subm.shape}")
print(f"No missing values: {not subm.isnull().any().any()}")
print(f"No infinite values: {not np.isinf(subm.select_dtypes(include=[np.number])).any().any()}")

# Display prediction statistics
print(f"\nPrediction statistics:")
for target in target_cols:
    pred_stats = subm[target].describe()
    print(f"{target}: min={pred_stats['min']:.4f}, max={pred_stats['max']:.4f}, mean={pred_stats['mean']:.4f}")

# Save results
subm.to_csv(OUT_PATH, index=False)
print(f"\nSubmission saved to {OUT_PATH}")

# Performance summary
overall_rmse = np.mean([mean_squared_error(train[tgt], oof_preds[:, i], squared=False)
                       for i, tgt in enumerate(target_cols)])
print(f"\nOverall CV RMSE: {overall_rmse:.4f}")
print(f"Model used {len(feature_cols)} features")
print(f"Training completed with {len(SEED_LIST)} seeds and {N_SPLITS}-fold CV")

Training data shape: (2000, 65)
Test data shape: (500, 56)
Target columns: 10
Feature columns: 55
Creating features from 5 fractions, 10 Property1 cols
Creating features from 5 fractions, 10 Property1 cols
Total features after engineering: 92
Starting cross-validation training...

Training with seed 0...
  Training BlendProperty1...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1980]	valid_0's rmse: 0.132871


TypeError: got an unexpected keyword argument 'squared'

In [2]:
"""
predict_blends_lgbm.py
Score‑oriented pipeline for BlendProperty prediction.
▶ Requires: lightgbm, pandas, numpy, scikit‑learn
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# Install LightGBM if not available
try:
    import lightgbm as lgb
    from lightgbm import LGBMRegressor
except ImportError:
    import subprocess
    import sys
    subprocess.check_call([sys.executable, "-m", "pip", "install", "lightgbm"])
    import lightgbm as lgb
    from lightgbm import LGBMRegressor

# -------------------------------
# 1. Load data
# -------------------------------
TRAIN_PATH = "/bin/testing1/train.csv"
TEST_PATH = "/bin/testing1/test.csv"
OUT_PATH = "/bin/testing1/predicted_solution.csv"

train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

target_cols = [c for c in train.columns if c.startswith("BlendProperty")]
feature_cols = [c for c in train.columns if c not in target_cols + ["ID"]]

X = train[feature_cols]
X_test = test[feature_cols]

print(f"Training data shape: {train.shape}")
print(f"Test data shape: {test.shape}")
print(f"Target columns: {len(target_cols)}")
print(f"Feature columns: {len(feature_cols)}")

# -------------------------------
# 2. Enhanced feature engineering
#    (optimized for better performance)
# -------------------------------
def add_interactions(df):
    """Enhanced feature engineering with optimized interactions"""
    df = df.copy()

    # Identify key column groups
    frac_cols = [c for c in df.columns if "fraction" in c.lower()]
    prop1_cols = [c for c in df.columns if "Property1" in c]
    prop2_cols = [c for c in df.columns if "Property2" in c]

    print(f"Creating features from {len(frac_cols)} fractions, {len(prop1_cols)} Property1 cols")

    # 1. Fraction interactions (limited to avoid explosion)
    for i, c1 in enumerate(frac_cols[:15]):  # Limit to top fractions
        for c2 in frac_cols[i+1:min(i+8, len(frac_cols))]:  # Selective pairing
            df[f"{c1}*{c2}"] = df[c1] * df[c2]

    # 2. Fraction-Property interactions
    for fc, pc in zip(frac_cols, prop1_cols):
        df[f"{fc}x{pc}"] = df[fc] * df[pc]

    # 3. Statistical features by group
    if len(frac_cols) > 1:
        df['frac_sum'] = df[frac_cols].sum(axis=1)
        df['frac_mean'] = df[frac_cols].mean(axis=1)
        df['frac_std'] = df[frac_cols].std(axis=1)
        df['frac_max'] = df[frac_cols].max(axis=1)
        df['frac_min'] = df[frac_cols].min(axis=1)
        df['frac_range'] = df['frac_max'] - df['frac_min']

    if len(prop1_cols) > 1:
        df['prop1_mean'] = df[prop1_cols].mean(axis=1)
        df['prop1_std'] = df[prop1_cols].std(axis=1)
        df['prop1_max'] = df[prop1_cols].max(axis=1)
        df['prop1_min'] = df[prop1_cols].min(axis=1)
        df['prop1_range'] = df['prop1_max'] - df['prop1_min']

    # 4. Ratio features (selective to avoid too many)
    for i, c1 in enumerate(frac_cols[:10]):
        for c2 in frac_cols[i+1:min(i+5, len(frac_cols))]:
            df[f"{c1}_ratio_{c2}"] = df[c1] / (df[c2] + 1e-8)

    # 5. Weighted combinations
    if len(frac_cols) >= 2 and len(prop1_cols) >= 2:
        # Weighted average of properties by fractions
        total_frac = df[frac_cols].sum(axis=1) + 1e-8
        weighted_prop = sum(df[fc] * df[pc] for fc, pc in zip(frac_cols, prop1_cols)) / total_frac
        df['weighted_prop_avg'] = weighted_prop

    return df

def preprocess_data(df):
    """Handle missing values and outliers"""
    df = df.copy()

    # Handle missing values
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if df[col].isnull().any():
            median_val = df[col].median()
            df[col].fillna(median_val, inplace=True)

    # Handle infinite values
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    for col in numeric_cols:
        if df[col].isnull().any():
            df[col].fillna(df[col].median(), inplace=True)

    # Light outlier clipping (preserves data while removing extreme outliers)
    for col in numeric_cols:
        Q1 = df[col].quantile(0.01)
        Q99 = df[col].quantile(0.99)
        df[col] = df[col].clip(Q1, Q99)

    return df

# Apply preprocessing
X = preprocess_data(X)
X_test = preprocess_data(X_test)

# Apply feature engineering
X = add_interactions(X)
X_test = add_interactions(X_test)

# Update feature columns
feature_cols = X.columns
print(f"Total features after engineering: {len(feature_cols)}")

# -------------------------------
# 3. Optimized cross-validated training
# -------------------------------
N_SPLITS = 5
SEED_LIST = [0, 1, 2]  # ensembling seeds
oof_preds = np.zeros((len(train), len(target_cols)))
test_preds = np.zeros((len(test), len(target_cols)))

# Optimized LightGBM parameters
lgbm_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 255,
    'learning_rate': 0.015,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'min_child_samples': 20,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'random_state': 42,
    'n_jobs': -1,
    'verbose': -1
}

print("Starting cross-validation training...")

for seed in SEED_LIST:
    print(f"\nTraining with seed {seed}...")
    kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=seed)

    for t_idx, target in enumerate(target_cols):
        print(f"  Training {target}...")
        fold_test_preds = np.zeros(len(test))
        fold_scores = []

        for fold, (trn_idx, val_idx) in enumerate(kf.split(X)):
            # Prepare data
            X_trn, X_val = X.iloc[trn_idx], X.iloc[val_idx]
            y_trn, y_val = train[target].iloc[trn_idx], train[target].iloc[val_idx]

            # Create model with optimized parameters
            model = LGBMRegressor(
                n_estimators=4000,
                early_stopping_rounds=100,
                **lgbm_params
            )

            # Fit model
            model.fit(
                X_trn, y_trn,
                eval_set=[(X_val, y_val)],
                eval_metric="rmse",
                callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)]
            )

            # Predictions
            val_pred = model.predict(X_val)
            test_pred = model.predict(X_test)

            # Store out-of-fold predictions (only for first seed)
            if seed == SEED_LIST[0]:
                oof_preds[val_idx, t_idx] = val_pred

            # Accumulate test predictions
            fold_test_preds += test_pred / N_SPLITS

            # Track fold score - FIXED: Calculate RMSE manually
            fold_score = np.sqrt(mean_squared_error(y_val, val_pred))
            fold_scores.append(fold_score)

        # Average across folds for this seed
        test_preds[:, t_idx] += fold_test_preds / len(SEED_LIST)

        # Print seed performance
        avg_score = np.mean(fold_scores)
        print(f"    Seed {seed} {target}: {avg_score:.4f} RMSE")

# -------------------------------
# 4. Enhanced results and validation
# -------------------------------
print("\n" + "="*60)
print("FINAL VALIDATION RESULTS")
print("="*60)

# Calculate and display OOF scores
for i, tgt in enumerate(target_cols):
    # FIXED: Calculate RMSE manually
    oof_rmse = np.sqrt(mean_squared_error(train[tgt], oof_preds[:, i]))

    # Calculate R² score
    ss_res = np.sum((train[tgt] - oof_preds[:, i]) ** 2)
    ss_tot = np.sum((train[tgt] - np.mean(train[tgt])) ** 2)
    r2_score = 1 - (ss_res / ss_tot)

    print(f"{tgt:<20}: RMSE = {oof_rmse:.4f}, R² = {r2_score:.4f}")

# -------------------------------
# 5. Save submission with validation
# -------------------------------
subm = pd.DataFrame(test_preds, columns=target_cols)
subm.insert(0, "ID", test["ID"])

# Basic validation of predictions
print(f"\nPrediction validation:")
print(f"Predictions shape: {subm.shape}")
print(f"No missing values: {not subm.isnull().any().any()}")
print(f"No infinite values: {not np.isinf(subm.select_dtypes(include=[np.number])).any().any()}")

# Display prediction statistics
print(f"\nPrediction statistics:")
for target in target_cols:
    pred_stats = subm[target].describe()
    print(f"{target}: min={pred_stats['min']:.4f}, max={pred_stats['max']:.4f}, mean={pred_stats['mean']:.4f}")

# Save results
subm.to_csv(OUT_PATH, index=False)
print(f"\nSubmission saved to {OUT_PATH}")

# Performance summary - FIXED: Calculate RMSE manually
overall_rmse = np.mean([np.sqrt(mean_squared_error(train[tgt], oof_preds[:, i]))
                       for i, tgt in enumerate(target_cols)])
print(f"\nOverall CV RMSE: {overall_rmse:.4f}")
print(f"Model used {len(feature_cols)} features")
print(f"Training completed with {len(SEED_LIST)} seeds and {N_SPLITS}-fold CV")

Training data shape: (2000, 65)
Test data shape: (500, 56)
Target columns: 10
Feature columns: 55
Creating features from 5 fractions, 10 Property1 cols
Creating features from 5 fractions, 10 Property1 cols
Total features after engineering: 92
Starting cross-validation training...

Training with seed 0...
  Training BlendProperty1...
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2146]	valid_0's rmse: 0.136924
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
    Seed 0 BlendProperty1: 0.1322 RMSE
  Training BlendProperty2...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2135]	valid_0's rmse: 0.195879
Training until validation scores don't improve for 100 rounds
Training until validation sco

In [None]:
"""
predict_blends_lgbm.py
Score‑oriented pipeline for BlendProperty prediction.
▶ Requires: lightgbm, pandas, numpy, scikit‑learn
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
!pip install lightgbm
!pip install --upgrade lightgbm
import lightgbm as lgb
from lightgbm import LGBMRegressor

# -------------------------------
# 1. Load data
# -------------------------------
TRAIN_PATH = "/bin/testing1/train.csv"
TEST_PATH = "/bin/testing1/test.csv"
OUT_PATH = "/bin/testing1/predicted_solution.csv"


train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)

target_cols  = [c for c in train.columns if c.startswith("BlendProperty")]
feature_cols = [c for c in train.columns if c not in target_cols + ["ID"]]

X = train[feature_cols]
X_test = test[feature_cols]

# -------------------------------
# 2. Basic feature engineering
#    (fast but effective)
# -------------------------------
def add_interactions(df):
    frac_cols     = [c for c in df.columns if "fraction"  in c]
    prop1_cols    = [c for c in df.columns if "Property1" in c]
    # Pairwise products of fractions
    for i, c1 in enumerate(frac_cols):
        for c2 in frac_cols[i+1:]:
            df[f"{c1}*{c2}"] = df[c1] * df[c2]
    # Product of fraction and its Property1
    for fc, pc in zip(frac_cols, prop1_cols):
        df[f"{fc}x{pc}"] = df[fc] * df[pc]
    return df

X       = add_interactions(X.copy())
X_test  = add_interactions(X_test.copy())
feature_cols = X.columns  # update to include new features

# -------------------------------
# 3. Cross‑validated training
# -------------------------------
N_SPLITS   = 5
SEED_LIST  = [0, 1, 2]      # ensembling seeds
oof_preds  = np.zeros((len(train), len(target_cols)))
test_preds = np.zeros((len(test),  len(target_cols)))

for seed in SEED_LIST:
    kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=seed)
    for t_idx, target in enumerate(target_cols):
        fold_test_preds = np.zeros(len(test))
        for fold, (trn_idx, val_idx) in enumerate(kf.split(X)):
            model = LGBMRegressor(
                n_estimators=4000,
                learning_rate=0.015,
                num_leaves=255,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=seed + fold,
                objective="regression",
                metric="rmse",
                n_jobs=-1
            )

            model.fit(
                    X.iloc[trn_idx], train[target].iloc[trn_idx],
                    eval_set=[(X.iloc[val_idx], train[target].iloc[val_idx])],
                    eval_metric="rmse"
                )

            # Out‑of‑fold
            oof_preds[val_idx, t_idx] = model.predict(X.iloc[val_idx])
            # Accumulate test
            fold_test_preds += model.predict(X_test) / N_SPLITS
        test_preds[:, t_idx] += fold_test_preds / len(SEED_LIST)

# -------------------------------
# 4. Save submission
# -------------------------------
subm = pd.DataFrame(test_preds, columns=target_cols)
subm.insert(0, "ID", test["ID"])
subm.to_csv(OUT_PATH, index=False)
print(f"Submission saved to {OUT_PATH}")

# Optional: print OOF CV RMSE per target
from sklearn.metrics import mean_squared_error
for i, tgt in enumerate(target_cols):
    rmse = mean_squared_error(train[tgt], oof_preds[:, i], squared=False)
    print(f"{tgt:<15}: CV RMSE = {rmse:.4f}")


Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightgbm
  Attempting uninstall: lightgbm
    Found existing installation: lightgbm 4.5.0
    Uninstalling lightgbm-4.5.0:
      Successfully uninstalled lightgbm-4.5.0
Successfully installed lightgbm-4.6.0


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001548 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16759
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 70
[LightGBM] [Info] Start training from score -0.021847


In [None]:
"""
predict_blends_lgbm.py
Score‑oriented pipeline for BlendProperty prediction.
▶ Requires: lightgbm, pandas, numpy, scikit‑learn
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
!pip install lightgbm
!pip install --upgrade lightgbm
import lightgbm as lgb
from lightgbm import LGBMRegressor

# -------------------------------
# 1. Load data
# -------------------------------
TRAIN_PATH = "/bin/testing1/train.csv"
TEST_PATH = "/bin/testing1/test.csv"
OUT_PATH = "/bin/testing1/predicted_solution.csv"


train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)

target_cols  = [c for c in train.columns if c.startswith("BlendProperty")]
feature_cols = [c for c in train.columns if c not in target_cols + ["ID"]]

X = train[feature_cols]
X_test = test[feature_cols]

# -------------------------------
# 2. Basic feature engineering
#    (fast but effective)
# -------------------------------
def add_interactions(df):
    frac_cols     = [c for c in df.columns if "fraction"  in c]
    prop1_cols    = [c for c in df.columns if "Property1" in c]
    # Pairwise products of fractions
    for i, c1 in enumerate(frac_cols):
        for c2 in frac_cols[i+1:]:
            df[f"{c1}*{c2}"] = df[c1] * df[c2]
    # Product of fraction and its Property1
    for fc, pc in zip(frac_cols, prop1_cols):
        df[f"{fc}x{pc}"] = df[fc] * df[pc]
    return df

X       = add_interactions(X.copy())
X_test  = add_interactions(X_test.copy())
feature_cols = X.columns  # update to include new features

# -------------------------------
# 3. Cross‑validated training
# -------------------------------
N_SPLITS   = 5
SEED_LIST  = [0, 1, 2]      # ensembling seeds
oof_preds  = np.zeros((len(train), len(target_cols)))
test_preds = np.zeros((len(test),  len(target_cols)))

for seed in SEED_LIST:
    kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=seed)
    for t_idx, target in enumerate(target_cols):
        fold_test_preds = np.zeros(len(test))
        for fold, (trn_idx, val_idx) in enumerate(kf.split(X)):
            model = LGBMRegressor(
                n_estimators=4000,
                learning_rate=0.015,
                num_leaves=255,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=seed + fold,
                objective="regression",
                metric="rmse",
                n_jobs=-1
            )

            model.fit(
                    X.iloc[trn_idx], train[target].iloc[trn_idx],
                    eval_set=[(X.iloc[val_idx], train[target].iloc[val_idx])],
                    eval_metric="rmse"
                )

            # Out‑of‑fold
            oof_preds[val_idx, t_idx] = model.predict(X.iloc[val_idx])
            # Accumulate test
            fold_test_preds += model.predict(X_test) / N_SPLITS
        test_preds[:, t_idx] += fold_test_preds / len(SEED_LIST)

# -------------------------------
# 4. Save submission
# -------------------------------
subm = pd.DataFrame(test_preds, columns=target_cols)
subm.insert(0, "ID", test["ID"])
subm.to_csv(OUT_PATH, index=False)
print(f"Submission saved to {OUT_PATH}")

# # Optional: print OOF CV RMSE per target
# from sklearn.metrics import mean_squared_error
# for i, tgt in enumerate(target_cols):
#     rmse = mean_squared_error(train[tgt], oof_preds[:, i], squared=False)
#     print(f"{tgt:<15}: CV RMSE = {rmse:.4f}")


In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import optuna
import warnings
warnings.filterwarnings('ignore')

def load_and_preprocess_data():
    """Load and preprocess the data with basic feature engineering"""
    print("Loading data...")
    train = pd.read_csv('train.csv')
    test = pd.read_csv('test.csv')
    sample_solution = pd.read_csv('sample_solution.csv')

    # Basic data info
    print(f"Train shape: {train.shape}")
    print(f"Test shape: {test.shape}")
    print(f"Missing values in train: {train.isnull().sum().sum()}")
    print(f"Missing values in test: {test.isnull().sum().sum()}")

    return train, test, sample_solution

def feature_engineering(X_train, X_test):
    """Advanced feature engineering"""
    print("Performing feature engineering...")

    # Combine train and test for consistent preprocessing
    combined = pd.concat([X_train, X_test], axis=0, ignore_index=True)

    # Numerical features
    numerical_cols = combined.select_dtypes(include=[np.number]).columns.tolist()

    # Create interaction features for top numerical features
    if len(numerical_cols) >= 2:
        # Add polynomial features for top 5 numerical columns
        top_num_cols = numerical_cols[:5]
        for i, col1 in enumerate(top_num_cols):
            for col2 in top_num_cols[i+1:]:
                combined[f'{col1}_{col2}_interaction'] = combined[col1] * combined[col2]
                combined[f'{col1}_{col2}_ratio'] = combined[col1] / (combined[col2] + 1e-8)

    # Statistical features
    if len(numerical_cols) > 0:
        combined['num_features_sum'] = combined[numerical_cols].sum(axis=1)
        combined['num_features_mean'] = combined[numerical_cols].mean(axis=1)
        combined['num_features_std'] = combined[numerical_cols].std(axis=1)
        combined['num_features_skew'] = combined[numerical_cols].skew(axis=1)

    # Handle categorical features
    categorical_cols = combined.select_dtypes(include=['object', 'category']).columns.tolist()

    # Frequency encoding for categorical features
    for col in categorical_cols:
        freq_map = combined[col].value_counts().to_dict()
        combined[f'{col}_freq'] = combined[col].map(freq_map)

    # Target encoding placeholder (will be done during CV)
    target_encoded_cols = []
    for col in categorical_cols:
        target_encoded_cols.append(f'{col}_target_encoded')
        combined[f'{col}_target_encoded'] = 0  # Placeholder

    # Split back
    X_train_fe = combined.iloc[:len(X_train)].copy()
    X_test_fe = combined.iloc[len(X_train):].copy().reset_index(drop=True)

    return X_train_fe, X_test_fe, categorical_cols, target_encoded_cols

def target_encoding(X_train, y_train, X_val, categorical_cols, smoothing=10):
    """Apply target encoding with smoothing"""
    X_train_encoded = X_train.copy()
    X_val_encoded = X_val.copy()

    for col in categorical_cols:
        # Calculate global mean
        global_mean = y_train.mean()

        # Calculate category means and counts
        category_stats = y_train.groupby(X_train[col]).agg(['mean', 'count'])

        # Apply smoothing
        smoothed_means = (category_stats['mean'] * category_stats['count'] +
                         global_mean * smoothing) / (category_stats['count'] + smoothing)

        # Map to train and validation
        X_train_encoded[f'{col}_target_encoded'] = X_train[col].map(smoothed_means).fillna(global_mean)
        X_val_encoded[f'{col}_target_encoded'] = X_val[col].map(smoothed_means).fillna(global_mean)

    return X_train_encoded, X_val_encoded

def optimize_hyperparameters(X, y, categorical_cols, n_trials=100):
    """Optimize hyperparameters using Optuna"""
    print("Optimizing hyperparameters...")

    def objective(trial):
        params = {
            'iterations': trial.suggest_int('iterations', 500, 2000),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'depth': trial.suggest_int('depth', 4, 10),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
            'border_count': trial.suggest_int('border_count', 32, 255),
            'random_seed': 42,
            'verbose': False
        }

        model = CatBoostClassifier(**params)

        # Cross-validation
        cv_scores = cross_val_score(
            model, X, y,
            cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
            scoring='accuracy',
            n_jobs=-1
        )

        return cv_scores.mean()

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)

    print(f"Best score: {study.best_value:.4f}")
    print(f"Best params: {study.best_params}")

    return study.best_params

def train_model_with_cv(X, y, categorical_cols, best_params, n_splits=5):
    """Train model with cross-validation and ensemble"""
    print("Training model with cross-validation...")

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    models = []
    cv_scores = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"Training fold {fold + 1}/{n_splits}")

        X_train_fold = X.iloc[train_idx]
        X_val_fold = X.iloc[val_idx]
        y_train_fold = y.iloc[train_idx]
        y_val_fold = y.iloc[val_idx]

        # Apply target encoding
        X_train_encoded, X_val_encoded = target_encoding(
            X_train_fold, y_train_fold, X_val_fold, categorical_cols
        )

        # Train model
        model = CatBoostClassifier(**best_params, verbose=False)
        model.fit(
            X_train_encoded, y_train_fold,
            eval_set=(X_val_encoded, y_val_fold),
            cat_features=categorical_cols,
            early_stopping_rounds=50,
            plot=False
        )

        # Validate
        val_preds = model.predict(X_val_encoded)
        accuracy = accuracy_score(y_val_fold, val_preds)
        cv_scores.append(accuracy)
        models.append(model)

        print(f"Fold {fold + 1} accuracy: {accuracy:.4f}")

    print(f"Average CV accuracy: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores) * 2:.4f})")

    return models, cv_scores

def make_ensemble_predictions(models, X_test, categorical_cols):
    """Make ensemble predictions"""
    print("Making ensemble predictions...")

    all_predictions = []

    for i, model in enumerate(models):
        # For test predictions, we need to handle target encoding
        # Using global mean as fallback for unseen categories
        X_test_encoded = X_test.copy()

        # Set target encoded features to 0 (will be handled by model)
        for col in categorical_cols:
            if f'{col}_target_encoded' in X_test_encoded.columns:
                X_test_encoded[f'{col}_target_encoded'] = 0

        preds = model.predict(X_test_encoded)
        all_predictions.append(preds)

    # Ensemble by majority voting
    ensemble_preds = np.round(np.mean(all_predictions, axis=0)).astype(int)

    return ensemble_preds

def main():
    # Load and preprocess data
    train, test, sample_solution = load_and_preprocess_data()

    # Prepare features and target
    target_col = 'target'  # Adjust if different
    if target_col not in train.columns:
        # Try to find target column
        possible_targets = ['target', 'label', 'class', 'y']
        for col in possible_targets:
            if col in train.columns:
                target_col = col
                break

    X = train.drop(columns=[target_col])
    y = train[target_col]

    # Feature engineering
    X_fe, test_fe, categorical_cols, target_encoded_cols = feature_engineering(X, test)

    print(f"Features after engineering: {X_fe.shape[1]}")
    print(f"Categorical features: {len(categorical_cols)}")

    # Optimize hyperparameters
    best_params = optimize_hyperparameters(X_fe, y, categorical_cols, n_trials=50)

    # Train with cross-validation
    models, cv_scores = train_model_with_cv(X_fe, y, categorical_cols, best_params)

    # Make ensemble predictions
    test_preds = make_ensemble_predictions(models, test_fe, categorical_cols)

    # Prepare submission
    submission = sample_solution.copy()
    submission.iloc[:, 1] = test_preds

    # Save submission
    submission.to_csv('optimized_catboost_submission.csv', index=False)
    print("Submission saved as optimized_catboost_submission.csv")

    # Feature importance from the first model
    feature_importance = models[0].get_feature_importance()
    feature_names = X_fe.columns

    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': feature_importance
    }).sort_values('importance', ascending=False)

    print("\nTop 10 most important features:")
    print(importance_df.head(10))

    return models, cv_scores, importance_df

if __name__ == "__main__":
    models, cv_scores, importance_df = main()