In [5]:
!pip install optuna --quiet

import os
import subprocess
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error
import zipfile
import optuna

def run_shell_command(command):
    try:
        print(f"Executing: {command}")
        result = subprocess.run(command, shell=True, check=True, capture_output=True, text=True)
        print(result.stdout)
        if result.stderr:
            print(f"Stderr: {result.stderr}")
    except subprocess.CalledProcessError as e:
        print(f"Error executing command: {command}")
        print(f"Return code: {e.returncode}")
        print(f"Output: {e.output}")
        print(f"Stderr: {e.stderr}")
        raise

def setup_and_download_kaggle_data():
    kaggle_json_path = 'kaggle.json'
    if not os.path.exists(kaggle_json_path):
        print(f"'{kaggle_json_path}' not found. Please upload your Kaggle API token file.")
        try:
            from google.colab import files
            uploaded = files.upload()
            if kaggle_json_path not in uploaded:
                print(f"Upload failed or '{kaggle_json_path}' not uploaded. Please try again.")
                return False
            print(f"'{kaggle_json_path}' uploaded successfully.")
        except ImportError:
            print("Not in Colab environment or file upload failed. Make sure 'kaggle.json' is in the current directory.")
            return False

    print("Configuring Kaggle API...")
    os.makedirs(os.path.expanduser('~/.kaggle'), exist_ok=True)
    run_shell_command(f'cp {kaggle_json_path} {os.path.expanduser("~/.kaggle/")}')
    run_shell_command(f'chmod 600 {os.path.expanduser("~/.kaggle/kaggle.json")}')

    competition_name = 'weekly-ml-challenge-2'
    data_dir = 'crab_data'
    os.makedirs(data_dir, exist_ok=True)

    print(f"Downloading data for competition: {competition_name}")
    try:
        run_shell_command(f'kaggle competitions download -c {competition_name} -p {data_dir} --force')
    except Exception as e:
        print(f"Failed to download data. Check if you've accepted the competition rules on Kaggle: https://www.kaggle.com/competitions/{competition_name}/rules")
        raise e

    zip_file_path = os.path.join(data_dir, f'{competition_name}.zip')
    if os.path.exists(zip_file_path):
        print(f"Unzipping {zip_file_path}...")
        try:
            with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
                zip_ref.extractall(data_dir)
            print("Unzipping complete.")
        except zipfile.BadZipFile:
            print(f"Error: The downloaded file {zip_file_path} is not a valid zip file or is corrupted.")
            return False
        except Exception as e:
            print(f"An error occurred during unzipping: {e}")
            return False
    else:
        print(f"Error: Zip file {zip_file_path} not found after download attempt.")
        return False
    return True

def engineer_features_combined(df_input):
    df = df_input.copy()

    length_col = 'Length'
    diameter_col = 'Diameter'
    height_col = 'Height'
    whole_weight_col = 'Weight'
    shucked_weight_col = 'Shucked Weight'
    viscera_weight_col = 'Viscera Weight'
    shell_weight_col = 'Shell Weight'

    original_measurement_cols = [length_col, diameter_col, height_col,
                                 whole_weight_col, shucked_weight_col,
                                 viscera_weight_col, shell_weight_col]

    missing_cols = [col for col in original_measurement_cols if col not in df.columns]
    if missing_cols:
        raise KeyError(f"Feature Engineering Error: Missing expected original columns: {missing_cols}. "
                       f"Available columns are: {df.columns.tolist()}. "
                       f"Please check the column name definitions at the start of 'engineer_features_combined'.")

    df['Approx_Volume'] = df[length_col] * df[diameter_col] * df[height_col]
    df['Approx_Density'] = df[whole_weight_col] / (df['Approx_Volume'] + 1e-6)
    df['Length_to_Diameter'] = df[length_col] / (df[diameter_col] + 1e-6)
    df['Height_to_Diameter'] = df[height_col] / (df[diameter_col] + 1e-6)
    df['Meat_Ratio'] = df[shucked_weight_col] / (df[whole_weight_col] + 1e-6)
    df['Viscera_to_Whole_Weight'] = df[viscera_weight_col] / (df[whole_weight_col] + 1e-6)
    df['Shell_to_Whole_Weight'] = df[shell_weight_col] / (df[whole_weight_col] + 1e-6)
    df['Shell_Thickness_Proxy'] = df[shell_weight_col] / (df['Approx_Volume'] + 1e-6)
    df['Non_Meat_Weight'] = df[whole_weight_col] - df[shucked_weight_col]
    df['Shell_Est_Weight'] = df[whole_weight_col] - df[shucked_weight_col] - df[viscera_weight_col]
    df['BMI_like'] = df[whole_weight_col] / (df[height_col]**2 + 1e-6)
    df['Sum_Internal_Weights'] = df[shucked_weight_col] + df[viscera_weight_col]

    for col_name in original_measurement_cols:
        df[f'{col_name}_sq'] = df[col_name]**2

    df['Length_Height_Interaction'] = df[length_col] * df[height_col]
    df['Weight_Density_Interaction'] = df[whole_weight_col] * df['Approx_Density']

    key_features_for_poly = [length_col, diameter_col, height_col, whole_weight_col]

    missing_poly_keys = [col for col in key_features_for_poly if col not in df.columns]
    if missing_poly_keys:
        raise KeyError(f"Feature Engineering Error (Polynomial): Missing keys for polynomial features: {missing_poly_keys}. "
                       f"Available columns: {df.columns.tolist()}")

    poly_transformer = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
    poly_features_array = poly_transformer.fit_transform(df[key_features_for_poly])
    poly_feature_names = poly_transformer.get_feature_names_out(input_features=key_features_for_poly)
    new_poly_features_df = pd.DataFrame(poly_features_array, columns=poly_feature_names, index=df.index)
    df = pd.concat([df, new_poly_features_df], axis=1)
    df = df.loc[:,~df.columns.duplicated(keep='first')]

    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    for col in df.columns:
        if df[col].isnull().any():
            df[col] = df[col].fillna(df[col].median())
            if df[col].isnull().any():
                 df[col] = df[col].fillna(0)
    return df

X_train_for_optuna = None
y_train_for_optuna = None

def objective(trial):
    global X_train_for_optuna, y_train_for_optuna

    params = {
        'objective': 'reg:squarederror',
        'n_estimators': trial.suggest_int('n_estimators', 200, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 5.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 5.0, log=True),
        'random_state': 42,
        'tree_method': 'gpu_hist',
        'device': 'cuda'
    }

    model = xgb.XGBRegressor(**params)
    cv_strategy = KFold(n_splits=5, shuffle=True, random_state=trial.number + 42)
    scores = cross_val_score(model, X_train_for_optuna, y_train_for_optuna, cv=cv_strategy,
                            scoring='neg_root_mean_squared_error', n_jobs=-1)
    return -scores.mean()

def run_crab_age_prediction_optuna():
    global X_train_for_optuna, y_train_for_optuna

    data_dir = 'crab_data'
    train_file = os.path.join(data_dir, 'train.csv')
    test_file = os.path.join(data_dir, 'test.csv')
    sample_submission_file = os.path.join(data_dir, 'sample_submission.csv')

    if not all(os.path.exists(f) for f in [train_file, test_file, sample_submission_file]):
        print("One or more data files not found in 'crab_data' directory.")
        return

    print("Loading data...")
    train_df_orig = pd.read_csv(train_file)
    test_df_orig = pd.read_csv(test_file)

    print("Original train_df columns:", train_df_orig.columns.tolist())
    print("Original test_df columns:", test_df_orig.columns.tolist())

    submission_df_template = pd.read_csv(sample_submission_file)
    test_ids = test_df_orig['id']

    print("Preprocessing and Feature Engineering...")
    y_train_for_optuna = train_df_orig['Age']

    X_train_raw = train_df_orig.drop(['id', 'Age'], axis=1, errors='ignore')
    X_test_raw = test_df_orig.drop('id', axis=1, errors='ignore')

    train_len = len(X_train_raw)
    combined_df_raw = pd.concat([X_train_raw, X_test_raw], ignore_index=True)

    combined_df_processed = pd.get_dummies(combined_df_raw, columns=['Sex'], prefix='Sex', dummy_na=False)
    combined_df_engineered = engineer_features_combined(combined_df_processed)

    X_train_engineered = combined_df_engineered.iloc[:train_len]
    X_test_engineered = combined_df_engineered.iloc[train_len:]

    train_cols = X_train_engineered.columns.tolist()
    X_test_engineered = X_test_engineered.reindex(columns=train_cols, fill_value=0)
    X_train_engineered = X_train_engineered[train_cols]

    X_train_for_optuna = X_train_engineered

    print("Engineered train data shape:", X_train_for_optuna.shape)
    print("Engineered test data shape:", X_test_engineered.shape)
    print(f"Number of features: {X_train_for_optuna.shape[1]}")

    print("Starting Hyperparameter Tuning with Optuna for XGBoost (GPU)...")
    optuna.logging.set_verbosity(optuna.logging.WARNING)

    study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=123))
    study.optimize(objective, n_trials=30)

    print("\nOptuna Study Statistics: ")
    print("  Number of finished trials: ", len(study.trials))
    print("  Best trial:")
    best_trial = study.best_trial
    print("    Value (RMSE): ", best_trial.value)
    print("    Params: ")
    for key, value in best_trial.params.items():
        print(f"      {key}: {value}")

    best_params = best_trial.params
    best_params['objective'] = 'reg:squarederror'
    best_params['random_state'] = 42
    best_params['tree_method'] = 'gpu_hist'
    best_params['device'] = 'cuda'

    print("\nTraining final XGBoost model with best parameters on full training data (GPU)...")
    final_xgb_model = xgb.XGBRegressor(**best_params)
    final_xgb_model.fit(X_train_for_optuna, y_train_for_optuna)

    print("Making predictions with the final model...")
    predictions_float = final_xgb_model.predict(X_test_engineered)

    min_target_observed = y_train_for_optuna.min()
    lower_bound_target = max(1, int(min_target_observed))

    predictions_clipped = np.clip(predictions_float, lower_bound_target, None)
    predictions_int = np.round(predictions_clipped).astype(int)

    print("Creating submission file...")
    submission_output_df = pd.DataFrame({
        'id': test_ids,
        'Yield': predictions_int
    })
    submission_output_df['id'] = submission_output_df['id'].astype(submission_df_template['id'].dtype)

    submission_file_path = 'submission_yield_gpu.csv'
    submission_output_df.to_csv(submission_file_path, index=False)
    print(f"Submission file created: {submission_file_path}")
    print(submission_output_df.head())

    try:
        from google.colab import files
        files.download(submission_file_path)
        print(f"'{submission_file_path}' prepared for download.")
    except ImportError:
        print(f"Not in Colab. '{submission_file_path}' is saved in the current directory.")

if __name__ == '__main__':
    if setup_and_download_kaggle_data():
        run_crab_age_prediction_optuna()
    else:
        print("Failed to setup Kaggle data. ML workflow aborted.")

Configuring Kaggle API...
Executing: cp kaggle.json /root/.kaggle/

Executing: chmod 600 /root/.kaggle/kaggle.json

Downloading data for competition: weekly-ml-challenge-2
Executing: kaggle competitions download -c weekly-ml-challenge-2 -p crab_data --force
Downloading weekly-ml-challenge-2.zip to crab_data


Stderr: 
  0%|          | 0.00/551k [00:00<?, ?B/s]
100%|██████████| 551k/551k [00:00<00:00, 739MB/s]

Unzipping crab_data/weekly-ml-challenge-2.zip...
Unzipping complete.
Loading data...
Original train_df columns: ['id', 'Sex', 'Length', 'Diameter', 'Height', 'Weight', 'Shucked Weight', 'Viscera Weight', 'Shell Weight', 'Age']
Original test_df columns: ['id', 'Sex', 'Length', 'Diameter', 'Height', 'Weight', 'Shucked Weight', 'Viscera Weight', 'Shell Weight']
Preprocessing and Feature Engineering...
Engineered train data shape: (15000, 37)
Engineered test data shape: (10000, 37)
Number of features: 37
Starting Hyperparameter Tuning with Optuna for XGBoost (GPU)...





Optuna Study Statistics: 
  Number of finished trials:  30
  Best trial:
    Value (RMSE):  1.9535981624401895
    Params: 
      n_estimators: 814
      learning_rate: 0.016080594505164823
      max_depth: 4
      subsample: 0.928925626790364
      colsample_bytree: 0.8798211836331428
      gamma: 3.6207922246733614e-07
      reg_alpha: 0.3809901992098999
      reg_lambda: 4.825132823630822e-07

Training final XGBoost model with best parameters on full training data (GPU)...



    E.g. tree_method = "hist", device = "cuda"



Making predictions with the final model...
Creating submission file...
Submission file created: submission_yield_gpu.csv
      id  Yield
0  15000     15
1  15001     11
2  15002      9
3  15003     10
4  15004     12



    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

'submission_yield_gpu.csv' prepared for download.
