In [1]:
# Install sentence-transformers and lightgbm via pip; --quiet suppresses most of pip's output.
# NOTE(review): no versions are pinned here — consider pinning for reproducible runs.
!pip install sentence-transformers lightgbm --quiet

In [None]:
import pandas as pd
import numpy as np
import os
import sys
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# --- Path Configuration for Kaggle ---

# Add the directory containing your helper scripts to the Python path
sys.path.append('/kaggle/input/helpers')

# Now you can import your custom classes
from cat_transform import CatalogDataTransformer
from catalog_feature import CatalogFeatureExtractor
from img_transform import process_images_in_parallel, transform_pipeline
from img_feature import CLIPImageFeatureExtractor

# Read-only inputs: train and test CSVs sit side by side in one dataset folder
DATASET_DIR = "/kaggle/input/amazon-ml-data/student_resource/dataset"
TRAIN_CSV_PATH = f"{DATASET_DIR}/train.csv"
TEST_CSV_PATH = f"{DATASET_DIR}/test.csv"

# Writable locations: everything this notebook produces goes under the working dir
KAGGLE_WORKING_DIR = "/kaggle/working"
ALL_IMAGES_DIR, TRANSFORMED_DATA_DIR = (
    os.path.join(KAGGLE_WORKING_DIR, subdir)
    for subdir in ('all_processed_images', 'transformed_data')
)

2025-10-12 10:47:13.743640: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760266033.900217     310 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760266033.950121     310 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# -- 1. Load and Combine Datasets --
print("Step 1: Loading and combining train and test data...")
train_df = pd.read_csv(TRAIN_CSV_PATH)
test_df = pd.read_csv(TEST_CSV_PATH)

# Keep the target and the id columns aside; later cells use them to split the
# combined frame back apart and to build the submission.
y_train_raw = train_df['price']
train_ids, test_ids = train_df['sample_id'], test_df['sample_id']

# Stack train (without its target column) on top of test so both go through the
# same feature pipeline; ignore_index gives the result a fresh 0..N-1 index.
train_without_target = train_df.drop(columns=['price'])
all_df = pd.concat([train_without_target, test_df], ignore_index=True)

print(f"Combined DataFrame shape: {all_df.shape}")

Step 1: Loading and combining train and test data...
Combined DataFrame shape: (150000, 3)


In [None]:
# -- 2. Process All Images Simultaneously --
print("\nStep 2: Downloading and transforming all images...")
links_to_fetch = all_df['image_link'].dropna().tolist()  # rows without a link are skipped
process_images_in_parallel(
    image_links=links_to_fetch,
    download_folder=ALL_IMAGES_DIR,
    transform=transform_pipeline
)

# Expected local path of each image: <ALL_IMAGES_DIR>/<last URL path component>.
# Rows whose link is not a string (e.g. missing) get None.
all_df['image_path'] = [
    os.path.join(ALL_IMAGES_DIR, os.path.basename(link)) if isinstance(link, str) else None
    for link in all_df['image_link']
]

# Verify if images actually exist
existing_images = all_df['image_path'].map(lambda path: bool(path) and os.path.exists(path))
print(f"Found {existing_images.sum()} existing images out of {len(all_df)} total rows")


Found 149998 existing images out of 150000 total rows


In [5]:
# -- 3. Extract Catalog & Text Features --
print("\nStep 3: Processing catalog content and extracting text features...")
catalog_transformer = CatalogDataTransformer()
text_feature_extractor = CatalogFeatureExtractor()

# The transformer is fitted on the training frame only (to avoid leaking test
# information), then applied to the combined train+test frame.
catalog_transformer.fit(train_df)
processed_all_df = catalog_transformer.transform(all_df)

# Build the text features and drop the raw 'all_text' column in one pass,
# since it is not needed once the features exist.
catalog_features_df = (
    text_feature_extractor
    .extract_features(processed_all_df)
    .drop(columns=['all_text'])
)



Step 3: Processing catalog content and extracting text features...
Loading SentenceTransformer model: all-MiniLM-L6-v2...




Model loaded successfully.
Fitting CatalogDataTransformer on training data...
Learned median 'Value' for imputation: 16.0
Transforming catalog data... (DataFrame shape: (150000, 4))


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  transformed_df['Value'].fillna(self.median_value, inplace=True)


Transformation complete.
Extracting text features...
Generating text embeddings... (This may take a moment)


Batches:   0%|          | 0/4688 [00:00<?, ?it/s]

Text embeddings generated.
Feature extraction complete. Final DataFrame shape: (150000, 398)


In [6]:
# -- 4. Extract Image Features (Memory-Efficient Version) --
import gc

print("\nStep 4: Extracting image features using CLIP...")
clip_extractor = CLIPImageFeatureExtractor()

# Rows are fed to the extractor in fixed-size slices rather than all at once
CHUNK_SIZE = 25000
total_rows = len(all_df)
num_chunks = int(np.ceil(total_rows / CHUNK_SIZE))
all_image_features = []  # one feature DataFrame per processed chunk

print(f"Processing {total_rows} images in {num_chunks} chunks of size {CHUNK_SIZE}...")

for chunk_no, start_idx in enumerate(range(0, total_rows, CHUNK_SIZE), start=1):
    end_idx = min(start_idx + CHUNK_SIZE, total_rows)  # last chunk may be shorter

    print(f"--- Processing chunk {chunk_no}/{num_chunks} (rows {start_idx} to {end_idx-1}) ---")

    all_image_features.append(
        clip_extractor.extract_features(
            all_df.iloc[start_idx:end_idx],
            image_path_col='image_path',
        )
    )

    # Optional: clean up memory between chunks
    gc.collect()

print("\nCombining features from all chunks...")
image_features_df = pd.concat(all_image_features, ignore_index=True)

# Keep only the merge key plus the columns whose name contains 'img_feat'
image_feature_cols = [
    'sample_id',
    *(col for col in image_features_df.columns if 'img_feat' in col),
]
image_features_df = image_features_df[image_feature_cols]

print("✅ Image feature extraction complete.")


Step 4: Extracting image features using CLIP...
Loading CLIP model: clip-ViT-B-32...


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


CLIP model loaded successfully.
Processing 150000 images in 6 chunks of size 25000...
--- Processing chunk 1/6 (rows 0 to 24999) ---
Preparing images for feature extraction...


Loading images: 100%|██████████| 25000/25000 [01:54<00:00, 218.74it/s]


Encoding images to feature vectors...





Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Image feature extraction complete. Shape: (25000, 513)
--- Processing chunk 2/6 (rows 25000 to 49999) ---
Preparing images for feature extraction...


Loading images: 100%|██████████| 25000/25000 [01:50<00:00, 226.94it/s]


Encoding images to feature vectors...





Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Image feature extraction complete. Shape: (25000, 513)
--- Processing chunk 3/6 (rows 50000 to 74999) ---
Preparing images for feature extraction...


Loading images: 100%|██████████| 25000/25000 [01:50<00:00, 227.20it/s]


Encoding images to feature vectors...





Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Image feature extraction complete. Shape: (25000, 513)
--- Processing chunk 4/6 (rows 75000 to 99999) ---
Preparing images for feature extraction...


Loading images: 100%|██████████| 25000/25000 [01:51<00:00, 224.60it/s]



Encoding images to feature vectors...


Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Image feature extraction complete. Shape: (25000, 513)
--- Processing chunk 5/6 (rows 100000 to 124999) ---
Preparing images for feature extraction...


Loading images: 100%|██████████| 25000/25000 [01:51<00:00, 224.16it/s]


Encoding images to feature vectors...





Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Image feature extraction complete. Shape: (25000, 513)
--- Processing chunk 6/6 (rows 125000 to 149999) ---
Preparing images for feature extraction...


Loading images: 100%|██████████| 25000/25000 [01:52<00:00, 221.39it/s]


Encoding images to feature vectors...





Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Image feature extraction complete. Shape: (25000, 513)

Combining features from all chunks...
✅ Image feature extraction complete.


In [7]:
# -- 5. Merge, Scale, and Finalize Features --
print("\nStep 5: Merging all features into a final matrix...")
# Merge catalog and image features on sample_id.
# validate='one_to_one' makes pandas raise a MergeError if sample_id is duplicated
# on either side; without it a duplicate key would silently multiply rows and
# break the positional train/test split and target alignment done later.
final_features_df = pd.merge(
    catalog_features_df,
    image_features_df,
    on='sample_id',
    how='left',
    validate='one_to_one',
)
assert len(final_features_df) == len(catalog_features_df), (
    f"Merge changed the row count: {len(catalog_features_df)} -> {len(final_features_df)}"
)

# Rows with no matching image features come out of the left merge as NaN; fill with 0
img_cols = [col for col in final_features_df.columns if 'img_feat' in col]
final_features_df[img_cols] = final_features_df[img_cols].fillna(0)



Step 5: Merging all features into a final matrix...


In [8]:
# -- 6. Split back into Train and Test sets --
print("\nStep 6: Splitting combined data back into Train and Test sets...")
# Rows whose sample_id appears in the training ids form the train split; the rest are test
train_mask = final_features_df['sample_id'].isin(train_ids)
X = final_features_df.drop(columns=['sample_id'])

X_train = X[train_mask].reset_index(drop=True)
X_test = X[~train_mask].reset_index(drop=True)
y_train = np.log1p(y_train_raw) # Apply log transform to the target

# Store test_ids for final submission (from the original test_df)
test_ids_final = test_df['sample_id'].values

# X_train is paired with y_train, and X_test with the submission ids, purely by
# row position. Verify that the split rows are in exactly the same sample_id
# order as the original train/test frames; a mismatch (reordered rows, ids shared
# between train and test, dropped rows) would otherwise train on wrong targets
# or mislabel predictions without any error.
split_train_ids = final_features_df.loc[train_mask, 'sample_id'].to_numpy()
split_test_ids = final_features_df.loc[~train_mask, 'sample_id'].to_numpy()
assert np.array_equal(split_train_ids, train_ids.to_numpy()), (
    "Train feature rows are not aligned with train_df['sample_id'] / y_train"
)
assert np.array_equal(split_test_ids, test_ids_final), (
    "Test feature rows are not aligned with test_df['sample_id']"
)


Step 6: Splitting combined data back into Train and Test sets...


In [9]:
# -- 7. Scale Numerical Features --
print("\nStep 7: Applying StandardScaler...")
columns_to_scale = ['Value', 'IPQ']

# Learn the scaling statistics from the training rows only, then apply the
# same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train[columns_to_scale])
X_train[columns_to_scale] = scaler.transform(X_train[columns_to_scale])
X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])


# -- Final Verification --
print("\n--- Feature Extraction Complete ---")
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of X_test: {X_test.shape}")

# Total count of missing cells in each feature matrix
train_nan_total = X_train.isna().sum().sum()
test_nan_total = X_test.isna().sum().sum()
print(f"NaN values in X_train: {train_nan_total}")
print(f"NaN values in X_test: {test_nan_total}")

# Both matrices must expose the same number of feature columns
n_train_cols, n_test_cols = X_train.shape[1], X_test.shape[1]
assert n_train_cols == n_test_cols, f"CRITICAL ERROR: Train ({n_train_cols}) and test ({n_test_cols}) sets have different feature dimensions!"
print("\n✅ Success! X_train and X_test have the same feature dimensions.")


Step 7: Applying StandardScaler...

--- Feature Extraction Complete ---
Shape of X_train: (75000, 908)
Shape of y_train: (75000,)
Shape of X_test: (75000, 908)
NaN values in X_train: 0
NaN values in X_test: 0

✅ Success! X_train and X_test have the same feature dimensions.


In [10]:
import os

# Directory for the processed feature matrices. Reuse the absolute path from the
# configuration cell instead of a second, CWD-relative 'transformed_data' literal,
# so the output location does not depend on the kernel's current directory.
SAVE_DIR = TRANSFORMED_DATA_DIR

# Create the directory if it doesn't exist
os.makedirs(SAVE_DIR, exist_ok=True)
print(f"Directory '{SAVE_DIR}' created or already exists.")

# Define file paths
x_train_path = os.path.join(SAVE_DIR, 'X_train.parquet')
y_train_path = os.path.join(SAVE_DIR, 'y_train.parquet')
x_test_path = os.path.join(SAVE_DIR, 'X_test.parquet')

# --- Save the DataFrames to Parquet files ---

# Save X_train
print(f"Saving X_train to '{x_train_path}'...")
X_train.to_parquet(x_train_path)

# Save y_train (a Series cannot be written directly, so wrap it in a
# single-column DataFrame named 'price_log1p')
print(f"Saving y_train to '{y_train_path}'...")
y_train.to_frame(name='price_log1p').to_parquet(y_train_path)

# Save X_test
print(f"Saving X_test to '{x_test_path}'...")
X_test.to_parquet(x_test_path)

print("\n✅ All processed data has been successfully saved.")

Directory 'transformed_data' created or already exists.
Saving X_train to 'transformed_data/X_train.parquet'...
Saving y_train to 'transformed_data/y_train.parquet'...
Saving X_test to 'transformed_data/X_test.parquet'...

✅ All processed data has been successfully saved.


In [11]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import os

In [None]:
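# (Optional) Uncomment this cell to resume from the parquet files written by the
# save step instead of recomputing all features.
# NOTE(review): DATA_DIR must point at the directory the save step wrote to, and the
# test.csv path below is a Colab-style "/content/..." path while the rest of this
# notebook reads from "/kaggle/input/..." (see TEST_CSV_PATH) — update both before
# uncommenting on Kaggle.
# # -- 1. Load the Processed Data --
# print("Loading pre-processed data...")
# DATA_DIR = 'transformed_data'
# X_train = pd.read_parquet(os.path.join(DATA_DIR, 'X_train.parquet'))
# y_train = pd.read_parquet(os.path.join(DATA_DIR, 'y_train.parquet')).squeeze()
# X_test = pd.read_parquet(os.path.join(DATA_DIR, 'X_test.parquet'))
# test_ids = pd.read_csv("/content/student_resource/dataset/test.csv")['sample_id']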



In [16]:
# -- 2. Set up the Model and Cross-Validation --
print("\nSetting up LightGBM model and 5-Fold Cross-Validation...")

# LightGBM hyper-parameters (candidates for tuning)
params = dict(
    objective='regression_l1',   # L1 / MAE loss, often more robust to outliers
    metric='rmse',               # evaluation metric reported on the validation set
    device='gpu',
    n_estimators=20000,          # upper bound only; early stopping ends training sooner
    learning_rate=0.01,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    lambda_l1=0.1,
    lambda_l2=0.1,
    num_leaves=31,
    verbose=-1,
    n_jobs=-1,                   # use all available cores
    seed=42,
    boosting_type='gbdt',
)

# Shuffled K-Fold with a fixed seed so the fold assignment is repeatable
N_SPLITS = 5
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Prediction buffers: one out-of-fold value per training row, one averaged
# value per test row, plus the per-fold validation scores
oof_predictions = np.zeros(len(X_train))
test_predictions = np.zeros(len(X_test))
fold_scores = list()



Setting up LightGBM model and 5-Fold Cross-Validation...


In [18]:
# --- Convert data for optimal GPU performance ---
# Cast features and target to float32 before training.
# (A leftover stub that printed "Starting training..." was removed from this
# cell: no training happens here, and the training-loop cell prints that
# message itself.)
print("Converting data to float32 for GPU efficiency...")
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)

# Keep the target in the same floating-point precision as the features
y_train = y_train.astype(np.float32)

print("✅ Data conversion complete.")

Converting data to float32 for GPU efficiency...
✅ Data conversion complete.
Starting training with 5-fold CV...


In [19]:
# -- 3. Run the Cross-Validation Loop --
print(f"Starting training with {N_SPLITS}-fold CV...")

# Reset the accumulators here rather than relying on the setup cell:
# test_predictions is built with `+=` and fold_scores with `.append`, so
# re-running this cell on its own would otherwise add a second set of fold
# predictions on top of the first and mix scores from different runs.
oof_predictions = np.zeros(X_train.shape[0])
test_predictions = np.zeros(X_test.shape[0])
fold_scores = []

for fold, (train_index, val_index) in enumerate(kf.split(X_train, y_train)):
    print(f"--- Fold {fold+1}/{N_SPLITS} ---")

    # Split the data for this fold (positional indices from KFold)
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Train a fresh model; stop once validation RMSE has not improved for 100 rounds
    model = lgb.LGBMRegressor(**params)
    model.fit(X_train_fold, y_train_fold,
              eval_set=[(X_val_fold, y_val_fold)],
              eval_metric='rmse',
              callbacks=[lgb.early_stopping(100, verbose=False)])

    # Out-of-fold predictions go to the held-out rows; test predictions are
    # accumulated as an equal-weight average over the folds
    val_preds = model.predict(X_val_fold)
    oof_predictions[val_index] = val_preds
    test_predictions += model.predict(X_test) / N_SPLITS # Average predictions over folds

    # Evaluate fold performance (RMSE on the log1p-transformed target)
    fold_rmse = np.sqrt(mean_squared_error(y_val_fold, val_preds))
    fold_scores.append(fold_rmse)
    print(f"Fold {fold+1} RMSE: {fold_rmse:.5f}")

Starting training with 5-fold CV...
--- Fold 1/5 ---
Fold 1 RMSE: 0.71257
--- Fold 2/5 ---
Fold 2 RMSE: 0.69339
--- Fold 3/5 ---
Fold 3 RMSE: 0.69236
--- Fold 4/5 ---
Fold 4 RMSE: 0.67930
--- Fold 5/5 ---
Fold 5 RMSE: 0.69258


In [20]:
# -- 4. Evaluate Overall Performance and Create Submission --
print("\n--- Training Complete ---")
mean_cv_score = np.mean(fold_scores)
print(f"Average CV RMSE across all folds: {mean_cv_score:.5f}")

# Inverse transform the predictions from log scale back to price scale.
# expm1 of a negative log-scale prediction yields a negative price, which is not
# a meaningful value, so clip the result at 0.
# NOTE(review): confirm whether the competition requires strictly positive prices;
# if so, clip to a small positive floor instead of 0.
final_predictions = np.clip(np.expm1(test_predictions), 0, None)

# Create submission file
submission_df = pd.DataFrame({'sample_id': test_ids, 'price': final_predictions})
submission_df.to_csv('submission.csv', index=False)

print("\n✅ Submission file 'submission.csv' has been created successfully!")
print(f"Submission shape: {submission_df.shape}")
print("Submission file head:")
print(submission_df.head())


--- Training Complete ---
Average CV RMSE across all folds: 0.69404

✅ Submission file 'submission.csv' has been created successfully!
Submission shape: (75000, 2)
Submission file head:
   sample_id      price
0     100179  15.564337
1     245611  18.533813
2     146263  18.395008
3      95658   8.395778
4      36806  24.383568


In [21]:
# Re-write the submission built by the previous cell (submission_df must already
# exist, i.e. the model has been trained and predictions have been made).

# Output filename for the submission
submission_filename = 'submission.csv'

# index=False keeps pandas from emitting the DataFrame index as an extra column
submission_df.to_csv(path_or_buf=submission_filename, index=False)

print(f"✅ Submission file saved successfully as '{submission_filename}'!")

✅ Submission file saved successfully as 'submission.csv'!


In [22]:
# Sanity-check the submission frame: row count, column dtypes and non-null counts
submission_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75000 entries, 0 to 74999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sample_id  75000 non-null  int64  
 1   price      75000 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 1.1 MB
