In [1]:
import re
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
from category_encoders import TargetEncoder
import lightgbm as lgb
from tqdm import tqdm

# Compute device: the CUDA GPU when torch reports one as available, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Text Cleaning Function
def clean_text(text):
    """
    Normalize free-form text while preserving numeric values and measurement units.

    Steps, in order: lowercase; strip URLs and HTML tags; replace every character
    other than ASCII letters, digits, whitespace, ``.``, ``,``, ``'``, ``"`` and
    ``°`` with a space; rewrite inch / centimetre spellings that directly follow a
    digit to the compact forms ``<n>inch`` / ``<n>cm``; turn ``<digits>,<digits>``
    into ``<digits>.<digits>``; collapse runs of whitespace.

    Args:
        text (str): Raw text. Must be a ``str`` (``.lower()`` is called on it),
            so missing values have to be replaced before calling.

    Returns:
        str: The cleaned, lowercased text with single spaces and no
        leading/trailing whitespace.
    """
    # Convert to lowercase
    text = text.lower()

    # Remove URLs and HTML tags
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>', '', text)

    # Preserve numbers, decimals, and units while removing special characters
    # Allow: letters, numbers, spaces, ., ,, ", ', ° (for degree symbols)
    text = re.sub(r"[^a-zA-Z0-9\s.,'\"°]", ' ', text)

    # Standardize unit formats.
    # A bare "in" is only treated as a unit when it is not the start of a longer
    # word ("5 inside" must stay untouched). A following "x" is still allowed so
    # compact dimension strings such as "10inx5in" keep being normalized.
    text = re.sub(r'(\d)\s?(?:inches|inch|in(?![a-wyz])|")', r'\1inch', text)  # imperial
    # Singular and plural spellings are both folded into "cm".
    text = re.sub(r'(\d)\s?(?:centimeters?|centimetres?|cm)', r'\1cm', text)  # metric

    # Handle comma decimal separators (e.g., 20,5cm -> 20.5cm)
    # NOTE: this also rewrites thousands separators ("1,000" -> "1.000").
    text = re.sub(r'(\d+),(\d+)', r'\1.\2', text)

    # Collapse multiple spaces
    return re.sub(r'\s+', ' ', text).strip()

# Load and preprocess data
# Number of leading rows of train.csv that are held out as a local test split.
HOLDOUT_SIZE = 20000

train = pd.read_csv('train.csv')
# .copy() makes each split an independent frame, so the column assignments below
# write to the split itself instead of to a slice of the loaded frame
# (avoids pandas' SettingWithCopyWarning / chained-assignment ambiguity).
test = train.iloc[:HOLDOUT_SIZE].copy()
train = train.iloc[HOLDOUT_SIZE:].copy()
print(train.shape)
print(test.shape)
# Clean text fields
for df in [train, test]:
    for col in ['TITLE', 'DESCRIPTION', 'BULLET_POINTS']:
        # Missing values become '' first because clean_text requires a str.
        df[col] = df[col].fillna('').apply(clean_text)
    # Single combined text field used by the later length extraction and embedding steps.
    df['text'] = df['TITLE'] + ' ' + df['DESCRIPTION'] + ' ' + df['BULLET_POINTS']

# Length extraction with enhanced regex
# One capture group (the numeric value), then an optional whitespace character and
# a unit token. The value may be a mixed number ("20 1/2"), a plain fraction
# ("1/2") or an integer/decimal ("20", "20.5"); alternatives are tried in that order.
_LENGTH_PATTERN = re.compile(
    r'(\d+\s+\d+/\d+|\d+/\d+|\d+\.?\d*)\s?'
    r'(?:cm|centimeter|centimetre|inch|inches|in|"|´|´´|″|’|′|ft|feet|foot|'
    r'centímetros|pulgadas|zoll|centimètres|pouces)',
    re.IGNORECASE
)


def extract_length(text):
    """
    Return the median of all numeric values in ``text`` that are followed by a
    length unit.

    Every number directly followed (optionally after one whitespace character)
    by one of the unit tokens in ``_LENGTH_PATTERN`` is collected. Values are
    NOT converted between units; the raw numbers are used as-is.

    Fractions are only recognised when the ``/`` is still present in ``text``.

    Args:
        text (str): Text to scan.

    Returns:
        float: Median of the collected values, or ``np.nan`` when no value was
        found or none could be parsed.
    """
    lengths = []
    # With a single capture group, findall yields plain strings (one per match).
    for val in _LENGTH_PATTERN.findall(text):
        try:
            if '/' in val:
                # "20 1/2" -> whole=["20"], fraction="1/2"; "1/2" -> whole=[]
                *whole, fraction = val.split()
                num, den = fraction.split('/')
                value = float(num) / float(den)
                if whole:
                    value += float(whole[0])
            else:
                value = float(val)
        except (ValueError, ZeroDivisionError):
            # Unparseable number or zero denominator: skip this match only.
            continue
        lengths.append(value)
    return np.median(lengths) if lengths else np.nan

# Derive the numeric length feature from the combined text of each split
for frame in (train, test):
    frame['extracted_length'] = frame['text'].apply(extract_length)

# Rows without any extracted value are filled with the median of the train split
# (the same train-derived value is used for the test split)
train_length_median = train['extracted_length'].median()
for frame in (train, test):
    frame['extracted_length'] = frame['extracted_length'].fillna(train_length_median)






  from .autonotebook import tqdm as notebook_tqdm


(2229698, 6)
(20000, 6)


In [2]:
# BERT Embeddings with cleaned text
MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# The encoder gets its own dedicated name: get_bert_embeddings looks it up at call
# time, so it must not share a generic name that other code may rebind.
bert_model = AutoModel.from_pretrained(MODEL_NAME).to(device)

def get_bert_embeddings(texts, batch_size=64):
    """
    Encode texts into fixed-size vectors by attention-mask-weighted mean pooling
    of the encoder's last hidden state.

    Uses the module-level ``tokenizer``, ``bert_model`` and ``device``. Inputs
    are tokenized with padding and truncated to 128 tokens.

    Args:
        texts: Strings to encode; a pandas Series, NumPy array, list or any
            other iterable of ``str``.
        batch_size (int): Number of texts tokenized and encoded per forward pass.

    Returns:
        np.ndarray: 2-D array with one row per input text, in input order.
        For empty input the shape is ``(0, hidden_size)``.
    """
    texts = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
    if not texts:
        # np.concatenate raises on an empty list, so return an empty matrix explicitly.
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    bert_model.eval()
    embeddings = []

    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size]
        inputs = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=128,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            outputs = bert_model(**inputs)

        # Use mean pooling with attention mask: padded positions get weight 0
        attention_mask = inputs['attention_mask']
        last_hidden_state = outputs.last_hidden_state
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        # Clamp guards against division by zero if a row had no attended tokens
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        batch_embeddings = (sum_embeddings / sum_mask).cpu().numpy()

        embeddings.append(batch_embeddings)

    return np.concatenate(embeddings, axis=0)

# Generate embeddings
print("Generating BERT embeddings...")
train_emb = get_bert_embeddings(train['text'])
test_emb = get_bert_embeddings(test['text'])

# Combine features
# Tabular columns appended to the right of the text embedding, in this order.
TABULAR_FEATURES = ['PRODUCT_TYPE_ID', 'extracted_length']

def _build_feature_matrix(text_emb, frame):
    """Return `text_emb` with the TABULAR_FEATURES columns of `frame` appended column-wise."""
    return np.concatenate([text_emb, frame[TABULAR_FEATURES].values], axis=1)

X_train = _build_feature_matrix(train_emb, train)
X_test = _build_feature_matrix(test_emb, test)

y_train = train['PRODUCT_LENGTH']

# LightGBM with GPU
model = lgb.LGBMRegressor(
    objective='regression',
    metric='mae',
    num_leaves=255,
    learning_rate=0.05,
    feature_fraction=0.7,
    bagging_fraction=0.7,
    # LightGBM only applies bagging_fraction when bagging_freq is non-zero;
    # 1 = draw a new row subsample at every iteration.
    bagging_freq=1,
    device='gpu',
    n_estimators=1000
)

model.fit(X_train, y_train)




To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Generating BERT embeddings...


100%|██████████| 34840/34840 [1:21:48<00:00,  7.10it/s]
100%|██████████| 313/313 [00:43<00:00,  7.24it/s]


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 196114
[LightGBM] [Info] Number of data points in the train set: 2229698, number of used features: 770
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4050 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 769 dense feature groups (1641.59 MB) transferred to GPU in 0.729651 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 4092.052060


In [3]:
# Generate predictions
test_pred = model.predict(X_test)
test_pred = np.maximum(test_pred, 0)  # Ensure non-negative lengths

# Blend with extracted lengths where available.
# test['extracted_length'] has already been median-imputed and therefore contains
# no nulls, so testing it with notnull() would overwrite every model prediction.
# Re-run the extractor on the text to find the rows that had a real measurement.
has_extracted_length = test['text'].apply(extract_length).notna().to_numpy()
test_pred = np.where(
    has_extracted_length,
    test['extracted_length'].to_numpy(),
    test_pred
)

# Create submission
submission = pd.DataFrame({
    'PRODUCT_ID': test['PRODUCT_ID'],
    # Ground-truth target of the held-out rows, kept next to the prediction for
    # comparison (assumed intent of this column - confirm).
    'PRODUCT REAL LENGTH': test['PRODUCT_LENGTH'],
    'PRODUCT_LENGTH': test_pred
})
submission.to_csv('submission.csv', index=False)



