# üì¶ Imports and Setup

In [1]:
!pip install lightgbm -q


In [2]:

import os
import re
import gc
import json
import math
import time
import string
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import sparse

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import Ridge

import lightgbm as lgb
import joblib


# ‚öôÔ∏è Utility Function and Data Loading

In [3]:

def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error in percent."""
    y_true = np.array(y_true, dtype=float)
    y_pred = np.array(y_pred, dtype=float)
    num = np.abs(y_pred - y_true)
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    denom = np.where(denom == 0, 1e-6, denom)
    return np.mean(num / denom) * 100.0

TRAIN_FILE = Path('train.csv')
TEST_FILE = Path('test.csv')

train = pd.read_csv(TRAIN_FILE, engine='python', on_bad_lines='warn')
test = pd.read_csv(TEST_FILE, engine='python', on_bad_lines='warn')

print('train', train.shape)
print('test ', test.shape)


train (75000, 4)
test  (75000, 3)


# üß† Enhanced Feature Engineering

In [4]:

PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))
NUM_RE = re.compile(r"(\d+[\.,]?\d*)")

def clean_text(s):
    if pd.isna(s):
        return ''
    s = str(s).lower()
    s = s.replace('\n', ' ').replace('\r', ' ')
    s = re.sub(r'\s+', ' ', s).strip()
    return s

def count_numbers(s):
    return len(NUM_RE.findall(s or ''))

def extract_numbers(s):
    if not s:
        return []
    return [float(x.replace(',', '')) for x in NUM_RE.findall(str(s))]

def extract_units(s):
    if not s:
        return {}
    s = str(s).lower()
    units = {
        'oz': len(re.findall(r'\b\d+[\.\d]*\s*oz\b', s)),
        'lb': len(re.findall(r'\b\d+[\.\d]*\s*lb\b', s)),
        'g': len(re.findall(r'\b\d+[\.\d]*\s*g\b', s)),
        'kg': len(re.findall(r'\b\d+[\.\d]*\s*kg\b', s)),
        'ml': len(re.findall(r'\b\d+[\.\d]*\s*ml\b', s)),
        'l': len(re.findall(r'\b\d+[\.\d]*\s*l\b', s)),
        'count': len(re.findall(r'\b\d+[\.\d]*\s*count\b', s)),
        'pack': len(re.findall(r'pack\s*of\s*\d+|\d+\s*pack', s))
    }
    return units

def extract_value_unit(s):
    if not s:
        return -1, 'unknown'
    value_match = re.search(r'value:\s*(\d+[\.\d]*)', str(s).lower())
    unit_match = re.search(r'unit:\s*(\w+)', str(s).lower())
    value = float(value_match.group(1)) if value_match else -1
    unit = unit_match.group(1) if unit_match else 'unknown'
    return value, unit


In [5]:

train['catalog_clean'] = train['catalog_content'].fillna('').map(clean_text)
test['catalog_clean'] = test['catalog_content'].fillna('').map(clean_text)

train['text_len'] = train['catalog_clean'].str.len().fillna(0)
train['word_count'] = train['catalog_clean'].str.split().map(lambda x: len(x) if isinstance(x, list) else 0)
train['num_count'] = train['catalog_clean'].map(count_numbers)

test['text_len'] = test['catalog_clean'].str.len().fillna(0)
test['word_count'] = test['catalog_clean'].str.split().map(lambda x: len(x) if isinstance(x, list) else 0)
test['num_count'] = test['catalog_clean'].map(count_numbers)

def extract_numeric_features(df):
    all_nums = df['catalog_clean'].map(extract_numbers)
    df['num_max'] = all_nums.map(lambda x: max(x) if x else -1)
    df['num_min'] = all_nums.map(lambda x: min(x) if x else -1)
    df['num_mean'] = all_nums.map(lambda x: np.mean(x) if x else -1)
    df['num_sum'] = all_nums.map(lambda x: sum(x) if x else -1)

    units_data = df['catalog_clean'].map(extract_units)
    for unit in ['oz', 'lb', 'g', 'kg', 'ml', 'l', 'count', 'pack']:
        df[f'has_{unit}'] = units_data.map(lambda x: x.get(unit, 0))

    value_unit = df['catalog_clean'].map(extract_value_unit)
    df['product_value'] = value_unit.map(lambda x: x[0])
    df['product_unit'] = value_unit.map(lambda x: x[1])
    return df

train = extract_numeric_features(train)
test = extract_numeric_features(test)


# üè∑Ô∏è Additional Features (IPQ, Brand, Units)

In [6]:

IPQ_PATTERNS = [
    r"pack of\s*(\d+)",
    r"\b(\d+)\s*pack\b",
    r"\b(\d+)\s*pcs?\b",
    r"\b(\d+)\s*count\b",
    r"\b(\d+)[\\s-]?ml\b",
    r"\b(\d+)[\\s-]?g\b",
    r"\b(\d+)\s*x\b",
    r"\((\d+)\)",
]

def extract_ipq_val(s):
    if not s:
        return -1
    s = str(s)
    for pat in IPQ_PATTERNS:
        m = re.search(pat, s)
        if m:
            try:
                return int(m.group(1))
            except:
                continue
    return -1

train['ipq'] = train['catalog_clean'].map(extract_ipq_val)
test['ipq'] = test['catalog_clean'].map(extract_ipq_val)

def guess_brand(s):
    if not s:
        return 'unknown'
    s0 = s.split(' - ')[0]
    s0 = s0.split(':')[0]
    s0 = s0.split('|')[0]
    m = re.search(r"by\s+([a-z0-9\-\&]+)", s0)
    if m:
        return m.group(1)
    tok = s0.split()[0] if s0.split() else 'unknown'
    return tok

train['brand_guess'] = train['catalog_clean'].map(guess_brand)
test['brand_guess'] = test['catalog_clean'].map(guess_brand)

brand_counts = train['brand_guess'].value_counts().to_dict()
train['brand_freq'] = train['brand_guess'].map(lambda x: brand_counts.get(x, 0))
test['brand_freq'] = test['brand_guess'].map(lambda x: brand_counts.get(x, 0))

unit_counts = train['product_unit'].value_counts().to_dict()
train['unit_freq'] = train['product_unit'].map(lambda x: unit_counts.get(x, 0))
test['unit_freq'] = test['product_unit'].map(lambda x: unit_counts.get(x, 0))

train['price_per_word'] = train['price'] / (train['word_count'] + 1)
train['price_per_num'] = train['price'] / (train['num_count'] + 1)

test['price_per_word'] = -1
test['price_per_num'] = -1


# üî† TF-IDF and SVD Feature Extraction

In [8]:

print("Creating TF-IDF features...")
word_vectorizer = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1, 3),
    min_df=1,
    max_df=0.9,
    sublinear_tf=True
)

char_vectorizer = TfidfVectorizer(
    max_features=20000,
    analyzer='char',
    ngram_range=(3, 5),
    min_df=2,
    sublinear_tf=True
)

X_word_train = word_vectorizer.fit_transform(train['catalog_clean'].fillna(''))
X_word_test = word_vectorizer.transform(test['catalog_clean'].fillna(''))

X_char_train = char_vectorizer.fit_transform(train['catalog_clean'].fillna(''))
X_char_test = char_vectorizer.transform(test['catalog_clean'].fillna(''))

print("Applying SVD...")
svd = TruncatedSVD(n_components=50, random_state=42)
X_svd_train = svd.fit_transform(X_word_train)
X_svd_test = svd.transform(X_word_test)


Creating TF-IDF features...
Applying SVD...


# üß© Combine Features

In [9]:

num_features = [
    'text_len', 'word_count', 'num_count', 'ipq', 'brand_freq',
    'num_max', 'num_min', 'num_mean', 'num_sum',
    'has_oz', 'has_lb', 'has_g', 'has_kg', 'has_ml', 'has_l', 'has_count', 'has_pack',
    'product_value', 'unit_freq'
]

X_num_train = train[num_features].fillna(-1).values
X_num_test = test[num_features].fillna(-1).values

scaler = StandardScaler()
X_num_train = scaler.fit_transform(X_num_train)
X_num_test = scaler.transform(X_num_test)

X_train = sparse.hstack([
    X_word_train,
    X_char_train,
    sparse.csr_matrix(X_svd_train),
    sparse.csr_matrix(X_num_train)
]).tocsr()

X_test = sparse.hstack([
    X_word_test,
    X_char_test,
    sparse.csr_matrix(X_svd_test),
    sparse.csr_matrix(X_num_test)
]).tocsr()

y = np.log1p(train['price'].values)

print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)


X_train shape: (75000, 70069)
X_test shape: (75000, 70069)


# üöÄ LightGBM Model Training

In [10]:

lgb_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.03,
    'num_leaves': 63,
    'max_depth': 8,
    'min_data_in_leaf': 30,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'lambda_l1': 0.5,
    'lambda_l2': 0.5,
    'min_gain_to_split': 0.01,
    'verbose': -1,
    'n_jobs': -1,
    'force_row_wise': True
}

NFOLD = 5
kf = KFold(n_splits=NFOLD, shuffle=True, random_state=42)

preds_oof = np.zeros(len(train))
preds_test = np.zeros(len(test))

print("\nStarting cross-validation...")
for fold, (tr_idx, val_idx) in enumerate(kf.split(X_train)):
    print(f'\n{"="*50}')
    print(f'Fold {fold+1}/{NFOLD}')
    print(f'{"="*50}')
    
    X_tr = X_train[tr_idx]
    X_val = X_train[val_idx]
    y_tr = y[tr_idx]
    y_val = y[val_idx]
    
    dtrain = lgb.Dataset(X_tr, label=y_tr)
    dval = lgb.Dataset(X_val, label=y_val, reference=dtrain)
    
    model = lgb.train(
        lgb_params,
        dtrain,
        num_boost_round=5000,
        valid_sets=[dtrain, dval],
        valid_names=['train', 'valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=150, verbose=False),
            lgb.log_evaluation(period=200)
        ]
    )
    
    preds_oof[val_idx] = model.predict(X_val, num_iteration=model.best_iteration)
    preds_test += model.predict(X_test, num_iteration=model.best_iteration) / NFOLD
    
    fold_smape = smape(np.expm1(y_val), np.expm1(preds_oof[val_idx]))
    print(f'Fold {fold+1} SMAPE: {fold_smape:.4f}')
    
    joblib.dump(model, f'lgb_enhanced_fold{fold+1}.pkl')
    
    del dtrain, dval, X_tr, X_val, y_tr, y_val
    gc.collect()



Starting cross-validation...

Fold 1/5
[200]	train's rmse: 0.654866	valid's rmse: 0.720837
[400]	train's rmse: 0.613359	valid's rmse: 0.703382
[600]	train's rmse: 0.586309	valid's rmse: 0.694516
[800]	train's rmse: 0.564076	valid's rmse: 0.689256
[1000]	train's rmse: 0.547071	valid's rmse: 0.685004
[1200]	train's rmse: 0.530883	valid's rmse: 0.681949
[1400]	train's rmse: 0.515839	valid's rmse: 0.679292
[1600]	train's rmse: 0.50316	valid's rmse: 0.677697
[1800]	train's rmse: 0.491209	valid's rmse: 0.676065
[2000]	train's rmse: 0.479708	valid's rmse: 0.674702
[2200]	train's rmse: 0.46848	valid's rmse: 0.673308
[2400]	train's rmse: 0.45795	valid's rmse: 0.672301
[2600]	train's rmse: 0.448197	valid's rmse: 0.671604
[2800]	train's rmse: 0.438236	valid's rmse: 0.671069
[3000]	train's rmse: 0.429111	valid's rmse: 0.670576
[3200]	train's rmse: 0.420858	valid's rmse: 0.670223
[3400]	train's rmse: 0.411974	valid's rmse: 0.669797
[3600]	train's rmse: 0.403663	valid's rmse: 0.669387
[3800]	train'

# üìä Evaluation and Submission

In [11]:

train_pred_price = np.expm1(preds_oof)
cv_smape = smape(train['price'].values, train_pred_price)
print(f'\n{"="*50}')
print(f'Enhanced Model CV SMAPE: {cv_smape:.4f}')
print(f'{"="*50}')

test_pred_price = np.clip(np.expm1(preds_test), 0.01, None)
submission = pd.DataFrame({
    'sample_id': test['sample_id'],
    'price': test_pred_price
})
submission.to_csv('test_out_enhanced.csv', index=False)
print('\nSaved test_out_enhanced.csv')

train_oof = pd.DataFrame({
    'sample_id': train['sample_id'],
    'price': train['price'],
    'pred_price': train_pred_price
})
train_oof.to_csv('train_oof_enhanced.csv', index=False)
print('Saved train_oof_enhanced.csv')



Enhanced Model CV SMAPE: 49.6720

Saved test_out_enhanced.csv
Saved train_oof_enhanced.csv
