In [None]:
import pandas as pd

train_path = '../../../dataset/train.csv'
test_path = '../../../dataset/test.csv'

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path) 
    
print("Training data loaded successfully!")
print(train_df.head())


Training data loaded successfully!
   sample_id                                    catalog_content  \
0      33127  Item Name: La Victoria Green Taco Sauce Mild, ...   
1     198967  Item Name: Salerno Cookies, The Original Butte...   
2     261251  Item Name: Bear Creek Hearty Soup Bowl, Creamy...   
3      55858  Item Name: Judeeâ€™s Blue Cheese Powder 11.25 oz...   
4     292686  Item Name: kedem Sherry Cooking Wine, 12.7 Oun...   

                                          image_link  price  
0  https://m.media-amazon.com/images/I/51mo8htwTH...   4.89  
1  https://m.media-amazon.com/images/I/71YtriIHAA...  13.12  
2  https://m.media-amazon.com/images/I/51+PFEe-w-...   1.97  
3  https://m.media-amazon.com/images/I/41mu0HAToD...  30.34  
4  https://m.media-amazon.com/images/I/41sA037+Qv...  66.49  


In [2]:
import re
import numpy as np

df = pd.read_csv('/kaggle/input/student-resource/student_resource/dataset/train.csv')


def extract_ipq(text):
    match = re.search(r'\(Pack of (\d+)\)|(\d+) per case|PK- (\d+)', str(text), re.IGNORECASE)
    if match:
        return int(next(item for item in match.groups() if item is not None))
    return 1

def extract_quantity_oz(text):
    match = re.search(r'(\d+\.?\d*)\s*(Ounce|Oz|Pound)', str(text), re.IGNORECASE)
    if match:
        value = float(match.group(1))
        unit = match.group(2).lower()
        if 'pound' in unit:
            return value * 16
        return value
    return 

df['ipq'] = df['catalog_content'].apply(extract_ipq)
df['quantity_oz'] = df['catalog_content'].apply(extract_quantity_oz)

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'item name:|bullet point \d+:', '', text) 
    text = re.sub(r'[^a-z0-9\s]', '', text) 
    return text

df['cleaned_text'] = df['catalog_content'].apply(clean_text)

print("Feature engineering complete.")
print(df[['ipq', 'quantity_oz', 'cleaned_text']].head())

Feature engineering complete.
   ipq  quantity_oz                                       cleaned_text
0    6        12.00   la victoria green taco sauce mild 12 ounce pa...
1    4         8.00   salerno cookies the original butter cookies 8...
2    6         1.90   bear creek hearty soup bowl creamy chicken wi...
3    1        11.25   judees blue cheese powder 1125 oz  glutenfree...
4   12        12.70   kedem sherry cooking wine 127 ounce  12 per c...


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix


vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
text_features = vectorizer.fit_transform(df['cleaned_text'])
numerical_features = df[['ipq', 'quantity_oz']].values
X = hstack([text_features, csr_matrix(numerical_features)])
y = np.log1p(df['price'])

print(f"Shape of our final feature matrix (X): {X.shape}")
print(f"Shape of our final target vector (y): {y.shape}")

Vectorizing text features...
Combining text and numerical features...

Data preparation complete.
Shape of our final feature matrix (X): (75000, 5002)
Shape of our final target vector (y): (75000,)


In [4]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42
)

print(f"Training features shape: {X_train.shape}")
print(f"Validation features shape: {X_val.shape}")

Data splitting complete.
Training features shape: (60000, 5002)
Validation features shape: (15000, 5002)


In [5]:
import lightgbm as lgb

def smape(y_true, y_pred):
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    return np.mean(numerator / (denominator + 1e-8)) * 100

# --- LightGBM Model ---
lgb_params = {
    'objective': 'regression_l1',
    'metric': 'rmse',
    'n_estimators': 2000,
    'learning_rate': 0.02,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'verbose': -1,
    'n_jobs': -1,
    'seed': 42
}

model = lgb.LGBMRegressor(**lgb_params)

model.fit(X_train, y_train,
          eval_set=[(X_val, y_val)],
          eval_metric='rmse',
          callbacks=[lgb.early_stopping(100, verbose=True)])

log_preds = model.predict(X_val)

y_val_actual = np.expm1(y_val)
final_preds = np.expm1(log_preds)

smape_score = smape(y_val_actual, final_preds)
print(f"\nValidation SMAPE Score: {smape_score:.4f}%")

Training LightGBM model...
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[2000]	valid_0's rmse: 0.719153

Evaluating model performance...

Validation SMAPE Score: 53.2827%


In [None]:
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# --- 1. Advanced Feature Engineering ---
print("Running advanced feature engineering...")

# Extract Brand: A simple heuristic is to take the first 2-3 words after "Item Name:"
def extract_brand(text):
    try:
        # Find text after "Item Name:" and before the next potential separator (like ',')
        name_part = re.search(r'Item Name:\s*([^,]+)', str(text)).group(1)
        # Take the first two words as a potential brand
        return ' '.join(name_part.split()[:2])
    except:
        return 'unknown'

df['brand'] = df['catalog_content'].apply(extract_brand)

# Convert the categorical brand names into numerical codes
df['brand_code'], _ = pd.factorize(df['brand'])

# (Assuming ipq, quantity_oz, and cleaned_text are already created)
print(df[['brand', 'brand_code']].head())


# --- 2. Improved Text Vectorization ---
print("\nRunning improved text vectorization with n-grams...")
vectorizer = TfidfVectorizer(
    max_features=10000,      # Increased features
    stop_words='english',
    ngram_range=(1, 2)       # Captures two-word phrases
)
text_features = vectorizer.fit_transform(df['cleaned_text'])

# Combine ALL features: TF-IDF + ipq + quantity_oz + brand_code
numerical_features = df[['ipq', 'quantity_oz', 'brand_code']].values
X = hstack([text_features, csr_matrix(numerical_features)])
y = np.log1p(df['price'])
print(f"New feature matrix shape: {X.shape}")


# --- 3. Split Data ---
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


# --- 4. Train with Updated Parameters ---
print("\nTraining LightGBM with updated parameters...")
lgb_params = {
    'objective': 'regression_l1',
    'metric': 'rmse',
    'n_estimators': 25000,      # More estimators
    'learning_rate': 0.01,    # Slower learning rate
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'num_leaves': 31,
    'verbose': -1,
    'n_jobs': -1,
    'seed': 42
}

model = lgb.LGBMRegressor(**lgb_params)
model.fit(X_train, y_train,
          eval_set=[(X_val, y_val)],
          eval_metric='rmse',
          callbacks=[lgb.early_stopping(100, verbose=True)])

# --- 5. Evaluate ---
def smape(y_true, y_pred):
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    return np.mean(numerator / (denominator + 1e-8)) * 100

print("\nEvaluating new model...")
log_preds = model.predict(X_val)
y_val_actual = np.expm1(y_val)
final_preds = np.expm1(log_preds)
smape_score = smape(y_val_actual, final_preds)

print(f"\nNew Validation SMAPE Score: {smape_score:.4f}%")