In [None]:
import pandas as pd
import numpy as np
import re
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import optuna
# Filesystem layout: everything hangs off BASE_PATH; the CSVs live in a
# "Dataset" subfolder and the image-feature .npy files sit directly under it.
BASE_PATH = r'C:\Users\inb20\OneDrive\Desktop\Amazon Hackathon'
DATA_PATH = BASE_PATH + r'\Dataset'
TRAIN_PATH = DATA_PATH + r'\train.csv'
TEST_PATH = DATA_PATH + r'\test.csv'
TRAIN_IMG_FEATURES_PATH = BASE_PATH + r'\train_image_features.npy'
TEST_IMG_FEATURES_PATH = BASE_PATH + r'\test_image_features.npy'
print("Environment setup is complete.")

print("ENGINEERING ALL FEATURES")
start_time = time.time()

# Train rows are stacked above test rows so each text feature is computed in
# a single pass; rows [0, len(train_df)) are train, the remainder are test.
train_df, test_df = pd.read_csv(TRAIN_PATH), pd.read_csv(TEST_PATH)
all_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

# Missing catalog text becomes '' so the string operations below never see NaN.
all_df['original_content'] = all_df['catalog_content'].fillna('')
original_text = all_df['original_content']
all_df['clean_content'] = original_text.str.lower()
all_df['text_length'] = original_text.str.len()
all_df['word_count'] = original_text.map(lambda doc: len(doc.split()))
# Share of upper-case characters; the 1e-6 keeps empty strings from dividing by zero.
all_df['capital_ratio'] = original_text.map(
    lambda doc: sum(ch.isupper() for ch in doc) / (len(doc) + 1e-6)
)
def extract_ipq(text):
    """Return the item-pack quantity mentioned in ``text``, or 1 if none is found.

    ``text`` is coerced with ``str()`` and lower-cased, then the patterns below
    are tried in the listed order; the number captured by the first pattern
    that matches anywhere in the text is returned as an ``int``. Pattern order
    takes precedence over position in the text, and the patterns are not
    anchored to word boundaries (so "12 packaging" yields 12).
    """
    lowered = str(text).lower()
    quantity_patterns = (
        r'pack of (\d+)',
        r'(\d+)\s*pack',
        r'(\d+)\s*count',
        r'set of (\d+)',
        r'(\d+)\s*ct',
        r'(\d+)\s*pk',
    )
    # Lazy generator: stop searching as soon as one pattern hits.
    hits = (re.search(pattern, lowered) for pattern in quantity_patterns)
    first_hit = next((hit for hit in hits if hit), None)
    return int(first_hit.group(1)) if first_hit else 1
# Item-pack quantity parsed from the lower-cased text by extract_ipq.
all_df['ipq'] = all_df['clean_content'].map(extract_ipq)

# Keyword groups; each group becomes one 0/1 column named kw_<group>.
keywords = {
    'quality': ['premium', 'organic', 'heavy-duty', 'professional', 'gourmet', 'handmade', 'luxury'],
    'bundling': ['set', 'bundle', 'kit', 'combo', 'pack'],
    'condition': ['refurbished', 'new', 'generic', 'compatible'],
}
for category, words in keywords.items():
    # Plain substring test against the lower-cased text, not a whole-word match.
    all_df[f'kw_{category}'] = all_df['clean_content'].map(
        lambda doc: int(any(word in doc for word in words))
    )

# Uni- and bi-gram TF-IDF fitted on train and test text together, capped at
# 30,000 terms.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=30000,
    stop_words='english',
    token_pattern=r'\b[a-zA-Z0-9]+\b',
)
text_features_tfidf = tfidf_vectorizer.fit_transform(all_df['clean_content'])

# Hand-crafted columns are appended to the right of the TF-IDF columns.
additional_features_df = all_df[
    ['text_length', 'word_count', 'capital_ratio', 'ipq', 'kw_quality', 'kw_bundling', 'kw_condition']
]
additional_features_sparse = csr_matrix(additional_features_df.to_numpy())
full_text_features = hstack((text_features_tfidf, additional_features_sparse), format='csr')

# Precomputed image features; assumed to be row-aligned with train.csv and
# test.csv respectively — TODO confirm against the script that produced them.
train_image_features, test_image_features = (
    np.load(TRAIN_IMG_FEATURES_PATH),
    np.load(TEST_IMG_FEATURES_PATH),
)
train_image_sparse, test_image_sparse = map(
    csr_matrix, (train_image_features, test_image_features)
)

# The first len(train_df) rows of the stacked text features are train rows,
# the rest are test rows; image columns are appended on the right.
n_train_rows = len(train_df)
x_train_final = hstack((full_text_features[:n_train_rows], train_image_sparse), format='csr')
x_test_final = hstack((full_text_features[n_train_rows:], test_image_sparse), format='csr')
# The regression target is log1p(price).
y_train = np.log1p(train_df['price'])

print(f"Feature engineering complete in {time.time() - start_time:.2f} seconds.")
print(f"Final training data shape: {x_train_final.shape}")
print("\n FINDING BEST HYPERPARAMETERS WITH OPTUNA")
start_time = time.time()
# Fixed 80/20 hold-out shared by every Optuna trial.
x_train_part, x_val, y_train_part, y_val = train_test_split(
    x_train_final,
    y_train,
    test_size=0.2,
    random_state=42,
)

def objective(trial):
    """Optuna objective: fit one LightGBM model and return its hold-out RMSE.

    Samples a hyperparameter set from ``trial``, trains an ``LGBMRegressor``
    on the module-level ``x_train_part`` / ``y_train_part`` with early
    stopping (patience 100) monitored on ``x_val`` / ``y_val``, and returns
    the root-mean-squared error of its predictions on the validation split.
    The model optimises an L1 objective while RMSE is the monitored metric.
    """
    # Searched hyperparameters (suggestion order kept stable for the sampler).
    searched = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
    }
    # Settings that are identical for every trial.
    fixed = {
        'objective': 'regression_l1',
        'metric': 'rmse',
        'n_estimators': 2000,
        'verbose': -1,
        'n_jobs': -1,
        'seed': 42,
    }
    regressor = lgb.LGBMRegressor(**fixed, **searched)
    stop_early = lgb.early_stopping(100, verbose=False)
    regressor.fit(
        x_train_part,
        y_train_part,
        eval_set=[(x_val, y_val)],
        eval_metric='rmse',
        callbacks=[stop_early],
    )
    residuals = y_val - regressor.predict(x_val)
    return np.sqrt(np.mean(residuals ** 2))

# Run the hyperparameter search: 30 sequential trials minimising the
# validation RMSE returned by objective().
# The sampler is seeded so the search is repeatable, in line with the
# random_state=42 split and seed=42 LightGBM settings used above. TPESampler
# is already Optuna's default sampler, so only the seed is new here.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)

# study.best_params holds only the values suggested through `trial`, not the
# fixed settings hard-coded inside objective().
best_lgbm_params = study.best_params
print(f"Optuna study complete in {time.time() - start_time:.2f} seconds.")
print(f"Best RMSE: {study.best_value}")
print(f"Best Hyperparameters: {best_lgbm_params}")

  from .autonotebook import tqdm as notebook_tqdm


Environment setup is complete.

 ENGINEERING ALL FEATURES
Feature engineering complete in 97.68 seconds.
Final training data shape: (75000, 31287)

HYPERPARAMETER TUNING WITH OPTUNA

 FINDING BEST HYPERPARAMETERS WITH OPTUNA


[I 2025-10-12 18:58:19,905] A new study created in memory with name: no-name-2a66e17f-9319-4c89-9faa-ccf18127daa1
[I 2025-10-12 19:23:14,469] Trial 0 finished with value: 0.6993860796916658 and parameters: {'learning_rate': 0.023720756935600704, 'num_leaves': 28, 'feature_fraction': 0.9855033467898788, 'bagging_fraction': 0.6035736005491347, 'bagging_freq': 5, 'lambda_l1': 1.5013765515260067e-08, 'lambda_l2': 1.2651180244396026e-07}. Best is trial 0 with value: 0.6993860796916658.
[I 2025-10-12 20:02:33,012] Trial 1 finished with value: 0.688836736781493 and parameters: {'learning_rate': 0.029085173610015926, 'num_leaves': 88, 'feature_fraction': 0.7632082610542106, 'bagging_fraction': 0.6077864479063269, 'bagging_freq': 1, 'lambda_l1': 0.0006559987906164, 'lambda_l2': 1.7284339762592815e-06}. Best is trial 1 with value: 0.688836736781493.
[I 2025-10-12 20:52:11,552] Trial 2 finished with value: 0.6874484507723471 and parameters: {'learning_rate': 0.09975535529115606, 'num_leaves': 96,

Optuna study complete in 62535.20 seconds.
Best RMSE: 0.6830237637072452
Best Hyperparameters: {'learning_rate': 0.06436419745931313, 'num_leaves': 83, 'feature_fraction': 0.670787164561559, 'bagging_fraction': 0.8862372384842653, 'bagging_freq': 2, 'lambda_l1': 2.1147966897639476e-07, 'lambda_l2': 0.0006020821911774535}

 TRAINING ENSEMBLE OF MODELS...
Training final LightGBM model...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[7896]	valid_0's rmse: 0.677654
LightGBM trained and saved in 8692.09 seconds.
Training XGBoost model...


TypeError: XGBModel.fit() got an unexpected keyword argument 'early_stopping_rounds'

In [None]:
import pandas as pd
import numpy as np
import re
import time
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split
import lightgbm as lgb
# Filesystem layout: everything hangs off BASE_PATH; the CSVs live in a
# "Dataset" subfolder and the image-feature .npy files sit directly under it.
BASE_PATH = r'C:\Users\inb20\OneDrive\Desktop\Amazon Hackathon'
DATA_PATH = BASE_PATH + r'\Dataset'
TRAIN_PATH = DATA_PATH + r'\train.csv'
TEST_PATH = DATA_PATH + r'\test.csv'
TRAIN_IMG_FEATURES_PATH = BASE_PATH + r'\train_image_features.npy'
TEST_IMG_FEATURES_PATH = BASE_PATH + r'\test_image_features.npy'
print("Environment setup is complete.")

print("\n ENGINEERING ALL FEATURES")
start_time = time.time()

# Train rows are stacked above test rows so each text feature is computed in
# a single pass; rows [0, len(train_df)) are train, the remainder are test.
train_df, test_df = pd.read_csv(TRAIN_PATH), pd.read_csv(TEST_PATH)
all_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

# Missing catalog text becomes '' so the string operations below never see NaN.
all_df['original_content'] = all_df['catalog_content'].fillna('')
original_text = all_df['original_content']
all_df['clean_content'] = original_text.str.lower()
all_df['text_length'] = original_text.str.len()
all_df['word_count'] = original_text.map(lambda doc: len(doc.split()))
# Share of upper-case characters; the 1e-6 keeps empty strings from dividing by zero.
all_df['capital_ratio'] = original_text.map(
    lambda doc: sum(ch.isupper() for ch in doc) / (len(doc) + 1e-6)
)
def extract_ipq(text):
    """Return the item-pack quantity mentioned in ``text``, or 1 if none is found.

    ``text`` is coerced with ``str()`` and lower-cased, then the patterns below
    are tried in the listed order; the number captured by the first pattern
    that matches anywhere in the text is returned as an ``int``. Pattern order
    takes precedence over position in the text, and the patterns are not
    anchored to word boundaries (so "12 packaging" yields 12).
    """
    lowered = str(text).lower()
    quantity_patterns = (
        r'pack of (\d+)',
        r'(\d+)\s*pack',
        r'(\d+)\s*count',
        r'set of (\d+)',
        r'(\d+)\s*ct',
        r'(\d+)\s*pk',
    )
    # Lazy generator: stop searching as soon as one pattern hits.
    hits = (re.search(pattern, lowered) for pattern in quantity_patterns)
    first_hit = next((hit for hit in hits if hit), None)
    return int(first_hit.group(1)) if first_hit else 1
# Item-pack quantity parsed from the lower-cased text by extract_ipq.
all_df['ipq'] = all_df['clean_content'].map(extract_ipq)

# Keyword groups; each group becomes one 0/1 column named kw_<group>.
keywords = {
    'quality': ['premium', 'organic', 'heavy-duty', 'professional', 'gourmet', 'handmade', 'luxury'],
    'bundling': ['set', 'bundle', 'kit', 'combo', 'pack'],
    'condition': ['refurbished', 'new', 'generic', 'compatible'],
}
for category, words in keywords.items():
    # Plain substring test against the lower-cased text, not a whole-word match.
    all_df[f'kw_{category}'] = all_df['clean_content'].map(
        lambda doc: int(any(word in doc for word in words))
    )

# Uni- and bi-gram TF-IDF fitted on train and test text together, capped at
# 30,000 terms.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=30000,
    stop_words='english',
    token_pattern=r'\b[a-zA-Z0-9]+\b',
)
text_features_tfidf = tfidf_vectorizer.fit_transform(all_df['clean_content'])

# Hand-crafted columns are appended to the right of the TF-IDF columns.
additional_features_df = all_df[
    ['text_length', 'word_count', 'capital_ratio', 'ipq', 'kw_quality', 'kw_bundling', 'kw_condition']
]
additional_features_sparse = csr_matrix(additional_features_df.to_numpy())
full_text_features = hstack((text_features_tfidf, additional_features_sparse), format='csr')

# Precomputed image features; assumed to be row-aligned with train.csv and
# test.csv respectively — TODO confirm against the script that produced them.
train_image_features, test_image_features = (
    np.load(TRAIN_IMG_FEATURES_PATH),
    np.load(TEST_IMG_FEATURES_PATH),
)
train_image_sparse, test_image_sparse = map(
    csr_matrix, (train_image_features, test_image_features)
)

# The first len(train_df) rows of the stacked text features are train rows,
# the rest are test rows; image columns are appended on the right.
n_train_rows = len(train_df)
x_train_final = hstack((full_text_features[:n_train_rows], train_image_sparse), format='csr')
x_test_final = hstack((full_text_features[n_train_rows:], test_image_sparse), format='csr')
# The regression target is log1p(price).
y_train = np.log1p(train_df['price'])

print(f"Feature engineering complete in {time.time() - start_time:.2f} seconds.")
print("\n TRAINING FINAL LIGHTGBM MODEL")
# 10% of the training rows are held out purely as the early-stopping monitor.
x_train_part, x_val, y_train_part, y_val = train_test_split(
    x_train_final,
    y_train,
    test_size=0.1,
    random_state=42,
)
# Tuned hyperparameters, hard-coded here so this cell runs without repeating
# a hyperparameter search.
best_lgbm_params = {
    'learning_rate': 0.06436419745931313,
    'num_leaves': 83,
    'feature_fraction': 0.670787164561559,
    'bagging_fraction': 0.8862372384842653,
    'bagging_freq': 2,
    'lambda_l1': 2.1147966897639476e-07,
    'lambda_l2': 0.0006020821911774535,
}
print("Training final LightGBM model")
start_time = time.time()

# Tuned values plus the fixed settings; n_estimators is only an upper bound
# because early stopping (patience 100 on the validation RMSE) ends training.
lgbm_final_params = dict(
    best_lgbm_params,
    n_estimators=15000,
    objective='regression_l1',
    metric='rmse',
    seed=42,
    n_jobs=-1,
    verbose=-1,
)
lgbm_model = lgb.LGBMRegressor(**lgbm_final_params)
lgbm_model.fit(
    x_train_part,
    y_train_part,
    eval_set=[(x_val, y_val)],
    eval_metric='rmse',
    callbacks=[lgb.early_stopping(100, verbose=True)],
)

# Persist the fitted model so predictions can be regenerated without retraining.
joblib.dump(lgbm_model, BASE_PATH + r'\lgbm_final_model.pkl')
print(f"LightGBM trained and saved in {time.time() - start_time:.2f} seconds.")

print("\n GENERATING FINAL SUBMISSION")
predictions_log = lgbm_model.predict(x_test_final)
# The model was fit on log1p(price), so expm1 maps predictions back to the
# price scale; anything that lands below zero is floored at 0.
unclipped_prices = np.expm1(predictions_log)
final_predictions = np.where(unclipped_prices < 0, 0, unclipped_prices)

SUBMISSION_PATH = BASE_PATH + r'\final_lgbm_only_submission.csv'
submission_df = pd.DataFrame(
    {
        'sample_id': test_df['sample_id'],
        'price': final_predictions,
    }
)
submission_df.to_csv(SUBMISSION_PATH, index=False)
print(f"\n Final submission file has been saved to: {SUBMISSION_PATH}")

Environment setup is complete.

 ENGINEERING ALL FEATURES
Feature engineering complete in 123.90 seconds.

PHASE 3: TRAINING FINAL ENSEMBLE OF MODELS...
Training final LightGBM model
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2886]	valid_0's rmse: 0.684679
LightGBM trained and saved in 5151.03 seconds.
Training HistGradientBoostingRegressor model


MemoryError: Unable to allocate 15.7 GiB for an array with shape (67500, 31287) and data type float64

In [None]:
import pandas as pd
import numpy as np
import re
import time
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix

# Filesystem layout: everything hangs off BASE_PATH; the CSVs live in a
# "Dataset" subfolder, while the image-feature .npy files and the pickled
# model sit directly under BASE_PATH.
BASE_PATH = r'C:\Users\inb20\OneDrive\Desktop\Amazon Hackathon'
DATA_PATH = BASE_PATH + r'\Dataset'
TRAIN_PATH = DATA_PATH + r'\train.csv'
TEST_PATH = DATA_PATH + r'\test.csv'
TRAIN_IMG_FEATURES_PATH = BASE_PATH + r'\train_image_features.npy'
TEST_IMG_FEATURES_PATH = BASE_PATH + r'\test_image_features.npy'
LGBM_MODEL_PATH = BASE_PATH + r'\lgbm_final_model.pkl'

print("Re-creating feature set")
start_time = time.time()

# Train rows are stacked above test rows so each text feature is computed in
# a single pass; rows [0, len(train_df)) are train, the remainder are test.
train_df, test_df = pd.read_csv(TRAIN_PATH), pd.read_csv(TEST_PATH)
all_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

# Missing catalog text becomes '' so the string operations below never see NaN.
all_df['original_content'] = all_df['catalog_content'].fillna('')
original_text = all_df['original_content']
all_df['clean_content'] = original_text.str.lower()
all_df['text_length'] = original_text.str.len()
all_df['word_count'] = original_text.map(lambda doc: len(doc.split()))
# Share of upper-case characters; the 1e-6 keeps empty strings from dividing by zero.
all_df['capital_ratio'] = original_text.map(
    lambda doc: sum(ch.isupper() for ch in doc) / (len(doc) + 1e-6)
)
def extract_ipq(text):
    """Return the item-pack quantity mentioned in ``text``, or 1 if none is found.

    ``text`` is coerced with ``str()`` and lower-cased, then the patterns below
    are tried in the listed order; the number captured by the first pattern
    that matches anywhere in the text is returned as an ``int``. Pattern order
    takes precedence over position in the text, and the patterns are not
    anchored to word boundaries (so "12 packaging" yields 12).
    """
    lowered = str(text).lower()
    quantity_patterns = (
        r'pack of (\d+)',
        r'(\d+)\s*pack',
        r'(\d+)\s*count',
        r'set of (\d+)',
        r'(\d+)\s*ct',
        r'(\d+)\s*pk',
    )
    # Lazy generator: stop searching as soon as one pattern hits.
    hits = (re.search(pattern, lowered) for pattern in quantity_patterns)
    first_hit = next((hit for hit in hits if hit), None)
    return int(first_hit.group(1)) if first_hit else 1
# Item-pack quantity parsed from the lower-cased text by extract_ipq.
all_df['ipq'] = all_df['clean_content'].map(extract_ipq)

# Keyword groups; each group becomes one 0/1 column named kw_<group>.
keywords = {
    'quality': ['premium', 'organic', 'heavy-duty', 'professional', 'gourmet', 'handmade', 'luxury'],
    'bundling': ['set', 'bundle', 'kit', 'combo', 'pack'],
    'condition': ['refurbished', 'new', 'generic', 'compatible'],
}
for category, words in keywords.items():
    # Plain substring test against the lower-cased text, not a whole-word match.
    all_df[f'kw_{category}'] = all_df['clean_content'].map(
        lambda doc: int(any(word in doc for word in words))
    )

# Uni- and bi-gram TF-IDF fitted on train and test text together, capped at
# 30,000 terms. The vectorizer is re-fitted here rather than loaded from disk.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=30000,
    stop_words='english',
    token_pattern=r'\b[a-zA-Z0-9]+\b',
)
text_features_tfidf = tfidf_vectorizer.fit_transform(all_df['clean_content'])

# Hand-crafted columns are appended to the right of the TF-IDF columns.
additional_features_df = all_df[
    ['text_length', 'word_count', 'capital_ratio', 'ipq', 'kw_quality', 'kw_bundling', 'kw_condition']
]
additional_features_sparse = csr_matrix(additional_features_df.to_numpy())
full_text_features = hstack((text_features_tfidf, additional_features_sparse), format='csr')
# Precomputed image features; assumed to be row-aligned with train.csv and
# test.csv respectively — TODO confirm against the script that produced them.
train_image_features, test_image_features = (
    np.load(TRAIN_IMG_FEATURES_PATH),
    np.load(TEST_IMG_FEATURES_PATH),
)
train_image_sparse, test_image_sparse = map(
    csr_matrix, (train_image_features, test_image_features)
)

# Only the test matrix is assembled in this cell: the text-feature rows after
# the first len(train_df) rows, with the test image columns appended.
test_text_features = full_text_features[len(train_df):]
x_test_final = hstack((test_text_features, test_image_sparse), format='csr')
print(f"Feature set re-created in {time.time() - start_time:.2f} seconds.")
# Load the previously saved LightGBM model, predict on the test matrix and
# write the submission CSV.
print("Loading the pre-trained LightGBM model")
try:
    # NOTE(review): joblib.load unpickles arbitrary Python objects; only point
    # LGBM_MODEL_PATH at a model file you produced yourself.
    lgbm_model = joblib.load(LGBM_MODEL_PATH)
except FileNotFoundError as err:
    print("CRITICAL ERROR: 'lgbm_final_model.pkl' not found. Cannot create submission.")
    # exit() is a helper injected by the `site` module for interactive use and,
    # called without arguments, reports success (status 0). Raising SystemExit
    # directly always works and signals the failure with a non-zero status.
    raise SystemExit(1) from err

print("Making final predictions")
predictions_log = lgbm_model.predict(x_test_final)
# expm1 is the inverse of log1p, which is presumably what the model's target
# was transformed with at training time — TODO confirm against the cell that
# produced the pickle. Anything that lands below zero is floored at 0.
final_predictions = np.expm1(predictions_log)
final_predictions[final_predictions < 0] = 0

submission_df = pd.DataFrame({'sample_id': test_df['sample_id'], 'price': final_predictions})
SUBMISSION_PATH = BASE_PATH + r'\Final_submission.csv'
submission_df.to_csv(SUBMISSION_PATH, index=False)
print(f"Submission file has been saved to: {SUBMISSION_PATH}")

Re-creating feature set
