In [13]:
import pandas as pd
import numpy as np
import re
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
import string 
print("Libraries imported successfully.")
# NOTE(review): these are absolute, machine-specific locations. Point them at
# your own copy of the dataset before running the notebook elsewhere.
TRAIN_PATH = r'C:\Users\inb20\OneDrive\Desktop\Amazon Hackathon\Dataset\train.csv'
TEST_PATH = r'C:\Users\inb20\OneDrive\Desktop\Amazon Hackathon\Dataset\test.csv'

print("File paths are set.")

print("Loading data...")
start_time = time.time()
# Only the file reads are guarded: they are the only statements expected to
# raise FileNotFoundError.
try:
    train_df = pd.read_csv(TRAIN_PATH)
    test_df = pd.read_csv(TEST_PATH)
except FileNotFoundError as e:
    print(f"ERROR: Could not find the file. Please check the path. Details: {e}")
    # Re-raise instead of calling exit(): inside a notebook exit() ends the
    # kernel session (discarding all state), whereas re-raising stops this
    # cell, halts "Run All", and keeps the traceback visible.
    raise

# Stack train on top of test so every text feature is computed in one pass;
# the first len(train_df) rows of all_df are the training rows.
all_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)
# Missing descriptions become empty strings so later string operations never
# encounter NaN.
all_df['original_content'] = all_df['catalog_content'].fillna('')

print(f"Data loaded in {time.time() - start_time:.2f} seconds.")

print("Starting text feature engineering")
start_time = time.time()
print("Cleaning text for TF-IDF...")
# Bind the untouched text column once; every metadata feature below reads it.
raw_text = all_df['original_content']
all_df['clean_content'] = raw_text.str.lower()


def _count_words(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(text.split())


def _uppercase_share(text):
    """Return the fraction of characters in ``text`` that are upper-case.

    A small epsilon in the denominator keeps empty strings from dividing by
    zero (they yield 0.0).
    """
    upper_total = sum(ch.isupper() for ch in text)
    return upper_total / (len(text) + 1e-6)


print("Engineering metadata features (length, words, caps)...")
# These are computed on the original (not lower-cased) text so that the
# capitalisation signal is preserved.
all_df['text_length'] = raw_text.str.len()
all_df['word_count'] = raw_text.apply(_count_words)
all_df['capital_ratio'] = raw_text.apply(_uppercase_share)

print("Engineering IPQ feature...")
# Quantity patterns, tried in priority order; the first one that matches wins.
# Compiled once here rather than on every call.
# - "[\s\-]*" lets hyphenated forms such as "6-pack" / "12-count" match.
# - The trailing "\b" on the short unit words stops them from matching inside
#   longer words (e.g. "30 countertop", "2 ctrl").
# - "pack" deliberately has no trailing boundary so "packs", "packages" and
#   "packets" still count.
_IPQ_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'pack of (\d+)',
        r'(\d+)[\s\-]*pack',
        r'(\d+)[\s\-]*counts?\b',
        r'set of (\d+)',
        r'(\d+)[\s\-]*cts?\b',
        r'(\d+)[\s\-]*pk(?:gs?|s)?\b',
    )
)


def extract_ipq(text):
    """Extract the item-pack quantity (IPQ) mentioned in a product description.

    Parameters
    ----------
    text : Any
        Product description. It is converted with ``str()`` and lower-cased
        before matching, so non-string values (e.g. ``None``) are accepted.

    Returns
    -------
    int
        The number captured by the first matching quantity pattern
        ("pack of N", "N pack", "N count", "set of N", "N ct", "N pk"),
        or ``1`` when no pattern matches.
    """
    text = str(text).lower()
    for compiled in _IPQ_PATTERNS:
        match = compiled.search(text)
        if match:
            return int(match.group(1))
    # No explicit quantity found: assume a single item.
    return 1
# Run the IPQ extractor over every lower-cased description.
all_df['ipq'] = all_df['clean_content'].apply(extract_ipq)

print("Engineering keyword features...")
# Keyword groups; each group becomes one binary column named kw_<group>.
keywords = {
    'quality': ['premium', 'organic', 'heavy-duty', 'professional', 'gourmet', 'handmade', 'luxury'],
    'bundling': ['set', 'bundle', 'kit', 'combo', 'pack'],
    'condition': ['refurbished', 'new', 'generic', 'compatible']
}

for category, words in keywords.items():
    # Match whole words only (with an optional plural "s"). Plain substring
    # tests flag unrelated text: "kit" in "kitchen", "set" in "settings",
    # "new" in "renewed". re.escape keeps "heavy-duty" literal.
    kw_pattern = r'\b(?:' + '|'.join(re.escape(word) for word in words) + r')s?\b'
    # 1 if any keyword of the group occurs in the description, else 0.
    all_df[f'kw_{category}'] = (
        all_df['clean_content'].str.contains(kw_pattern, regex=True).astype('int64')
    )

print("Generating core TF-IDF features...")
# Unigrams and bigrams over alphanumeric tokens, English stop words removed,
# vocabulary capped at 30000 terms.
tfidf_vectorizer = TfidfVectorizer(
    token_pattern=r'\b[a-zA-Z0-9]+\b',
    stop_words='english',
    max_features=30000,
    ngram_range=(1, 2),
)
corpus = all_df['clean_content']
text_features_tfidf = tfidf_vectorizer.fit_transform(corpus)

print("Combining all engineered features...")
# Hand-crafted numeric columns that are appended to the right of the TF-IDF
# matrix, in this order.
extra_feature_columns = [
    'text_length',
    'word_count',
    'capital_ratio',
    'ipq',
    'kw_quality',
    'kw_bundling',
    'kw_condition',
]
additional_features_df = all_df[extra_feature_columns]

additional_features_sparse = csr_matrix(additional_features_df.to_numpy())
x_full = hstack(
    (text_features_tfidf, additional_features_sparse),
    format='csr',
)

print("All features have been successfully combined.")
# all_df was built as train rows followed by test rows, so a positional split
# at len(train_df) recovers the two sets.
n_train_rows = len(train_df)
x_train, x_test = x_full[:n_train_rows], x_full[n_train_rows:]
# The target is modelled as log(1 + price).
y_train = np.log1p(train_df['price'])

elapsed_seconds = time.time() - start_time
print(f"Pre-processing complete in {elapsed_seconds:.2f} seconds.")
print(f"Final training features shape: {x_train.shape}")
print(f"Final testing features shape: {x_test.shape}")
print("\nOutput variables are now ready for model training: x_train, y_train, x_test")

Libraries imported successfully.
File paths are set.
Loading data...
Data loaded in 11.14 seconds.
Starting text feature engineering
Cleaning text for TF-IDF...
Engineering metadata features (length, words, caps)...
Engineering IPQ feature...
Engineering keyword features...
Generating core TF-IDF features...
Combining all engineered features...
All features have been successfully combined.
Pre-processing complete in 61.14 seconds.
Final training features shape: (75000, 30007)
Final testing features shape: (75000, 30007)

Output variables are now ready for model training: x_train, y_train, x_test


In [None]:
import lightgbm as lgb
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import joblib 

print("Libraries imported.")
print("\n Creating validation set and defining parameters...")
# Hold out 10% of the training rows; the held-out part is the evaluation set
# that early stopping watches.
split_parts = train_test_split(x_train, y_train, test_size=0.1, random_state=42)
x_train_part, x_val, y_train_part, y_val = split_parts

# LightGBM hyper-parameters: L1 (absolute-error) objective, RMSE reported as
# the evaluation metric, and a small learning rate paired with a high tree cap.
lgb_params = dict(
    objective='regression_l1',
    metric='rmse',
    n_estimators=15000,
    learning_rate=0.01,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    lambda_l1=0.1,
    lambda_l2=0.1,
    num_leaves=40,
    verbose=-1,
    n_jobs=-1,
    seed=42,
    boosting_type='gbdt',
)

print("\n Training the LightGBM model...")
start_time = time.time()
# Stop once the validation score has not improved for 50 consecutive rounds.
early_stopping_callback = lgb.early_stopping(stopping_rounds=50, verbose=True)
model = lgb.LGBMRegressor(**lgb_params)

fit_options = {
    'eval_set': [(x_val, y_val)],
    'eval_metric': 'rmse',
    'callbacks': [early_stopping_callback],
}
model.fit(x_train_part, y_train_part, **fit_options)
training_seconds = time.time() - start_time
print(f"Model training complete in {training_seconds:.2f} seconds.")

print("\n Saving model and processed data to disk...")
joblib.dump(model, 'lightgbm_model.pkl')
print("Model saved to 'lightgbm_model.pkl'")
# Persist the test features and the raw test frame so that prediction can be
# run later from the saved files.
for artifact, filename in ((x_test, 'x_test_processed.pkl'), (test_df, 'test_df.pkl')):
    joblib.dump(artifact, filename)
print("Processed test data saved to 'x_test_processed.pkl' and 'test_df.pkl'")
print("\n Training and saving process complete.")

Libraries imported.

 Creating validation set and defining parameters...

 Training the LightGBM model...
Training until validation scores don't improve for 50 rounds


KeyboardInterrupt: 

In [None]:
import joblib
import pandas as pd
import numpy as np
import time

print("Libraries imported.")
print("\n Loading pre-trained model and processed data...")
start_time = time.time()
try:
    # NOTE: joblib.load unpickles its input, which can execute arbitrary code.
    # Only load files written by this notebook's training cell.
    model = joblib.load('lightgbm_model.pkl')
    x_test = joblib.load('x_test_processed.pkl')
    test_df = joblib.load('test_df.pkl')
    print(f"Files loaded successfully in {time.time() - start_time:.2f} seconds.")
except FileNotFoundError:
    print(" ERROR: .pkl files not found. Please run the training script first.")
    # Re-raise instead of calling exit(): inside a notebook exit() ends the
    # kernel session, whereas re-raising stops this cell, halts "Run All",
    # and keeps the traceback visible.
    raise

print("\n Generating predictions...")
start_time = time.time()
# The model was trained on log1p(price), so predictions are mapped back to
# price space with expm1.
predictions_log = model.predict(x_test)
predictions = np.expm1(predictions_log)
# expm1 goes negative for negative log-space predictions; floor those at 0.
predictions[predictions < 0] = 0
print(f"Predictions generated in {time.time() - start_time:.2f} seconds.")

print("\n Creating the submission file...")
submission_df = pd.DataFrame({
    'sample_id': test_df['sample_id'],
    'price': predictions
})
submission_df.to_csv('submission.csv', index=False)

print("\n All done! Submission file saved as 'submission.csv'")

Libraries imported.

 Loading pre-trained model and processed data...
Files loaded successfully in 2.61 seconds.

 Generating predictions...




Predictions generated in 63.10 seconds.

 Creating the submission file...

 All done! Submission file saved as 'submission.csv'
