In [None]:
import pandas as pd
# Location of the raw dataset CSV; kept in one place so it is easy to repoint.
DATA_PATH = '/content/ecommerce_furniture_dataset_2024.csv'

df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
baseline_df = df.copy()

# Price columns: drop "$" and thousands separators, then parse to float; text
# that cannot be parsed becomes NaN. This mirrors clean_price but is written
# with vectorised pandas calls, because clean_price / clean_sold are only
# defined in a later cell and a fresh "Restart & Run All" would otherwise stop
# here with a NameError.
for price_col in ['price', 'originalPrice']:
    baseline_df[price_col] = pd.to_numeric(
        baseline_df[price_col]
        .astype(str)
        .str.replace(r'[$,]', '', regex=True)
        .str.strip(),
        errors='coerce',
    ).astype('float64')

# Prefer the current price; fall back to the original price when it is missing.
baseline_df['final_price'] = baseline_df['price'].fillna(baseline_df['originalPrice'])

# Basic sold cleaning: strip the "+" suffix, treat missing/unparseable values
# as 0 and truncate to an integer count (same rules as clean_sold).
baseline_df['sold'] = (
    pd.to_numeric(
        baseline_df['sold'].astype(str).str.replace('+', '', regex=False).str.strip(),
        errors='coerce',
    )
    .fillna(0)
    .astype('int64')
)

# Rows without any usable price cannot serve as training targets.
baseline_df = baseline_df.dropna(subset=['final_price']).reset_index(drop=True)

baseline_df.head()


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer

# Title-encoder switch: False -> TF-IDF + truncated SVD, True -> sentence-transformers.
use_sentence_transformers = False  # Set to True to use sentence-transformers if installed

# Features WITHOUT novelty
baseline_features = ['productTitle', 'sold']

Xb = baseline_df[baseline_features]
yb = baseline_df['final_price']

# Text encoder (same logic)
if use_sentence_transformers:
    # This branch needs a loaded model bound to the global name `s_model`, e.g.
    #   from sentence_transformers import SentenceTransformer
    #   s_model = SentenceTransformer('all-MiniLM-L6-v2')
    # Check up front so the failure is a clear message here instead of a
    # NameError raised from inside Pipeline.fit().
    if 's_model' not in globals():
        raise NameError(
            "use_sentence_transformers is True but `s_model` is not defined; "
            "load a SentenceTransformer model into `s_model` before running this cell."
        )

    def embed_text_array(texts):
        """Encode an iterable of strings with the global ``s_model``."""
        return s_model.encode(list(texts), show_progress_bar=False)

    # 'productTitle' is selected below with a scalar column name, so the
    # transformer receives a 1-D column and the function can be passed as is.
    # A named function (unlike a lambda) also keeps the pipeline picklable.
    baseline_title_pipe = Pipeline([
        ('embed', FunctionTransformer(embed_text_array, validate=False))
    ])
else:
    # Bag-of-words TF-IDF reduced to 100 dense components.
    baseline_title_pipe = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=3000)),
        ('svd', TruncatedSVD(n_components=100, random_state=42))
    ])

baseline_preprocess = ColumnTransformer([
    ('title', baseline_title_pipe, 'productTitle'),
    ('num', StandardScaler(), ['sold'])
])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
Xb_train, Xb_test, yb_train, yb_test = train_test_split(Xb, yb, test_size=0.2, random_state=42)

# Baseline preprocessing followed by a random forest with default settings.
baseline_model = Pipeline(steps=[
    ('pre', baseline_preprocess),
    ('rf', RandomForestRegressor(random_state=42)),
])

# fit() returns the fitted pipeline, so training and prediction are chained.
base_preds = baseline_model.fit(Xb_train, yb_train).predict(Xb_test)

baseline_mae = mean_absolute_error(y_true=yb_test, y_pred=base_preds)
baseline_mae

In [None]:
import numpy as np
import re

def clean_price(price_str):
    """Parse a price such as ``"$1,234.50"`` into a float.

    Dollar signs and thousands separators are removed before parsing.
    Missing inputs, and text that ``float`` rejects, yield ``np.nan``.
    """
    if not pd.isna(price_str):
        try:
            stripped = str(price_str).translate(str.maketrans('', '', '$,'))
            return float(stripped)
        except ValueError:
            pass
    return np.nan

def clean_sold(sold_str):
    """Turn a units-sold value such as ``"100+"`` into an integer count.

    Any ``+`` characters are removed and the remainder is parsed as a number
    and truncated to ``int``. Missing or unparseable inputs count as 0.
    """
    if not pd.isna(sold_str):
        try:
            numeric_text = str(sold_str).replace('+', '')
            return int(float(numeric_text))
        except ValueError:
            pass
    return 0

def size_feature(title):
    """Return the first size mentioned in a product title, converted to inches.

    A size is a number (optionally with a decimal part) followed by one of the
    units ``inch``/``inches``, ``ft``, ``cm``, ``mm`` or ``m``. The unit must
    end at a word boundary, so ordinary words that merely start with a unit
    letter ("4 modern chairs", "2 metal stools") are not read as metres.

    Parameters
    ----------
    title : object
        Product title; it is converted with ``str`` and lower-cased first.

    Returns
    -------
    float
        The size in inches, or ``np.nan`` when no size is found.
    """
    title_lower = str(title).lower()
    # Longer alternatives come before their prefixes ("mm" before "m"); the
    # trailing \b rejects matches that continue into a longer word.
    size_match = re.search(
        r'(\d+(?:\.\d+)?)\s*(inch(?:es)?|ft|cm|mm|m)\b', title_lower
    )
    if size_match is None:
        return np.nan

    value = float(size_match.group(1))
    unit = size_match.group(2)

    # Normalize to inches
    if unit == 'ft':
        return value * 12
    if unit == 'cm':
        return value / 2.54
    if unit == 'mm':
        return value / 25.4
    if unit == 'm':
        return value * 39.37
    # Remaining units are "inch" / "inches": already in inches.
    return value

def extract_material(title):
    """Return the first known material whose name occurs in the title.

    Matching is a case-insensitive substring test, tried in the fixed order
    of the list below; ``'other'`` is returned when nothing matches.
    """
    known_materials = ('wood', 'metal', 'fabric', 'leather', 'plastic', 'glass', 'velvet', 'boucle')
    haystack = str(title).lower()
    return next((name for name in known_materials if name in haystack), 'other')

def extract_color(title):
    """Return the first known colour that appears as a whole word in the title.

    Colours are tried in the fixed order of the list below, so when a title
    names several colours the one listed first wins. Matching is
    case-insensitive and anchored on word boundaries: a plain substring test
    would report ``'red'`` for titles containing words such as "upholstered"
    or "mirrored".

    Parameters
    ----------
    title : object
        Product title; it is converted with ``str`` and lower-cased first.

    Returns
    -------
    str
        The matched colour name, or ``'other'`` when none is present.
    """
    title_lower = str(title).lower()
    colors = ['white', 'black', 'grey', 'gray', 'brown', 'blue', 'green', 'red', 'pink', 'yellow']
    for col in colors:
        # \b on both sides: the colour must stand alone as a word.
        if re.search(r'\b' + col + r'\b', title_lower):
            return col
    return 'other'

nov_df = df.copy()

# price cleaning
nov_df['price'] = nov_df['price'].apply(clean_price)
nov_df['originalPrice'] = nov_df['originalPrice'].apply(clean_price)
# Prefer the current price; fall back to the original price when it is missing.
nov_df['final_price'] = nov_df['price'].fillna(nov_df['originalPrice'])

# sold cleaning
nov_df['sold'] = nov_df['sold'].apply(clean_sold).fillna(0)

# size
nov_df['sizeFeat'] = nov_df['productTitle'].apply(size_feature)

# material
nov_df['material'] = nov_df['productTitle'].apply(extract_material)

# color
nov_df['color'] = nov_df['productTitle'].apply(extract_color)

# discount: fraction taken off the original price, 0 when there is no positive
# original price. It is computed from final_price rather than price so that a
# row whose price is missing (final_price then equals originalPrice) gets a
# discount of 0 instead of NaN; such rows are kept by the dropna below.
# NOTE(review): this feature is derived from the same price columns that form
# final_price, the prediction target -- confirm it would really be available at
# prediction time, otherwise it leaks target information into the model.
nov_df['discount_pct'] = np.where(
    nov_df['originalPrice'] > 0,
    (nov_df['originalPrice'] - nov_df['final_price']) / nov_df['originalPrice'],
    0.0
)

# Rows without any usable price cannot serve as training targets.
nov_df = nov_df.dropna(subset=['final_price']).reset_index(drop=True)
nov_df.head()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer

nov_features = ['productTitle','sold','sizeFeat','material','color','discount_pct']
Xn = nov_df[nov_features]
yn = nov_df['final_price']

if use_sentence_transformers:
    # Pass the named function itself: 'productTitle' is selected with a scalar
    # column name, so the transformer receives a 1-D column, and a named
    # function (unlike a lambda) keeps the fitted pipeline picklable for the
    # joblib.dump further down.
    novelty_title_pipe = Pipeline([
        ('embed', FunctionTransformer(embed_text_array, validate=False))
    ])
else:
    # Uni- and bi-gram TF-IDF reduced to 150 dense components.
    novelty_title_pipe = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1,2))),
        ('svd', TruncatedSVD(n_components=150, random_state=42))
    ])

nov_preprocess = ColumnTransformer([
    ('title', novelty_title_pipe, 'productTitle'),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), ['material','color']),
    # sizeFeat is a continuous measurement that is NaN whenever the title has
    # no size. One-hot encoding it would create one column per distinct value
    # and drop sizes unseen in training, so it is median-imputed instead, with
    # an extra indicator column marking the rows where it was missing.
    ('size', SimpleImputer(strategy='median', add_indicator=True), ['sizeFeat']),
    ('num', StandardScaler(), ['sold','discount_pct'])
])

In [None]:
# Same 80/20 split settings as the baseline so both models see matching folds.
Xn_train, Xn_test, yn_train, yn_test = train_test_split(Xn, yn, test_size=0.2, random_state=42)

# Novelty preprocessing followed by a random forest with default settings.
novelty_model = Pipeline(steps=[
    ('pre', nov_preprocess),
    ('rf', RandomForestRegressor(random_state=42)),
])

# fit() returns the fitted pipeline, so training and prediction are chained.
nov_preds = novelty_model.fit(Xn_train, yn_train).predict(Xn_test)

novelty_mae = mean_absolute_error(y_true=yn_test, y_pred=nov_preds)
novelty_mae


In [None]:
# A positive value means the novelty model has the lower mean absolute error.
improvement = baseline_mae - novelty_mae

print("Baseline MAE:", baseline_mae)
print("Novelty MAE :", novelty_mae)
print("\nImprovement:", improvement)


In [None]:
import joblib
import os

# Write the fitted pipeline into the output directory created here, and report
# the path that was actually written so the message always matches the file.
output_dir = "model_outputs"
os.makedirs(output_dir, exist_ok=True)

model_path = os.path.join(output_dir, "furniture_price_model.pkl")

# Replace "novelty_model" with your model variable if different
joblib.dump(novelty_model, model_path)

print("Saved as", model_path)
