In [None]:
# Mount Google Drive at /content/drive; the dataset paths used by later cells
# live under this mount point. Requires the Colab runtime (google.colab).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
import matplotlib as plt

In [None]:
# Location of the training CSV on the mounted Drive.
train_path = '/content/drive/MyDrive/aiml_challenge/dataset/train.csv'
# Load the training data; later cells read its 'catalog_content' and 'price' columns.
df = pd.read_csv(train_path)

In [None]:
# Peek at the first row of the freshly loaded training frame.
df.head(1)

In [None]:
# Pull the "Value: <number>" and "Unit: <letters>" fields out of the catalog
# text. expand=False returns the single capture group as a Series.
catalog = df['catalog_content']
df['value'] = catalog.str.extract(r'Value\s*:\s*([0-9.]+)', expand=False)
df['unit'] = catalog.str.extract(r'Unit\s*:\s*([A-Za-z]+)', expand=False)

# Build the 'text' column: the catalog content with the value and unit
# fragments (and an optional trailing comma) stripped out.
df['text'] = (
    catalog
    .str.replace(r'Value\s*:\s*[0-9.]+\s*,?\s*', '', regex=True)
    .str.replace(r'Unit\s*:\s*[A-Za-z]+\s*,?\s*', '', regex=True)
)

In [None]:
# Show the first row again, now including the derived 'value', 'unit' and 'text' columns.
df.head(1)

In [None]:
# df['text'][2]

In [None]:
# List the distinct raw unit strings found in the data (input for building unit_map).
df['unit'].unique()

In [None]:
# x.to_csv('df_with_units.csv', index=False)

In [None]:
# df[df['unit'].isnull()]

In [None]:
# Normalise the raw unit strings: trim whitespace and lower-case them so that
# they can be looked up in unit_map.
df['unit'] = df['unit'].str.strip().str.lower()

# Maps a raw (lower-cased) unit string to a canonical unit label.
# The canonical abbreviations themselves ('g', 'ml', 'l') are included as
# identity entries so that a unit that is already written in canonical form is
# not sent to 'unknown'. Duplicate keys from the earlier literal were removed.
unit_map = {
    # weight
    'g': 'g', 'gram': 'g', 'grams': 'g', 'gramm': 'g', 'gr': 'g',
    'kg': 'kg', 'pound': 'lb', 'pounds': 'lb', 'lb': 'lb', 'lbs': 'lb',

    # volume
    'l': 'l', 'liter': 'l', 'liters': 'l', 'ltr': 'l',
    'ml': 'ml', 'milliliter': 'ml', 'millilitre': 'ml', 'mililitro': 'ml',
    # NOTE: the unit extraction regex only captures a single alphabetic word,
    # so the two-word key 'fluid ounce' cannot currently be produced; it is
    # kept in case the extraction is widened.
    'fl': 'fl_oz', 'fluid': 'fl_oz', 'fluid ounce': 'fl_oz', 'ounce': 'oz', 'ounces': 'oz', 'oz': 'oz', 'ounc': 'oz',

    # countable units
    'count': 'count', 'each': 'count', 'ct': 'count', 'piece': 'count', 'pack': 'pack', 'packs': 'pack',
    'unit': 'count', 'units': 'count', 'product': 'count', 'bottle': 'count', 'bottles': 'count',
    'box': 'count', 'boxes': 'count', 'bag': 'count', 'bags': 'count', 'jar': 'count', 'can': 'count', 'carton': 'count',
    'case': 'count', 'bucket': 'count', 'pouch': 'count', 'ziplock': 'count', 'paper': 'count',

    # area/length (plus a few stray tokens that are treated as plain counts)
    'sq': 'sq_ft', 'foot': 'sq_ft', 'cm': 'cm', 'in': 'inch', 'k': 'count', 'tea': 'count', 'comes': 'count', 'per': 'count'
}

# Anything not in the map (including missing units) becomes 'unknown'.
df['unit_normalized'] = df['unit'].map(unit_map).fillna('unknown')


In [None]:
# Parse the extracted value strings; anything that is not a valid number becomes NaN.
df['value'] = pd.to_numeric(df['value'], errors='coerce')

# Imputation statistics. These are reused when preprocessing the test data,
# so they must keep these names.
value_median = df['value'].median()
unit_median = df['unit'].mode()[0]  # most frequent unit (a mode, despite the name)

# Fill nulls by assigning the result back to the column. Calling
# fillna(inplace=True) on a column selection is chained assignment: it warns on
# recent pandas and does not update df at all under Copy-on-Write.
df['value'] = df['value'].fillna(value_median)
# NOTE: 'unit_normalized' was already derived from 'unit' in an earlier cell,
# so filling 'unit' here does not change 'unit_normalized'.
df['unit'] = df['unit'].fillna(unit_median)

In [None]:
def convert_to_base(quantity, unit):
    """Convert a quantity to its base unit.

    Weights (kg, lb, oz) are expressed in grams and volumes (fl_oz, l) in
    millilitres. Any other unit is returned unchanged.

    Args:
        quantity: Numeric amount expressed in ``unit``.
        unit: Canonical unit label (e.g. 'kg', 'oz', 'count').

    Returns:
        A ``(quantity, unit)`` tuple in the base unit.
    """
    # (source unit, multiplier, base unit)
    conversion_table = (
        ('kg', 1000, 'g'),
        ('lb', 453.592, 'g'),
        ('oz', 28.3495, 'g'),
        ('fl_oz', 29.5735, 'ml'),
        ('l', 1000, 'ml'),
    )
    for source_unit, multiplier, base_unit in conversion_table:
        if unit == source_unit:
            return quantity * multiplier, base_unit
    # No conversion known: pass the pair through untouched.
    return quantity, unit

# Convert the 'value' column to numeric, coercing errors to NaN
df['value'] = pd.to_numeric(df['value'], errors='coerce')

# Convert every (value, normalized unit) pair to its base unit in one pass.
# A plain loop over the two columns avoids building a pandas Series per row,
# which is what made the row-wise DataFrame.apply slow on large frames.
converted_pairs = [
    convert_to_base(quantity, unit)
    for quantity, unit in zip(df['value'], df['unit_normalized'])
]
df['quantity_converted'] = [quantity for quantity, _ in converted_pairs]
df['unit_base'] = [unit for _, unit in converted_pairs]

In [None]:
# Frequency of each base unit after conversion.
df['unit_base'].value_counts()

In [None]:
# Keep only the columns used for modelling. .copy() makes Df an independent
# frame, so adding columns to it later does not raise SettingWithCopyWarning.
Df = df[['text', 'quantity_converted', 'unit_base', 'price']].copy()

In [None]:
# Inspect the first two rows of the modelling frame.
Df.head(2)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the base unit and drop the original string column.
# assign() builds a new frame instead of writing a column into a frame that
# was sliced out of df (which raises SettingWithCopyWarning), and the redundant
# axis=1 next to columns= is removed.
label_encoder = LabelEncoder()
Df = (
    Df
    .assign(unit_base_encoded=label_encoder.fit_transform(Df['unit_base']))
    .drop(columns=['unit_base'])
)
display(Df.head())

In [None]:
# Fetch the NLTK resources used by the text-cleaning cell and load the
# English stop-word list.
import nltk
from nltk.corpus import stopwords

for resource_name in ('stopwords', 'wordnet'):
    nltk.download(resource_name)

stop = stopwords.words('english')

In [None]:
import re, string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Stop words as a set for fast membership tests, and a shared lemmatizer.
stop = set(stopwords.words('english'))
lem = WordNetLemmatizer()

def clean_text(x):
    """Normalise a piece of catalog text.

    Steps, in order: collapse newlines/carriage returns/tabs into a single
    space, strip ASCII punctuation, lower-case, drop stop words, and lemmatize
    the remaining tokens.

    Args:
        x: The text to clean (must be a ``str``).

    Returns:
        The cleaned tokens joined by single spaces.
    """
    flattened = re.sub(r'[\n\r\t]+', ' ', x)
    without_punctuation = flattened.translate(str.maketrans('', '', string.punctuation))
    kept_tokens = []
    for token in without_punctuation.lower().split():
        if token in stop:
            continue  # stop words are filtered before lemmatization
        kept_tokens.append(lem.lemmatize(token))
    return ' '.join(kept_tokens)

# Clean every training text; astype(str) ensures clean_text always receives a str.
Df['text'] = Df['text'].astype(str).apply(clean_text)


In [None]:
# print(df['text'][3])
# print(Df['text'][3])

In [None]:
# import matplotlib.pyplot as plt
# import seaborn as sns

# plt.figure(figsize=(10, 6))
# sns.histplot(Df['price'], bins=50, kde=True, edgecolor='black')
# plt.xlabel('Price')
# plt.ylabel('Frequency')
# plt.title('Distribution of Price with KDE')
# plt.show()

In [None]:
# df.head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Select features
X = Df[['text', 'quantity_converted', 'unit_base_encoded']]
y = Df['price']  # raw price; the model itself is trained on log1p(price)

# Keep the log target available as a column on Df as well.
Df['log_price'] = np.log1p(y)

# Split data (y_test stays on the raw price scale for evaluation)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=42)

# Log-scale training target, derived directly from y_train so it is aligned
# with X_train by construction rather than via an index lookup on Df.
y_train_log = np.log1p(y_train)

# Create a ColumnTransformer to apply different transformations to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(max_features=500000, ngram_range=(1,2),stop_words='english'), 'text'),
        ('numerical', StandardScaler(), ['quantity_converted']),
        # The unit code is a nominal category: one-hot encode it so the linear
        # model does not treat the arbitrary integer labels as an ordered
        # magnitude. Unknown codes at predict time are encoded as all zeros.
        ('unit', OneHotEncoder(handle_unknown='ignore'), ['unit_base_encoded']),
    ],
    remainder='passthrough' # Keep other columns (if any)
)

# Create the model pipeline
model = make_pipeline(
    preprocessor,
    Ridge(alpha=1.0)
)

# Train the model using the log price
model.fit(X_train, y_train_log)



In [None]:
# Predict on the hold-out split. The pipeline was fitted on log1p(price),
# so these predictions are on the log scale.
preds_log = model.predict(X_test)

# Undo the log1p transform (expm1 is its inverse) to get prices on the original scale
preds = np.expm1(preds_log)
print(preds)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the hold-out predictions on the original price scale
# (y_test is the raw price, preds has been mapped back with expm1).
mae, mse, r2 = (
    metric(y_test, preds)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
rmse = np.sqrt(mse)  # RMSE is the square root of MSE

print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'R-squared (R2): {r2}')

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Evaluate the model using original y_test and inverse-transformed predictions.
# NOTE(review): this cell recomputes and prints exactly the same metrics as the
# preceding cell; one of the two can be removed.
mae = mean_absolute_error(y_test, preds)
mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse) # Calculate RMSE by taking the square root of MSE
r2 = r2_score(y_test, preds)

print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'R-squared (R2): {r2}')

In [None]:
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Each element contributes ``|y_pred - y_true| / ((|y_true| + |y_pred|) / 2)``.
    Elements where truth and prediction are both zero have a zero denominator;
    they are counted as a zero error instead of producing NaN.

    Args:
        y_true: Array-like of true values.
        y_pred: Array-like of predicted values, same length as ``y_true``.

    Returns:
        The mean of the per-element terms multiplied by 100.
    """
    # Work on plain float arrays so the comparison is positional (no pandas
    # index alignment) and lists are accepted too.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2

    # Divide only where the denominator is non-zero; the rest stay at 0.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(ratio) * 100

# Calculate SMAPE on the hold-out split, using raw prices (y_test) and
# predictions mapped back to the price scale (preds).
smape_score = smape(y_test, preds)
print(f'SMAPE: {smape_score}')

In [None]:
# Load the test CSV from the same dataset folder on the mounted Drive.
test_df = pd.read_csv('/content/drive/MyDrive/aiml_challenge/dataset/test.csv')

In [None]:
# Apply the same preprocessing steps to the test dataframe that were applied
# to the training dataframe, then predict prices with the fitted pipeline.

# Extract value and unit
test_df['value'] = test_df['catalog_content'].str.extract(r'Value\s*:\s*([0-9.]+)')
test_df['unit'] = test_df['catalog_content'].str.extract(r'Unit\s*:\s*([A-Za-z]+)')

# Remove value and unit parts from the text to create 'text' column
test_df['text'] = test_df['catalog_content'].str.replace(r'Value\s*:\s*[0-9.]+\s*,?\s*', '', regex=True)
test_df['text'] = test_df['text'].str.replace(r'Unit\s*:\s*[A-Za-z]+\s*,?\s*', '', regex=True)

# Standardize unit column
test_df['unit'] = test_df['unit'].str.strip().str.lower()
# Use the same unit_map from the training data
test_df['unit_normalized'] = test_df['unit'].map(unit_map).fillna('unknown')

# Convert the 'value' column to numeric, coercing errors to NaN
test_df['value'] = pd.to_numeric(test_df['value'], errors='coerce')

# Impute null values in 'value' and 'unit' using the median/mode computed on
# the training data (value_median and unit_median must already be defined).
# The result is assigned back: fillna(inplace=True) on a column selection is
# chained assignment, which warns on recent pandas and does nothing under
# Copy-on-Write.
test_df['value'] = test_df['value'].fillna(value_median)
test_df['unit'] = test_df['unit'].fillna(unit_median)

# Convert to base units in a single pass over the two columns (avoids
# constructing a pandas Series for every row).
test_converted_pairs = [
    convert_to_base(quantity, unit)
    for quantity, unit in zip(test_df['value'], test_df['unit_normalized'])
]
test_df['quantity_converted'] = [quantity for quantity, _ in test_converted_pairs]
test_df['unit_base'] = [unit for _, unit in test_converted_pairs]

# Encode unit_base with an encoder fitted on the training units. Fitting on
# df['unit_base'] reproduces the training-time mapping because the classes are
# derived from the same column.
label_encoder = LabelEncoder()
label_encoder.fit(df['unit_base']) # Fit on the unit_base from the training df

# LabelEncoder.transform raises on labels it has not seen. Replace any unit
# that does not occur in the training data with a known fallback first:
# 'unknown' when training has it, otherwise the most frequent training unit.
known_units = set(label_encoder.classes_)
fallback_unit = 'unknown' if 'unknown' in known_units else df['unit_base'].mode()[0]
unit_base_for_encoding = test_df['unit_base'].where(
    test_df['unit_base'].isin(known_units), fallback_unit
)
test_df['unit_base_encoded'] = label_encoder.transform(unit_base_for_encoding)

# Clean the text with the same clean_text function that was applied to the
# training text, so the TF-IDF vocabulary learned on lemmatized training
# tokens matches the tokens produced here.
test_df['text'] = test_df['text'].astype(str).apply(clean_text)

# Select features for prediction
X_test_final = test_df[['text', 'quantity_converted', 'unit_base_encoded']]

# Make predictions using the trained model (which includes preprocessing)
# Predictions will be on the log_price scale because the model was trained on log_price
test_preds_log = model.predict(X_test_final)

# Inverse transform the predictions back to the original price scale
test_preds = np.expm1(test_preds_log)

# Add predictions to the test_df
test_df['predicted_price'] = test_preds

# display(test_df.head())

# display(test_df.head())

In [None]:
# Raw inputs and intermediate feature columns that are dropped from the
# output frame; whatever remains in test_df is kept.
column = [
    'catalog_content', 'image_link', 'value', 'unit', 'text',
    'unit_normalized', 'quantity_converted', 'unit_base',
    'unit_base_encoded',
]
# Work on a copy so test_df itself keeps all of its columns.
testingdf = test_df.copy().drop(columns=column)

In [None]:
# Rename the prediction column to 'price' and preview the result.
testingdf = testingdf.rename(columns={'predicted_price': 'price'})
display(testingdf.head())

In [None]:
# from sentence_transformers import SentenceTransformer
# from sklearn.linear_model import Ridge
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# import numpy as np

# # Select features
# X_text = Df['text'].tolist()
# X_numerical = Df[['quantity_converted', 'unit_base_encoded']]
# y = Df['price']

# # Split data
# X_text_train, X_text_test, X_numerical_train, X_numerical_test, y_train, y_test = train_test_split(
#     X_text, X_numerical, y, test_size=0.35, random_state=42
# )

# # Embed text data
# model_embed = SentenceTransformer('all-MiniLM-L6-v2')
# X_text_train_embed = model_embed.encode(X_text_train, show_progress_bar=True)
# X_text_test_embed = model_embed.encode(X_text_test, show_progress_bar=True)



In [None]:
# # Scale numerical data
# scaler = StandardScaler()
# X_numerical_train_scaled = scaler.fit_transform(X_numerical_train)
# X_numerical_test_scaled = scaler.transform(X_numerical_test)


# # Concatenate text embeddings and numerical features
# X_train_combined = np.concatenate((X_text_train_embed, X_numerical_train_scaled), axis=1)
# X_test_combined = np.concatenate((X_text_test_embed, X_numerical_test_scaled), axis=1)

# # Train the Ridge model
# reg = Ridge().fit(X_train_combined, y_train)

# # Make predictions
# preds = reg.predict(X_test_combined)

In [None]:
# from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
# from datasets import Dataset
# import pandas as pd # Ensure pandas is imported if not already

# tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# # Include numerical features in the dataset
# dataset = Dataset.from_pandas(Df[['text', 'price', 'quantity_converted', 'unit_base_encoded']])

# dataset = dataset.map(lambda x: tokenizer(x['text'], truncation=True, padding='max_length', max_length=256), batched=True)

# dataset = dataset.rename_column("price", "labels")
# # Ensure numerical columns are kept in the format
# dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels', 'quantity_converted', 'unit_base_encoded'])


# model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1)

# # Note: To effectively use 'quantity_converted' and 'unit_base_encoded' with this model
# # and Trainer, you would typically need to modify the model's forward pass
# # to accept and process these numerical features alongside text embeddings.
# # This would involve custom model architecture changes beyond adding columns to the dataset.

# args = TrainingArguments(output_dir='./results', num_train_epochs=3, per_device_train_batch_size=8)
# trainer = Trainer(model=model, args=args, train_dataset=dataset)

# trainer.train()

In [None]:
# y_test