In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the CSV paths used
# later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
# `plt` has to alias the pyplot submodule: the top-level `matplotlib` package
# does not expose figure()/xlabel()/show(), so plotting through the old
# `import matplotlib as plt` alias would fail with AttributeError.
import matplotlib.pyplot as plt

In [None]:
# Location of the training CSV on the mounted Drive.
train_path = '/content/drive/MyDrive/aiml_challenge/dataset/train.csv'
# Raw training frame; the feature columns derived below are added onto `df`.
df = pd.read_csv(train_path)

In [None]:
# Peek at the first row to see the raw columns as loaded.
df.head(1)

Unnamed: 0,sample_id,catalog_content,image_link,price
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89


In [None]:
# The catalog blob embeds "Value: <number>" and "Unit: <word>" fields. Capture
# each into its own column (still strings at this point; NaN when absent).
catalog_text = df['catalog_content']
df['value'] = catalog_text.str.extract(r'Value\s*:\s*([0-9.]+)', expand=False)
df['unit'] = catalog_text.str.extract(r'Unit\s*:\s*([A-Za-z]+)', expand=False)

# 'text' is the catalog blob with those two fields (and any trailing comma or
# whitespace after them) stripped out.
df['text'] = (
    catalog_text
    .str.replace(r'Value\s*:\s*[0-9.]+\s*,?\s*', '', regex=True)
    .str.replace(r'Unit\s*:\s*[A-Za-z]+\s*,?\s*', '', regex=True)
)

In [None]:
# Check the newly added value / unit / text columns on the first row.
df.head(1)

Unnamed: 0,sample_id,catalog_content,image_link,price,value,unit,text
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89,72.0,Fl,"Item Name: La Victoria Green Taco Sauce Mild, ..."


In [None]:
# df['text'][2]

In [None]:
# List every distinct raw unit token so the normalisation map can cover them.
df['unit'].unique()

array(['Fl', 'Ounce', 'Count', 'ounce', 'None', 'Fluid', 'count', 'oz',
       'Ounces', 'pound', 'fl', 'gram', 'grams', 'COUNT', 'FL', 'lb',
       'Each', 'Liters', 'gramm', 'ct', 'Pound', 'Oz', 'OZ', 'millilitre',
       'Jar', 'ounces', 'bottle', 'Bottle', 'Gram', 'Can', 'Tea', 'each',
       nan, 'Pack', 'Piece', 'fluid', 'gr', 'milliliter', 'mililitro',
       'CT', 'pack', 'pounds', 'kg', 'Bag', 'Case', 'in', 'K', 'sq', 'ml',
       'Packs', 'box', 'Pouch', 'Bucket', 'LB', 'per', 'Per', 'Comes',
       'units', 'CASE', 'packs', 'BOX', 'product', 'Sq', 'Foot', 'Grams',
       'cm', 'Box', 'unit', 'Paper', 'capsule', 'bottles', 'bag', 'case',
       'Pounds', 'Ziplock', 'ltr', 'PACK', 'can', 'Carton'], dtype=object)

In [None]:
# x.to_csv('df_with_units.csv', index=False)

In [None]:
# df[df['unit'].isnull()]

In [None]:
# Canonicalise the raw unit token (trim + lowercase) so that 'OZ', 'Oz' and 'oz'
# all hit the same key in the lookup below.
df['unit'] = df['unit'].str.strip().str.lower()

# Raw unit token -> normalised unit label. Each key appears exactly once
# (a repeated key in a dict literal is silently overwritten by the later entry).
unit_map = {
    # weight ('oz' is treated as a weight ounce by convert_to_base)
    'g': 'g', 'gram': 'g', 'grams': 'g', 'gramm': 'g', 'gr': 'g',
    'kg': 'kg', 'pound': 'lb', 'pounds': 'lb', 'lb': 'lb', 'lbs': 'lb',
    'ounce': 'oz', 'ounces': 'oz', 'oz': 'oz', 'ounc': 'oz',

    # volume ('ml' occurs in the raw data and must map to itself, otherwise
    # millilitre rows end up in the 'unknown' bucket)
    'l': 'l', 'liter': 'l', 'liters': 'l', 'ltr': 'l',
    'ml': 'ml', 'milliliter': 'ml', 'millilitre': 'ml', 'mililitro': 'ml',
    'fl': 'fl_oz', 'fluid': 'fl_oz', 'fluid ounce': 'fl_oz',

    # countable units
    'count': 'count', 'each': 'count', 'ct': 'count', 'piece': 'count', 'pack': 'pack', 'packs': 'pack',
    'unit': 'count', 'units': 'count', 'product': 'count', 'bottle': 'count', 'bottles': 'count',
    'box': 'count', 'boxes': 'count', 'bag': 'count', 'bags': 'count', 'jar': 'count', 'can': 'count', 'carton': 'count',
    'case': 'count', 'bucket': 'count', 'pouch': 'count', 'ziplock': 'count', 'paper': 'count',

    # area/length
    'sq': 'sq_ft', 'foot': 'sq_ft', 'cm': 'cm', 'in': 'inch',

    # stray words picked up by the single-word unit regex; counted as 'count'
    'k': 'count', 'tea': 'count', 'comes': 'count', 'per': 'count',
}

# Tokens without an entry (including missing units) are labelled 'unknown'.
df['unit_normalized'] = df['unit'].map(unit_map).fillna('unknown')


In [None]:
# Turn the extracted value strings into floats; anything unparseable becomes NaN.
df['value'] = pd.to_numeric(df['value'], errors='coerce')

# Imputation statistics from the training data (the test-set cell reuses both).
value_median = df['value'].median()
unit_median = df['unit'].mode()[0]  # despite the name, this is the most frequent unit (mode)

# Assign the filled Series back to the column. The chained form
# `df[col].fillna(..., inplace=True)` operates on an intermediate object: it
# raises a FutureWarning today and no longer updates `df` from pandas 3.0 on.
df['value'] = df['value'].fillna(value_median)
df['unit'] = df['unit'].fillna(unit_median)
# NOTE(review): `unit_normalized` was derived from `unit` before this fill, so
# rows with a missing unit are still 'unknown' there. Confirm whether the unit
# imputation is meant to reach the model features.

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['value'].fillna(value_median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['unit'].fillna(unit_median, inplace=True)


In [None]:
def convert_to_base(quantity, unit):
    """Express a quantity in its base unit (grams or millilitres).

    Args:
        quantity: Numeric amount expressed in `unit`.
        unit: Normalised unit label such as 'kg', 'lb', 'oz', 'fl_oz' or 'l'.

    Returns:
        A `(converted_quantity, base_unit)` tuple. Weight units are converted
        to 'g' and volume units to 'ml'; any other unit is returned unchanged
        together with the original quantity.
    """
    # unit -> (multiplier, base unit)
    to_base = {
        'kg': (1000, 'g'),
        'lb': (453.592, 'g'),
        'oz': (28.3495, 'g'),
        'fl_oz': (29.5735, 'ml'),
        'l': (1000, 'ml'),
    }
    if unit not in to_base:
        return quantity, unit
    factor, base_unit = to_base[unit]
    return quantity * factor, base_unit

# Make sure 'value' is numeric before it is multiplied (a no-op when an earlier
# cell has already coerced the column).
df['value'] = pd.to_numeric(df['value'], errors='coerce')

# Convert every (value, normalised unit) pair in a single pass. Walking the two
# columns directly avoids constructing a throwaway pd.Series for each row,
# which is what makes row-wise `df.apply(..., axis=1)` slow on large frames.
converted = [
    convert_to_base(quantity, unit)
    for quantity, unit in zip(df['value'], df['unit_normalized'])
]
df['quantity_converted'] = [quantity for quantity, _ in converted]
df['unit_base'] = [base_unit for _, base_unit in converted]

In [None]:
# Row count per base unit after conversion.
df['unit_base'].value_counts()

Unnamed: 0_level_0,count
unit_base,Unnamed: 1_level_1
g,44324
count,18342
ml,11356
unknown,951
pack,21
sq_ft,4
inch,1
cm,1


In [None]:
# Keep only the modelling columns. The explicit .copy() makes `Df` an
# independent frame, so adding or overwriting columns on it later neither
# raises SettingWithCopyWarning nor depends on view-vs-copy behaviour.
Df = df[['text', 'quantity_converted', 'unit_base', 'price']].copy()

In [None]:
# Preview the first two rows of the modelling frame.
Df.head(2)

Unnamed: 0,text,quantity_converted,unit_base,price
0,"Item Name: La Victoria Green Taco Sauce Mild, ...",2129.292,ml,4.89
1,"Item Name: Salerno Cookies, The Original Butte...",907.184,g,13.12


In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the string unit label with an integer code. Building a new frame with
# assign()/drop() instead of writing a column into `Df` avoids pandas'
# SettingWithCopyWarning when `Df` was created as a slice of `df`.
label_encoder = LabelEncoder()
Df = (
    Df
    .assign(unit_base_encoded=label_encoder.fit_transform(Df['unit_base']))
    .drop(columns=['unit_base'])  # `columns=` already names the axis; no axis=1 needed
)
display(Df.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Df['unit_base_encoded'] = label_encoder.fit_transform(Df['unit_base'])


Unnamed: 0,text,quantity_converted,price,unit_base_encoded
0,"Item Name: La Victoria Green Taco Sauce Mild, ...",2129.292,4.89,4
1,"Item Name: Salerno Cookies, The Original Butte...",907.184,13.12,2
2,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",323.1843,1.97,2
3,Item Name: Judee’s Blue Cheese Powder 11.25 oz...,318.931875,30.34,2
4,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",12.0,66.49,1


In [None]:
import nltk
from nltk.corpus import stopwords

# Fetch the corpora used by the text-cleaning step: the English stop-word list
# and WordNet (for WordNetLemmatizer). NLTK reports packages that are already
# present as up-to-date rather than downloading them again.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import re, string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop = set(stopwords.words('english'))  # set membership keeps the per-token filter cheap
lem = WordNetLemmatizer()

# Built once here instead of on every clean_text() call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)  # deletes ASCII punctuation
_LINE_BREAK_RUN = re.compile(r'[\n\r\t]+')


def clean_text(x):
    """Normalise one text string before vectorisation.

    Steps, in order: replace runs of newline/carriage-return/tab with a single
    space, delete ASCII punctuation, lowercase, drop tokens found in the
    English stop-word set, and lemmatise the remaining whitespace-separated
    tokens.

    Args:
        x: The text to clean (must be a `str`).

    Returns:
        The cleaned tokens joined by single spaces.
    """
    x = _LINE_BREAK_RUN.sub(' ', x)
    x = x.translate(_PUNCT_TABLE).lower()
    return ' '.join(lem.lemmatize(w) for w in x.split() if w not in stop)


# astype(str) guards against non-string cells (e.g. NaN) before cleaning.
Df['text'] = Df['text'].astype(str).apply(clean_text)


In [None]:
# print(df['text'][3])
# print(Df['text'][3])

In [None]:
# import matplotlib.pyplot as plt
# import seaborn as sns

# plt.figure(figsize=(10, 6))
# sns.histplot(Df['price'], bins=50, kde=True, edgecolor='black')
# plt.xlabel('Price')
# plt.ylabel('Frequency')
# plt.title('Distribution of Price with KDE')
# plt.show()

In [None]:
# df.head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import numpy as np

# Select features
X = Df[['text', 'quantity_converted', 'unit_base_encoded']]
y = Df['price']  # raw price; the model is fitted on log1p(price) further down

# Kept on the frame for inspection; the fit below derives its target from y_train.
Df['log_price'] = np.log1p(y)

# Split data (targets stay on the raw price scale so metrics are in price units)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=42)

# One transformer per input column:
#   text               -> TF-IDF over unigrams and bigrams
#   quantity_converted -> standardised
#   unit_base_encoded  -> one-hot. The integer codes are arbitrary labels of a
#                         nominal category; feeding them to a linear model as a
#                         single number would impose a meaningless ordering.
#                         handle_unknown='ignore' keeps predict() from failing
#                         on a code that was absent from the training split.
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(max_features=500000, ngram_range=(1, 2), stop_words='english'), 'text'),
        ('numerical', StandardScaler(), ['quantity_converted']),
        ('unit', OneHotEncoder(handle_unknown='ignore'), ['unit_base_encoded']),
    ]
)

# Create the model pipeline
model = make_pipeline(
    preprocessor,
    Ridge(alpha=1.0)
)

# Train on log1p(price); predictions must be mapped back with np.expm1.
model.fit(X_train, np.log1p(y_train))



The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [None]:
# Make predictions (on log_price scale)
preds_log = model.predict(X_test)

# Inverse transform the predictions back to the original price scale.
# A linear model can output a log-prediction below 0, which expm1 turns into a
# negative price; clamp at 0 because a price cannot be negative.
preds = np.clip(np.expm1(preds_log), 0, None)
print(preds)

[11.41455899 34.7252609   7.17522666 ... 14.65600587  4.81979372
 13.897163  ]


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the hold-out split on the original price scale: `preds` has already
# been mapped back from log space, so it is comparable with `y_test` directly.
mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse)  # square root of MSE, i.e. back in price units
mae, r2 = mean_absolute_error(y_test, preds), r2_score(y_test, preds)

print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'R-squared (R2): {r2}')

Mean Absolute Error (MAE): 12.588281024163516
Mean Squared Error (MSE): 1046.3112052801491
Root Mean Squared Error (RMSE): 32.34673407440308
R-squared (R2): 0.1964295449325958


In [None]:
# NOTE(review): this cell repeats the preceding evaluation cell (same metrics,
# same inputs, same printed output) apart from the extra numpy import; it looks
# like a copy-paste leftover and can probably be removed.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Evaluate the model using original y_test and inverse-transformed predictions
mae = mean_absolute_error(y_test, preds)
mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse) # Calculate RMSE by taking the square root of MSE
r2 = r2_score(y_test, preds)

print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'R-squared (R2): {r2}')

Mean Absolute Error (MAE): 12.588281024163516
Mean Squared Error (MSE): 1046.3112052801491
Root Mean Squared Error (RMSE): 32.34673407440308
R-squared (R2): 0.1964295449325958


In [None]:
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Each element contributes |y_pred - y_true| / ((|y_true| + |y_pred|) / 2);
    the result is the mean of those terms times 100.

    Args:
        y_true: Array-like of actual values.
        y_pred: Array-like of predicted values, same length as `y_true`.
            Elements are paired by position.

    Returns:
        The SMAPE as a float percentage. A pair where both values are 0 counts
        as a perfect prediction (0) instead of producing NaN through 0/0.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2

    # Divide only where the denominator is non-zero; the remaining slots keep
    # the 0 they were initialised with.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(ratio) * 100

# Calculate SMAPE on the hold-out split, using predictions on the price scale
smape_score = smape(y_test, preds)
print(f'SMAPE: {smape_score}')

SMAPE: 53.81193141573788


In [None]:
# Load the test CSV from the same Drive dataset folder as the training CSV.
test_df = pd.read_csv('/content/drive/MyDrive/aiml_challenge/dataset/test.csv')

In [None]:
# Apply the same preprocessing steps to the test dataframe as were applied to
# the training data, then predict with the fitted pipeline.

# Extract value and unit
test_df['value'] = test_df['catalog_content'].str.extract(r'Value\s*:\s*([0-9.]+)')
test_df['unit'] = test_df['catalog_content'].str.extract(r'Unit\s*:\s*([A-Za-z]+)')

# Remove value and unit parts from the text to create 'text' column
test_df['text'] = test_df['catalog_content'].str.replace(r'Value\s*:\s*[0-9.]+\s*,?\s*', '', regex=True)
test_df['text'] = test_df['text'].str.replace(r'Unit\s*:\s*[A-Za-z]+\s*,?\s*', '', regex=True)

# Standardize unit column, using the same unit_map as the training data
test_df['unit'] = test_df['unit'].str.strip().str.lower()
test_df['unit_normalized'] = test_df['unit'].map(unit_map).fillna('unknown')

# Convert the 'value' column to numeric, coercing errors to NaN
test_df['value'] = pd.to_numeric(test_df['value'], errors='coerce')

# Impute missing 'value' / 'unit' with the statistics computed on the training
# data (value_median, unit_median). Assign the result back: the chained
# `test_df[col].fillna(..., inplace=True)` form works on an intermediate object
# and stops updating the frame from pandas 3.0 on.
test_df['value'] = test_df['value'].fillna(value_median)
test_df['unit'] = test_df['unit'].fillna(unit_median)

# Convert to base units in one pass over the two columns (avoids building a
# pd.Series per row as row-wise DataFrame.apply does).
converted = [
    convert_to_base(quantity, unit)
    for quantity, unit in zip(test_df['value'], test_df['unit_normalized'])
]
test_df['quantity_converted'] = [quantity for quantity, _ in converted]
test_df['unit_base'] = [base_unit for _, base_unit in converted]

# Encode unit_base with an encoder fitted on the training labels so the integer
# codes match the ones the model was trained on. transform() raises if the test
# set contains a label that never occurred in training.
label_encoder = LabelEncoder()
label_encoder.fit(df['unit_base'])
test_df['unit_base_encoded'] = label_encoder.transform(test_df['unit_base'])

# Clean the text with the same clean_text() used on the training text. The
# TF-IDF vocabulary was learned from cleaned (punctuation-stripped, lowercased,
# lemmatised) text, so raw test text would be tokenised differently and miss
# part of that vocabulary.
test_df['text'] = test_df['text'].astype(str).apply(clean_text)

# Select features for prediction
X_test_final = test_df[['text', 'quantity_converted', 'unit_base_encoded']]

# Predictions are on the log1p(price) scale because the model was trained on it
test_preds_log = model.predict(X_test_final)

# Map back to the price scale; clamp at 0 since expm1 of a negative
# log-prediction would give a negative price.
test_preds = np.clip(np.expm1(test_preds_log), 0, None)

# Add predictions to the test_df
test_df['predicted_price'] = test_preds

# display(test_df.head())

In [None]:
# Helper/feature columns that must not appear in the final output frame.
column = [
    'catalog_content', 'image_link', 'value', 'unit', 'text',
    'unit_normalized', 'quantity_converted', 'unit_base',
    'unit_base_encoded',
]
# drop() returns a new frame, so test_df itself is left untouched.
testingdf = test_df.drop(columns=column)

In [None]:
# Rename the prediction column to 'price' (the name the training data uses for
# the target) and preview the result.
testingdf = testingdf.rename(columns={'predicted_price': 'price'})
display(testingdf.head())

In [None]:
# from sentence_transformers import SentenceTransformer
# from sklearn.linear_model import Ridge
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# import numpy as np

# # Select features
# X_text = Df['text'].tolist()
# X_numerical = Df[['quantity_converted', 'unit_base_encoded']]
# y = Df['price']

# # Split data
# X_text_train, X_text_test, X_numerical_train, X_numerical_test, y_train, y_test = train_test_split(
#     X_text, X_numerical, y, test_size=0.35, random_state=42
# )

# # Embed text data
# model_embed = SentenceTransformer('all-MiniLM-L6-v2')
# X_text_train_embed = model_embed.encode(X_text_train, show_progress_bar=True)
# X_text_test_embed = model_embed.encode(X_text_test, show_progress_bar=True)



Batches:   0%|          | 0/1505 [00:00<?, ?it/s]

Batches:   0%|          | 0/811 [00:00<?, ?it/s]

In [None]:
# # Scale numerical data
# scaler = StandardScaler()
# X_numerical_train_scaled = scaler.fit_transform(X_numerical_train)
# X_numerical_test_scaled = scaler.transform(X_numerical_test)


# # Concatenate text embeddings and numerical features
# X_train_combined = np.concatenate((X_text_train_embed, X_numerical_train_scaled), axis=1)
# X_test_combined = np.concatenate((X_text_test_embed, X_numerical_test_scaled), axis=1)

# # Train the Ridge model
# reg = Ridge().fit(X_train_combined, y_train)

# # Make predictions
# preds = reg.predict(X_test_combined)

In [None]:
# from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
# from datasets import Dataset
# import pandas as pd # Ensure pandas is imported if not already

# tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# # Include numerical features in the dataset
# dataset = Dataset.from_pandas(Df[['text', 'price', 'quantity_converted', 'unit_base_encoded']])

# dataset = dataset.map(lambda x: tokenizer(x['text'], truncation=True, padding='max_length', max_length=256), batched=True)

# dataset = dataset.rename_column("price", "labels")
# # Ensure numerical columns are kept in the format
# dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels', 'quantity_converted', 'unit_base_encoded'])


# model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1)

# # Note: To effectively use 'quantity_converted' and 'unit_base_encoded' with this model
# # and Trainer, you would typically need to modify the model's forward pass
# # to accept and process these numerical features alongside text embeddings.
# # This would involve custom model architecture changes beyond adding columns to the dataset.

# args = TrainingArguments(output_dir='./results', num_train_epochs=3, per_device_train_batch_size=8)
# trainer = Trainer(model=model, args=args, train_dataset=dataset)

# trainer.train()

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/74060 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmehul005266[0m ([33mmehul005266-indian-institute-of-technology-kharagpur[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,1207.4127
1000,648.069
1500,738.3758
2000,589.8924
2500,669.4474
3000,755.8745
3500,514.4856
4000,1227.2559
4500,575.659
5000,975.1556


In [None]:
# y_test

Unnamed: 0,price
41740,42.95
71598,5.94
64543,9.99
70735,62.85
50590,5.99
...,...
38888,5.59
39406,22.81
932,3.13
54726,47.50
