In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_percentage_error
import gc
import os
from tqdm import tqdm
import re


In [None]:
# Checkpoint identifier shared by the tokenizer and the model below.
model_name = "microsoft/deberta-v3-xsmall"
# Tokenizer for the checkpoint; generate_embeddings calls it on each batch of texts.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Model loaded through AutoModel; generate_embeddings reads its `last_hidden_state` output.
model1 = AutoModel.from_pretrained(model_name)

In [None]:
# Select CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model to the selected device; generate_embeddings moves each tokenized batch to the same device.
model1.to(device)

In [None]:
def generate_embeddings(texts, batch_size=1024):
    """Embed texts with the module-level ``tokenizer`` and ``model1``.

    Each batch is tokenized (padded, truncated to 128 tokens), moved to
    ``device`` and run through ``model1`` with gradient tracking disabled.
    The hidden state at sequence position 0 of ``last_hidden_state`` is kept
    as the embedding of each text.

    Args:
        texts: Strings to embed. A list is used as-is, a single string is
            treated as one text, and any other iterable is materialised into
            a list before batching.
        batch_size: Number of texts per forward pass. Must be positive.

    Returns:
        A 2-D ``numpy.ndarray`` with one row per input text. For empty input
        an array with zero rows and ``model1.config.hidden_size`` columns is
        returned instead of raising.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    # A bare string would otherwise be sliced character-wise by the batching loop.
    if isinstance(texts, str):
        texts = [texts]
    elif not isinstance(texts, list):
        # The tokenizer is handed plain list slices (e.g. not a pandas Series).
        texts = list(texts)

    if not texts:
        # np.vstack rejects an empty list, so build the empty result directly.
        # NOTE(review): float32 assumes the model runs in its default precision.
        return np.empty((0, model1.config.hidden_size), dtype=np.float32)

    embeddings = []
    for start in range(0, len(texts), batch_size):
        # Slicing past the end is safe, so no explicit clamp is needed.
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model1(**inputs)
        # Keep only the first token's hidden state for every sequence.
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(batch_embeddings)
        # Drop the device tensors before the next batch to limit peak memory.
        del inputs, outputs
        torch.cuda.empty_cache()
    return np.vstack(embeddings)


In [None]:
def process_data(file_path, text_columns, target_column='PRODUCT_LENGTH'):
    """Load a CSV, build one cleaned text per row and embed it.

    Args:
        file_path: Path of the CSV file to read with ``pandas.read_csv``.
        text_columns: Column name, or list of column names, passed to
            ``preProcess`` to build the text of each row.
        target_column: Name of the column holding the regression target.
            Defaults to ``'PRODUCT_LENGTH'``.

    Returns:
        A tuple ``(embeddings, targets)`` where ``embeddings`` is the array
        returned by ``generate_embeddings`` and ``targets`` is a list with
        the values of ``target_column`` in row order.

    Raises:
        ValueError: If the CSV contains no rows.
        KeyError: If ``target_column`` is not a column of the CSV.
    """
    data = pd.read_csv(file_path)
    if data.empty:
        # A row-wise apply on an empty frame does not return a Series, so the
        # later .tolist() would fail with an unrelated-looking error.
        raise ValueError(f"No rows found in {file_path!r}")

    # Read the targets first so a wrong column name fails before the slow
    # text-cleaning and embedding steps.
    targets = data[target_column].tolist()

    text = data.apply(lambda row: preProcess(text_columns, row), axis=1).tolist()

    # Show one cleaned example as a quick sanity check of the preprocessing.
    print(text[0])

    embeddings = generate_embeddings(text)
    gc.collect()

    return embeddings, targets



In [None]:
def train_and_evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    The score is ``100 * (1 - MAPE)``, so a perfect prediction scores 100 and
    the value can go negative when the mean relative error exceeds 100 %.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training targets.
        y_test: Test targets.

    Returns:
        The score as a float. It is also printed as ``Score: <value>``.
    """
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    # scikit-learn returns MAPE as a fraction (1.0 means 100 %), so it must be
    # scaled to a percentage before being subtracted from 100.
    mape = mean_absolute_percentage_error(y_test, y_pred)
    score = 100 * (1 - mape)

    print(f"Score: {score}")
    return score

In [None]:
def clean_text(text):
    """Normalise one raw text field into plain, whitespace-collapsed text.

    Steps, in order: missing values become ``""``; a value wrapped in
    ``[...]`` has the brackets replaced by sentence separators; ``" |"`` is
    turned into ``"."``; ``<...>`` tags, non-breaking spaces and
    ``http``/``www`` URLs are removed; every character other than ASCII
    letters, digits, ``. , ! ? ' "`` and whitespace is dropped; whitespace
    runs are collapsed to one space and the result is stripped.

    Args:
        text: Scalar value to clean. Missing values (``None``, ``NaN``,
            ``pd.NA``) are accepted, and non-string scalars are converted
            with ``str`` first.

    Returns:
        The cleaned string, possibly empty.
    """
    if pd.isna(text):
        return ""
    # Mixed-type columns can yield numbers; treat them as their text form.
    text = str(text)
    # startswith/endswith are safe on "", unlike indexing text[0] / text[-1].
    if text.startswith("[") and text.endswith("]"):
        text = '. ' + text[1:-1] + '. '
    # Raw string: "\|" is an invalid escape sequence in a normal literal.
    text = re.sub(r' \|', '.', text)
    text = re.sub(r'<.*?>', '', text)
    text = text.replace('\xa0', ' ')
    text = re.sub(r'http\S+|www\S+', '', text)
    # The whitelist below also removes brackets and parentheses, so no
    # separate pass is needed for them.
    text = re.sub(r'[^a-zA-Z0-9.,!?\'"\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [None]:
def preProcess(text_columns, example):
    """Build the cleaned text of one row from one or several columns.

    Args:
        text_columns: A single column key, or a list of column keys whose
            cleaned values are combined in the given order.
        example: Row-like object indexable by column key (for instance a
            pandas Series produced by a row-wise ``DataFrame.apply``).

    Returns:
        The cleaned text. For a list of columns the non-empty cleaned values
        are joined with a single space, so the last word of one column does
        not fuse with the first word of the next.
    """
    if isinstance(text_columns, list):
        cleaned_parts = (clean_text(example[col]) for col in text_columns)
        # Skip empty fields so missing columns do not leave stray spaces.
        return " ".join(part for part in cleaned_parts if part)
    return clean_text(example[text_columns])

In [None]:

# Training CSV location (absolute path inside a Kaggle input directory).
data_file = "/kaggle/input/amazon-product-length-prediction-dataset/dataset/train.csv"
# Embed the cleaned TITLE, BULLET_POINTS and DESCRIPTION text of every row and collect the targets.
embeddings, targets = process_data(data_file,["TITLE","BULLET_POINTS","DESCRIPTION"])

# Hold out 20 % of the rows for evaluation; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(embeddings, targets, test_size=0.2, random_state=42)
# Fit and score three LightGBM configurations on the same split.
# NOTE(review): LightGBM documents `is_unbalance` for binary/multiclassova objectives only, so the
# third regressor is presumably equivalent to the second — confirm before keeping both.
for model in [LGBMRegressor(verbose=-1),LGBMRegressor(extra_trees = True,verbose=-1),LGBMRegressor(verbose=-1,extra_trees=True,is_unbalance=True)]:
    train_and_evaluate_model(model,X_train, X_test, y_train, y_test)


In [None]:
# Persist the embedding matrix and the targets as .npy files in the working directory.
np.save('deberta_embeddings.npy', embeddings)
# `targets` is the list returned by process_data; np.save stores it in array form.
np.save('target.npy', targets)