In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found under it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Use the %pip magic rather than a !pip shell call so the package is installed into the
# environment of the running kernel; -q keeps the installer log out of the notebook.
%pip install -q transformers



In [4]:
import pandas as pd
import re

# Read the source CSV from the Kaggle input mount into `df` (update `file_path` if the
# dataset is attached under a different name). The rest of this cell reads the
# 'text_img' and 'entity_name' columns from it.
file_path = '/kaggle/input/datasetfinal/final_data.csv'
df = pd.read_csv(file_path)

# Allowed units per entity type (as provided). Entities that share a unit
# vocabulary are built from one definition; every entity still receives its
# own independent set object.
_length_units = {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'}
_weight_units = {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'}

entity_unit_map = {
    **{entity: set(_length_units) for entity in ('width', 'depth', 'height')},
    **{entity: set(_weight_units) for entity in ('item_weight', 'maximum_weight_recommendation')},
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {
        'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre',
        'fluid ounce', 'gallon', 'imperial gallon', 'litre', 'microlitre',
        'millilitre', 'pint', 'quart',
    },
}

# Define a function to extract values based on entity_name

# Unit vocabularies, written once per kind of quantity. Within a tuple the
# order is the order in which the alternatives are tried at a given position.
_LENGTH_UNITS = ('centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard')
_WEIGHT_UNITS = ('gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton')
_ENTITY_UNITS = {
    'width': _LENGTH_UNITS,
    'depth': _LENGTH_UNITS,
    'height': _LENGTH_UNITS,
    'item_weight': _WEIGHT_UNITS,
    'maximum_weight_recommendation': _WEIGHT_UNITS,
    'voltage': ('kilovolt', 'millivolt', 'volt'),
    'wattage': ('kilowatt', 'watt'),
    'item_volume': (
        'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce',
        'gallon', 'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart',
    ),
}

# One compiled regex per entity, built once when this cell runs instead of on
# every call (the function is applied to every row of the dataset).
# Group 1 is the number (integer or decimal), group 2 is the unit word.
_ENTITY_PATTERNS = {
    entity: re.compile(r"(\d+(?:\.\d+)?)\s*(" + "|".join(units) + ")")
    for entity, units in _ENTITY_UNITS.items()
}


def extract_entity(text, entity_name):
    """Extract every "<number> <unit>" mention relevant to `entity_name` from `text`.

    The text is lower-cased before matching, so the returned unit words are
    always lower-case. A number is a run of digits with an optional decimal
    part; it may be separated from the unit by any amount of whitespace,
    including none.

    Args:
        text: The raw text to search. Anything that is not a `str`
            (e.g. NaN or None) yields an empty result.
        entity_name: One of the supported entity names ('width', 'depth',
            'height', 'item_weight', 'maximum_weight_recommendation',
            'voltage', 'wattage', 'item_volume').

    Returns:
        All matches in order of appearance, each formatted as
        "<number> <unit>" and joined by single spaces; "" when the entity is
        unknown, the text is not a string, or nothing matches.

    Note:
        Only full unit words are recognised (abbreviations such as "kg" or
        "cm" are not), and the unit is not required to end at a word
        boundary, so "5 grams" yields "5 gram".
    """
    pattern = _ENTITY_PATTERNS.get(entity_name)
    if pattern is None or not isinstance(text, str):
        return ""

    # Joining an empty sequence gives "", which covers the no-match case.
    return " ".join(
        f"{match.group(1)} {match.group(2)}"
        for match in pattern.finditer(text.lower())
    )

# Run the extractor once per row, pairing each row's text with its entity name,
# and store the result in a new 'cleaned_entity_value' column.
df['cleaned_entity_value'] = [
    extract_entity(raw_text, entity)
    for raw_text, entity in zip(df['text_img'], df['entity_name'])
]

# Optional sanity check: show the first ten rows next to what was extracted.
print("Examples of extracted values:")
preview_columns = ['text_img', 'entity_name', 'cleaned_entity_value']
print(df[preview_columns].head(10))

# Keep only the rows for which something was extracted.
has_value = df['cleaned_entity_value'].ne("")
df_cleaned = df[has_value]

# Persist the filtered rows (without the pandas index) for the training cells.
cleaned_file_path = 'cleaned_full_dataset.csv'
df_cleaned.to_csv(cleaned_file_path, index=False)

print(f"Cleaned data saved to: {cleaned_file_path}")

Examples of extracted values:
                                            text_img  entity_name  \
0  8 388 G8a 7 3= BltS  S3? F 222~ RZeS 783 Lese ...  item_weight   
1  ~Zl 338 At 333522 283322- Lnili 22282585 {E2EZ...  item_weight   
2  Kn= @nn 3337 ERE 1n= Mnmm 773 523 LV LD 5ee4 0...  item_weight   
3  =serIES} MiNp' BRANDJI ZUR Aoes 3+ EICLUSIVE M...  item_weight   
4  Luminous Engraving Here Weight: About 4.3 g Ma...  item_weight   
5                                  6 8 Ga 6 Mh 5 6 1      wattage   
6  ECORCE DE SAULE BLANC BIO UN PRODUIT DE QUALIT...  item_volume   
7  S88 1382 PIU DI 20 SUONI & FRASIL PARLA IN ITA...  item_weight   
8  SCHWER UND STABIL GENUG Keramik 1.37 kg Andere...  item_weight   
9  DIMENSION { { 5 50.5cm/19.8inch 36-L1cm/1L-16i...      voltage   

  cleaned_entity_value  
0                       
1                       
2                       
3                       
4                       
5                       
6                       
7         

In [23]:
import os
from transformers import BertTokenizer, BertForTokenClassification, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import pandas as pd
from sklearn.model_selection import train_test_split

In [24]:

# Switch off Weights & Biases through its environment variable.
os.environ["WANDB_DISABLED"] = "true"

# Entity names in a fixed order; an entity's position in this list is its class id.
entity_labels = [
    'width',
    'depth',
    'height',
    'item_weight',
    'maximum_weight_recommendation',
    'voltage',
    'wattage',
    'item_volume',
]
label_to_id = dict(zip(entity_labels, range(len(entity_labels))))

In [25]:
# Load the tokenizer published with the 'bert-base-uncased' checkpoint. The model is
# later created from the same checkpoint name, and this module-level `tokenizer` is
# what the tokenization and prediction functions in this notebook call.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_and_align_labels(examples, max_length=128):
    """Tokenize a batch of texts and attach one label per token position.

    Intended for `Dataset.map(..., batched=True)`: `examples` is a batch in
    which 'text_img' is a list of strings and 'entity_name' is a list of
    entity names of the same length.

    Every text is truncated / padded to exactly `max_length` tokens. Each
    position of a sequence receives the class id of that row's entity name
    (looked up in the module-level `label_to_id`); rows whose entity name is
    not in `label_to_id` receive -100 at every position.

    Args:
        examples: Batch mapping with 'text_img' and 'entity_name' entries.
        max_length: Fixed sequence length used for both the tokenizer and the
            label rows. Defaults to 128.

    Returns:
        The tokenizer output for the batch with an added 'labels' entry
        holding one list of `max_length` integers per example.

    NOTE(review): the entity id is written to *every* position, including
    padding positions, so -100 only ever appears for rows with an unknown
    entity name. The evaluation cell of this notebook flattens labels and
    predictions under that assumption, so changing it here requires changing
    the evaluation as well.
    """
    tokenized_inputs = tokenizer(
        examples['text_img'],
        truncation=True,
        padding='max_length',
        max_length=max_length,
    )

    labels = []
    for input_ids, entity in zip(tokenized_inputs['input_ids'], examples['entity_name']):
        # -100 marks the whole row as ignored when the entity name is unknown.
        fill_value = label_to_id.get(entity, -100)
        row = [fill_value] * len(input_ids)
        # Defensive: top up with -100 if a row ever comes back shorter than max_length.
        row += [-100] * (max_length - len(row))
        labels.append(row)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs



In [26]:

# Load the cleaned CSV written by the extraction cell as a Hugging Face dataset
# (single split named 'data'), then convert that split to a pandas DataFrame with
# the library's own converter instead of materialising it row by row.
dataset = load_dataset('csv', data_files={'data': 'cleaned_full_dataset.csv'})
df = dataset['data'].to_pandas()


In [27]:
# Hold out 10% of the rows for validation; the fixed random_state keeps the split
# identical between runs.
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Convert both parts back to Hugging Face Datasets. After the shuffled split the
# pandas index is just the original row position, so it is dropped rather than
# carried into the datasets as an extra '__index_level_0__' column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)

In [28]:
# Tokenize the training split first, then the validation split, with identical
# settings; the raw text and entity-name columns are dropped from the result.
tokenized_train, tokenized_val = (
    split.map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=['text_img', 'entity_name'],
    )
    for split in (train_dataset, val_dataset)
)


Map:   0%|          | 0/20661 [00:00<?, ? examples/s]

Map:   0%|          | 0/2296 [00:00<?, ? examples/s]

In [29]:
# Create a BertForTokenClassification model from the 'bert-base-uncased' checkpoint
# with one output class per entry in `entity_labels`. The classifier weights are not
# part of the checkpoint and start freshly initialised (see the message printed below).
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(entity_labels))


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',              # where checkpoints are written
    evaluation_strategy="epoch",         # evaluate on the eval dataset after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,                    # log the training loss every 10 steps
    # The string "none" switches off all logging integrations (W&B included).
    # Passing the Python value None does NOT disable them: it means "use the default".
    report_to="none"
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [31]:
# Bundle the model, the training configuration and both tokenized splits into a Trainer.
# No metric function is supplied here; the per-epoch table printed by training shows
# training and validation loss only.
trainer = Trainer(
    model=model,                         # BertForTokenClassification instance created above
    args=training_args,                  # TrainingArguments defined in the previous cell
    train_dataset=tokenized_train,       # tokenized 90% split
    eval_dataset=tokenized_val,          # tokenized 10% split, evaluated once per epoch
)

In [32]:
# Run the training loop configured on `trainer`. As the last expression of the cell,
# the value returned by train() is what the notebook displays underneath.
trainer.train()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss
1,0.9836,0.897863
2,0.9162,0.883018
3,0.9314,0.878599


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


TrainOutput(global_step=1938, training_loss=0.9300589079212233, metrics={'train_runtime': 951.6953, 'train_samples_per_second': 65.129, 'train_steps_per_second': 2.036, 'total_flos': 4049208858175488.0, 'train_loss': 0.9300589079212233, 'epoch': 3.0})

In [35]:
from sklearn.metrics import classification_report
import numpy as np

# Run the model on the validation split and take the highest-scoring class per position.
predictions = trainer.predict(tokenized_val)
pred_labels = np.argmax(predictions.predictions, axis=-1)

# Score only the positions whose gold label is not the -100 ignore value, and apply
# that *same* mask to the predictions. (Predicted class ids are never -100, so
# filtering the predictions by their own value would not drop anything and the two
# lists would fall out of step as soon as any gold label is -100.)
true_labels = np.array(tokenized_val['labels'])
scored_positions = true_labels != -100
flat_true_labels = true_labels[scored_positions].tolist()
flat_pred_labels = pred_labels[scored_positions].tolist()

# Print classification report. `labels` pins the class ids to the order of
# `entity_labels`, so the report still works when a class is missing from the
# validation split; zero_division=0 reports 0 for classes that are never predicted.
print(classification_report(
    flat_true_labels,
    flat_pred_labels,
    labels=list(range(len(entity_labels))),
    target_names=entity_labels,
    zero_division=0,
))


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


                               precision    recall  f1-score   support

                        width       0.37      0.17      0.24     84096
                        depth       0.46      0.62      0.53     84224
                       height       0.43      0.52      0.47     71808
                  item_weight       0.96      0.99      0.97     39424
maximum_weight_recommendation       0.72      0.29      0.42      1280
                      voltage       1.00      0.95      0.97      4864
                      wattage       0.94      1.00      0.97      3968
                  item_volume       0.98      0.93      0.95      4224

                     accuracy                           0.53    293888
                    macro avg       0.73      0.68      0.69    293888
                 weighted avg       0.52      0.53      0.51    293888



In [36]:
# Write the trained model and its tokenizer into the same ./final_model directory.
# The tuple displayed below is the return value of the tokenizer call (the cell's
# last expression).
model.save_pretrained('./final_model')
tokenizer.save_pretrained('./final_model')


('./final_model/tokenizer_config.json',
 './final_model/special_tokens_map.json',
 './final_model/vocab.txt',
 './final_model/added_tokens.json')

In [40]:
# Load the test dataset from the Kaggle input mount.
test_df = pd.read_csv('/kaggle/input/datasetfinal/test_data.csv')

# Preview the first rows; as the cell's last expression the frame is rendered below.
# The preview shows that the file carries its own 'index' column next to
# 'entity_name' and 'text_img'.
test_df.head()


Unnamed: 0,index,image_link,group_id,entity_name,text_img
0,0,https://m.media-amazon.com/images/I/110EibNycl...,156839,height,3rcn 51 44mui eetcm
1,1,https://m.media-amazon.com/images/I/11TU2clswz...,792578,width,"Size Width Length One Size 42cm/16.54"" 200cm/7..."
2,2,https://m.media-amazon.com/images/I/11TU2clswz...,792578,height,"Size Width Length One Size 42cm/16.54"" 200cm/7..."
3,3,https://m.media-amazon.com/images/I/11TU2clswz...,792578,depth,"Size Width Length One Size 42cm/16.54"" 200cm/7..."
4,4,https://m.media-amazon.com/images/I/11gHj8dhhr...,792578,depth,"Size Width Length One Size 10.50cm/4.13"" 90cm/..."


In [46]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForTokenClassification
import numpy as np

# Define the prediction function
def predict(text):
    """Return the predicted class id for every token position of one text.

    Uses the module-level `model` and `tokenizer`. The model is moved to the
    GPU when one is available (otherwise the CPU) and put into evaluation
    mode, so the result does not depend on which mode an earlier cell left
    the model in.

    Args:
        text: The text to classify. Non-string values are converted with
            `str()` first.

    Returns:
        A 1-D NumPy array of class ids, one per position of the tokenized
        input (truncated to at most 128 tokens). The array covers every
        position the tokenizer produced; nothing is filtered out here.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()  # inference only: make sure training-time behaviour such as dropout is off

    # Ensure text is a string
    if not isinstance(text, str):
        text = str(text)

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # No gradients are needed for a forward pass used only for prediction.
    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    predictions = np.argmax(logits.cpu().numpy(), axis=-1)

    # The batch holds exactly one sequence; return its predictions.
    return predictions[0]

# Define the formatting function
def format_predictions(preds):
    """Render a sequence of class ids as a space-separated "<number> <name>" string.

    For each id in `preds`, one "<number> <name>" pair is produced, where
    <name> is the entity name registered for that id (empty for ids outside
    0-7) and <number> is formatted with two decimals. Pairs are joined with
    single spaces; an empty `preds` gives "".

    FIXME(review): <number> is a fresh `np.random.uniform(1, 10)` draw for
    every id. It is not derived from the model output or from the input
    text, and the global NumPy RNG is not seeded here, so the result differs
    from run to run. <name> is an entity name (e.g. 'item_weight'), not a
    measurement unit. Producing real values needs more than `preds` as input.
    """
    id_to_name = dict(enumerate((
        'width',
        'depth',
        'height',
        'item_weight',
        'maximum_weight_recommendation',
        'voltage',
        'wattage',
        'item_volume',
    )))

    return ' '.join(
        f"{np.random.uniform(1, 10):.2f} {id_to_name.get(class_id, '')}"
        for class_id in preds
    )

# Load test data
test_df = pd.read_csv('/kaggle/input/datasetfinal/test_data.csv')

# Ensure all entries in 'text_img' are strings (missing values become the text "nan")
test_df['text_img'] = test_df['text_img'].astype(str)

# Make predictions, one text at a time
predictions = [format_predictions(predict(text)) for text in test_df['text_img']]

# Prepare output DataFrame. The test file carries its own 'index' column; use it as
# the row identifier so the output refers to the ids given in the file rather than to
# the position of the row in the DataFrame. Fall back to the positional index only
# when that column is absent.
output_df = pd.DataFrame({
    'index': test_df['index'] if 'index' in test_df.columns else test_df.index,
    'prediction': predictions
})

# Save the output DataFrame to CSV
output_df.to_csv('test_out.csv', index=False)


In [47]:
import shutil
# Move the prediction file to /kaggle/working/. The boilerplate comment in the first
# cell describes /kaggle/working/ as the current directory; if that holds, source and
# destination are the same file — TODO confirm whether this step is still needed.
shutil.move('test_out.csv', '/kaggle/working/test_out.csv')


'/kaggle/working/test_out.csv'