In [32]:
import pandas as pd
import os
import re
import torch
from torchvision import models, transforms
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import requests
import io
import numpy as np


In [33]:
# Load both splits in one pass.
# Per the original notes: train.csv has the columns index, image_link,
# group_id, entity_name and entity_value (the target); test.csv has the same
# columns without entity_value, which is what we have to predict.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))


In [36]:
# Image preprocessing applied to every downloaded picture before it is fed to
# the network: fixed 224x224 size, conversion to a tensor, then per-channel
# normalisation.
# NOTE(review): these are the usual ImageNet channel statistics that
# torchvision's pretrained models are documented to expect - confirm if the
# backbone is ever swapped.
_CHANNEL_MEAN = [0.485, 0.456, 0.406]
_CHANNEL_STD = [0.229, 0.224, 0.225]

_preprocess_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=_CHANNEL_MEAN, std=_CHANNEL_STD),
]
preprocess = transforms.Compose(_preprocess_steps)


# Download one image and convert it into a tensor the model can consume.
def download_and_preprocess_image(image_url, timeout=30):
    """Fetch ``image_url`` and return it as a preprocessed batch-of-one tensor.

    Parameters
    ----------
    image_url : str
        Location of the image to download.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        single unresponsive host would block the whole ``.apply`` forever.

    Returns
    -------
    torch.Tensor or None
        ``preprocess(image).unsqueeze(0)`` for the RGB-converted image, or
        ``None`` when the download or decoding fails (the error is printed).
    """
    try:
        response = requests.get(image_url, timeout=timeout)
        # Reject 4xx/5xx responses explicitly instead of handing an error
        # page to PIL and getting an unrelated decoding error.
        response.raise_for_status()
        # The context manager releases the underlying file handle; convert()
        # returns an independent image that stays valid afterwards.
        with Image.open(io.BytesIO(response.content)) as raw_image:
            image = raw_image.convert('RGB')
        # unsqueeze(0) adds the leading batch dimension the model expects.
        image_tensor = preprocess(image).unsqueeze(0)
        return image_tensor
    except Exception as e:
        # Best effort by design: one bad link must not abort the whole run.
        print(f"Error processing image {image_url}: {e}")
        return None

# Download and preprocess every training image. The result is stored in a new
# 'image_tensor' column; links that could not be processed yield None.
train_image_links = train_df['image_link']
train_df['image_tensor'] = train_image_links.apply(download_and_preprocess_image)


In [37]:
# Pretrained ResNet-50 from torchvision, put straight into evaluation mode
# (eval() returns the module itself, so the call can be chained).
# NOTE(review): newer torchvision releases deprecate `pretrained=` in favour
# of `weights=` - confirm the installed version before switching.
model = models.resnet50(pretrained=True).eval()

# Run the network on one preprocessed image and return its output as an array.
def extract_features(image_tensor):
    """Return the model output for ``image_tensor`` as a flat NumPy array.

    Parameters
    ----------
    image_tensor : torch.Tensor or None
        Tensor produced by ``download_and_preprocess_image``. ``None`` marks
        an image that could not be downloaded or decoded.

    Returns
    -------
    numpy.ndarray or None
        Flattened output of ``model(image_tensor)``, or ``None`` when there
        is no tensor to process, so the row can be filtered out later with
        ``dropna`` instead of crashing inside the network.

    Notes
    -----
    NOTE(review): the complete network is called here, so the "features" are
    whatever its last layer emits - confirm that is the intended embedding.
    """
    if image_tensor is None:
        return None
    # Inference only: skip gradient tracking to save memory and time.
    with torch.no_grad():
        features = model(image_tensor)
    return features.flatten().cpu().numpy()

# Turn every training tensor into a feature vector, stored in a new
# 'features' column.
train_tensors = train_df['image_tensor']
train_df['features'] = train_tensors.apply(extract_features)





In [38]:
import re

# entity_value is a string such as '34 gram'; this splits it into the number
# (as a float) and the unit.
def split_entity_value(entity_value):
    """Split an ``entity_value`` string into ``(numeric_value, unit)``.

    Parameters
    ----------
    entity_value : str or any
        Text of the form ``"<number> <unit>"``, e.g. ``"34 gram"`` or
        ``"2.5 cubic foot"``. Anything that is not a string (``None``, NaN
        from a missing CSV cell, ...) is treated as unparseable.

    Returns
    -------
    tuple
        ``(float, str)`` on success, ``(None, None)`` when the value does not
        start with a number followed by a unit.
    """
    # Missing cells arrive as float NaN or None; re.match would raise on them.
    if not isinstance(entity_value, str):
        return None, None

    # Number: '34', '34.5', '34.' or '.5'.
    # Unit: one or more alphabetic words, so multi-word units such as
    # 'cubic foot' are kept whole instead of being cut after the first word.
    match = re.match(
        r"\s*((?:[0-9]+\.?[0-9]*)|(?:\.[0-9]+))\s*([a-zA-Z]+(?:\s+[a-zA-Z]+)*)",
        entity_value,
    )
    if match is None:
        return None, None

    numeric_value = float(match.group(1))
    unit = match.group(2)
    return numeric_value, unit

# Step 2: parse every target string, then unzip the (number, unit) pairs into
# two new columns.
parsed_targets = train_df['entity_value'].apply(split_entity_value)
train_df['numeric_value'], train_df['unit'] = zip(*parsed_targets)

# A row is only usable for training when it has both a parsed numeric target
# and image features; everything else is dropped.
required_columns = ['numeric_value', 'features']
train_df = train_df.dropna(subset=required_columns)

# Regression target: the number extracted from entity_value.
y_train = train_df['numeric_value']

# Design matrix: the per-image feature vectors stacked into one 2-D array.
feature_vectors = train_df['features'].values
X_train = np.stack(feature_vectors)

        
    

In [39]:
# Goal: fit a regression model that predicts the numeric entity value from
# the image features. 80% of the labelled data is used for fitting and the
# remaining 20% is held out for validation.
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Fixed random_state keeps the forest reproducible between runs.
Randomn = RandomForestRegressor(n_estimators=100, random_state=42)
Randomn.fit(X_train_split, y_train_split)

# Predict the held-out 20% and show the predictions (this is the print that
# produces the "Validation predictions: ..." output of this cell).
val_predictions = Randomn.predict(X_val)
print("Validation predictions:", val_predictions)

# Score the hold-out set; without this the validation split is never
# actually compared against its true targets.
val_rmse = mean_squared_error(y_val, val_predictions) ** 0.5
print(f"Validation RMSE: {val_rmse:.4f}")


Validation predictions: [242.16416    495.94244667 338.62997    118.88649    153.71622333
 116.87017     52.15111667 101.72648    173.8763     149.65457
 129.21929333 184.18154667 333.18992333 125.70472333 131.00667333
 964.21328    170.5508     143.63981333 240.14147333 318.24435333]


In [40]:

# Predict on the test set using the same download -> preprocess -> embed
# pipeline that was applied to the training data.
test_df['image_tensor'] = test_df['image_link'].apply(download_and_preprocess_image)

# A failed download leaves None in 'image_tensor'; skip the network for those
# rows so a single broken link cannot abort the whole cell.
test_df['features'] = test_df['image_tensor'].apply(
    lambda tensor: None if tensor is None else extract_features(tensor)
)

# Every test row is kept (the output cell writes one line per test_df row);
# only rows that actually have features are sent to the model. The original
# code dropped NaNs from train_df here, which left test rows without features
# in place and made np.stack fail.
has_features = test_df['features'].notna()

# New column "preds": the model prediction, NaN where no image was available.
test_df["preds"] = np.nan
if has_features.any():
    X_test = np.stack(test_df.loc[has_features, 'features'].values)
    test_predictions = Randomn.predict(X_test)
    test_df.loc[has_features, "preds"] = test_predictions

In [41]:
# Unit to attach to a prediction, chosen from the row's entity_name.
entity_to_unit = {
    'height': 'centimetre',
    'width': 'centimetre',
    'depth': 'centimetre',
    'item_weight': 'gram',
    'maximum_weight_recommendation': 'kilogram',
    'wattage': 'watt',
    'voltage': 'volt',
    'item_volume': 'millilitre',
}


# Turn one row's numeric prediction into the "<number> <unit>" output string.
def format_prediction(row):
    """Format ``row["preds"]`` with the unit that belongs to its entity.

    Parameters
    ----------
    row : mapping
        Must provide ``"preds"`` (the numeric prediction, possibly NaN or
        None) and ``"entity_name"`` (key into ``entity_to_unit``).

    Returns
    -------
    str
        ``"<value with 2 decimals> <unit>"``; just the number when the entity
        has no known unit; ``""`` when there is no prediction.
    """
    numeric_value = row["preds"]

    # Test for "missing" explicitly. A plain truthiness check would discard a
    # legitimate 0.0 and would let NaN through as the text "nan <unit>".
    if numeric_value is None or pd.isna(numeric_value):
        return ""

    unit = entity_to_unit.get(row['entity_name'], '')
    # rstrip() removes the dangling space left behind by an unknown entity.
    return f"{numeric_value:.2f} {unit}".rstrip()
# Build the output string for every test row (axis=1 passes whole rows) and
# store it in a new 'formatted_prediction' column.
formatted_rows = test_df.apply(format_prediction, axis=1)
test_df['formatted_prediction'] = formatted_rows

In [44]:
# Name of the CSV file this notebook produces.
output_filename = 'Output.csv'

# The output has the test 'index' column as its row index and a single
# 'prediction' column holding the formatted strings.
output_df = (
    test_df[['index', 'formatted_prediction']]
    .set_index('index')
    .rename(columns={'formatted_prediction': 'prediction'})
)
output_df.to_csv(output_filename)

