In [7]:
import pandas as pd
import os
import re
import torch
from torchvision import models, transforms
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import requests
import io
import numpy as np


In [None]:
# Training data. Per the author's notes the columns are: index, image_link,
# group_id, entity_name and entity_value (the target to predict).
TRAIN_CSV_PATH = 'dataset/train.csv'
train_df = pd.read_csv(TRAIN_CSV_PATH)

# Test data. Same columns as the training file except that entity_value
# (the target) is absent and has to be predicted.
TEST_CSV_PATH = 'dataset/test.csv'
test_df = pd.read_csv(TEST_CSV_PATH)


In [None]:
# Preprocessing pipeline applied to every downloaded image before it is fed
# to the network: resize to a fixed square, convert to a tensor, then
# normalise each colour channel.
IMAGE_SIZE = (224, 224)
# Per-channel statistics (these are the values commonly used for
# ImageNet-pretrained torchvision models).
CHANNEL_MEAN = [0.485, 0.456, 0.406]
CHANNEL_STD = [0.229, 0.224, 0.225]

preprocess = transforms.Compose([
    transforms.Resize(IMAGE_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=CHANNEL_MEAN, std=CHANNEL_STD),
])


# Download an image from its URL and convert it into a tensor, which is the
# input format expected by the feature-extraction model used further below.
def download_and_preprocess_image(image_url, timeout=10):
    """Fetch ``image_url`` and return it as a preprocessed image tensor.

    Parameters
    ----------
    image_url : str
        Address of the image to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout a single unresponsive host would stall the
        whole ``.apply`` run indefinitely.

    Returns
    -------
    torch.Tensor or None
        The output of ``preprocess`` with a leading batch dimension added,
        or ``None`` if the download or decoding failed (the error is printed).
    """
    try:
        response = requests.get(image_url, timeout=timeout)
        # Surface HTTP errors (404, 500, ...) as such instead of letting PIL
        # fail later on an HTML error page.
        response.raise_for_status()
        # Only the `io` module is imported in this notebook, so BytesIO must
        # be referenced through it; a bare `BytesIO` raises NameError, which
        # the handler below would swallow, turning every image into None.
        image = Image.open(io.BytesIO(response.content)).convert('RGB')
        image_tensor = preprocess(image).unsqueeze(0)
        return image_tensor
    except Exception as e:
        # Best effort: report the failure and let the caller skip this image.
        print(f"Error processing image {image_url}: {e}")
        return None

# Download and preprocess every training image. The resulting column
# 'image_tensor' holds one tensor per row, or None where the download failed.
# NOTE(review): every tensor is kept in memory inside the DataFrame; this may
# become memory-heavy on a large training set - confirm dataset size.
train_image_links = train_df['image_link']
train_df['image_tensor'] = train_image_links.apply(download_and_preprocess_image)


In [None]:
# Pretrained ResNet-50 from torchvision, used only to turn images into
# numeric vectors (no training happens here).
feature_extractor = models.resnet50(pretrained=True)
feature_extractor.eval()  # evaluation mode for inference

# Backward-compatible alias. A later cell rebinds `model` to a
# RandomForestRegressor, which is why extract_features below must NOT look the
# network up under the name `model`.
model = feature_extractor


def extract_features(image_tensor):
    """Run one preprocessed image through the ResNet and return a 1-D array.

    Parameters
    ----------
    image_tensor : torch.Tensor
        Batched image tensor as produced by ``download_and_preprocess_image``.

    Returns
    -------
    numpy.ndarray
        The flattened output of the network's forward pass.

    Raises
    ------
    ValueError
        If ``image_tensor`` is None (i.e. the image could not be loaded).
    """
    if image_tensor is None:
        raise ValueError(
            "image_tensor is None: the image could not be downloaded or decoded; "
            "filter such rows out before extracting features"
        )
    # Gradients are not needed for inference; skipping them saves memory.
    with torch.no_grad():
        # NOTE(review): this is the output of the full network (including its
        # final classification layer), not an intermediate embedding - confirm
        # this is the intended feature representation.
        features = feature_extractor(image_tensor)
    return features.flatten().cpu().numpy()

# Build the training matrix only from rows whose image was actually loaded.
# download_and_preprocess_image returns None on failure, and neither the
# network nor np.stack can handle a None entry, so those rows are excluded.
train_has_image = train_df['image_tensor'].notna()
n_missing_train = int((~train_has_image).sum())
if n_missing_train:
    print(f"Skipping {n_missing_train} training rows without a usable image")
if not train_has_image.any():
    raise ValueError("No training image could be loaded; cannot build X_train")

# Rows without an image end up as NaN in 'features' (index alignment).
train_df['features'] = train_df.loc[train_has_image, 'image_tensor'].apply(extract_features)

# X_train: one row of extracted features per usable image.
X_train = np.stack(train_df.loc[train_has_image, 'features'].values)

# y_train: targets restricted to the same rows so X and y stay aligned.
# NOTE(review): a regressor needs numeric targets - confirm that
# 'entity_value' is numeric in the CSV.
y_train = train_df.loc[train_has_image, 'entity_value']

In [None]:
# Hold out 20% of the training rows for validation; the remaining 80% are
# used to fit the model. The fixed random_state makes the split reproducible.
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Random forest: an ensemble of randomised decision trees whose outputs are
# combined into the final prediction.
# NOTE: this rebinds the name `model`; from here on `model` is the regressor,
# so nothing after this cell may expect any earlier object under that name.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_split, y_train_split)

# Predict the held-out 20% and score it, so the validation split actually
# tells us how well the model generalises.
val_predictions = model.predict(X_val)
print(f"Validation predictions: {val_predictions}")

val_mse = mean_squared_error(y_val, val_predictions)
print(f"Validation MSE: {val_mse:.4f}")

In [None]:

# Predict on the unseen test data: download the images and extract features
# the same way as for the training data.
test_df['image_tensor'] = test_df['image_link'].apply(download_and_preprocess_image)

# Failed downloads are stored as None. Feature extraction and np.stack cannot
# handle them, so only rows with an image go through the model.
test_has_image = test_df['image_tensor'].notna()
n_missing_test = int((~test_has_image).sum())
if n_missing_test:
    print(f"{n_missing_test} test rows have no usable image; their prediction is left as NaN")
test_df['features'] = test_df.loc[test_has_image, 'image_tensor'].apply(extract_features)

# Keep exactly one prediction slot per test row (NaN where no image was
# available) so the result can still be attached to test_df row by row.
test_predictions = np.full(len(test_df), np.nan)
if test_has_image.any():
    X_test = np.stack(test_df.loc[test_has_image, 'features'].values)
    # `model` is the RandomForestRegressor fitted in the previous cell.
    test_predictions[test_has_image.to_numpy()] = model.predict(X_test)
else:
    # Nothing to predict; keep X_test defined for any later inspection.
    X_test = np.empty((0, 0))


# Turn a numeric prediction into the text form written to the output file.
def format_prediction(pred):
    """Return ``pred`` as a string in fixed-point notation with two decimals."""
    return format(pred, ".2f")

# test_predictions is a NumPy array (the output of model.predict), which has
# no .apply method, so format it element by element. The list has one entry
# per test row and is assigned positionally.
test_df['prediction'] = [format_prediction(pred) for pred in test_predictions]

In [None]:
# Output in the format requested by the problem statement: exactly two
# columns, 'index' and 'prediction'.
output_df = pd.DataFrame({
    'index': test_df['index'],
    'prediction': test_df['prediction'],
})

# Main CSV that will be created.
output_filename = 'final_predictions.csv'
# index=False: otherwise pandas also writes the DataFrame's own row labels as
# an extra unnamed first column, next to the explicit 'index' column.
output_df.to_csv(output_filename, index=False)

