### Install basic libraries

In [None]:
!pip install pandas
!pip install numpy


### Read and Preprocess the Train Dataset

In [None]:
import os
import urllib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import time
from PIL import Image

# Setting up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s')

def common_mistake(unit, units=None):
    """Map a commonly misspelled unit name onto its allowed spelling.

    Args:
        unit: Unit name to normalise. Non-string values (e.g. NaN or None
            from a missing cell) are returned unchanged instead of raising.
        units: Optional collection of accepted unit names. When omitted, the
            module-level ``allowed_units`` is used, which must then be
            defined before this function is called.

    Returns:
        ``unit`` itself when it is already allowed; otherwise the corrected
        spelling when one of the known fixes yields an allowed unit; otherwise
        ``'pound'`` for ``'lbs'``; otherwise ``unit`` unchanged.
    """
    if not isinstance(unit, str):
        # Nothing to repair on a non-string; calling .replace would raise.
        return unit

    known_units = allowed_units if units is None else units
    if unit in known_units:
        return unit

    # American -> British spelling ('meter' -> 'metre') and plural 'feet'.
    for wrong, right in (('ter', 'tre'), ('feet', 'foot')):
        candidate = unit.replace(wrong, right)
        if candidate in known_units:
            return candidate

    if unit == 'lbs':
        return 'pound'  # Fix 'lbs' to 'pound'
    return unit

def create_placeholder_image(image_save_path):
    """Write a 100x100 black RGB image to ``image_save_path``.

    Best effort: any exception raised while creating or saving the image is
    logged at error level and not re-raised.
    """
    try:
        Image.new('RGB', (100, 100), color='black').save(image_save_path)
    except Exception as exc:
        logging.error(f"Error creating placeholder image: {exc}")

def download_image(image_link, save_folder, retries=3, delay=3):
    """Download one image into ``save_folder`` and verify that it opens.

    The file is stored under the last path component of ``image_link``. A file
    already present under that name is treated as a successful download.

    Args:
        image_link: URL of the image. Non-string values are rejected.
        save_folder: Existing directory that receives the file.
        retries: Maximum number of download attempts.
        delay: Seconds to wait between two attempts.

    Returns:
        True if the image is already on disk or was downloaded and verified.
        False if the link is unusable or every attempt failed; when all
        attempts fail a black placeholder image is written in its place.
    """
    # Imported here because a bare `import urllib` does not guarantee that the
    # `urllib.request` submodule has been loaded.
    import urllib.request

    if not isinstance(image_link, str):
        return False

    filename = os.path.basename(image_link)
    if not filename:
        # A link ending in '/' has no file name; joining it would yield the
        # folder itself, which exists and would be misreported as success.
        logging.error(f"Cannot derive a file name from {image_link!r}; skipping.")
        return False
    image_save_path = os.path.join(save_folder, filename)

    if os.path.exists(image_save_path):
        return True

    for attempt in range(retries):
        try:
            urllib.request.urlretrieve(image_link, image_save_path)
            # Check if the downloaded image is valid
            with Image.open(image_save_path) as img:
                img.verify()  # Check for corrupted images
            return True
        except Exception as e:
            logging.warning(f"Failed to download {image_link} (Attempt {attempt+1}/{retries}): {e}")
            # Remove a partial or corrupt file so it cannot later be mistaken
            # for a valid cached download.
            if os.path.isfile(image_save_path):
                try:
                    os.remove(image_save_path)
                except OSError:
                    pass  # best-effort cleanup; the next attempt overwrites it
            if attempt < retries - 1:  # no point in waiting after the last try
                time.sleep(delay)

    # If download fails, create a placeholder image
    logging.error(f"Failed to download {image_link} after {retries} attempts. Creating placeholder.")
    create_placeholder_image(image_save_path)
    return False

# Load the data
train_df = pd.read_csv('data/train.csv')

# Explore the data
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
train_df.info()
print(train_df.describe())
sns.pairplot(train_df)
plt.show()

# Correlation analysis (numeric columns only)
numeric_columns = train_df.select_dtypes(include=[np.number])
correlation_matrix = numeric_columns.corr()
sns.heatmap(correlation_matrix, annot=True)
plt.show()

# Feature engineering (example: creating a new feature based on entity_name)
def categorize_entity_name(name):
    """Return the part of ``name`` before its first underscore.

    A string without an underscore is returned whole (an empty string stays
    empty). Anything that is not a string yields ``'Unknown'``.
    """
    if not isinstance(name, str):
        return 'Unknown'
    # partition always returns the leading segment, even without a separator.
    prefix, _, _ = name.partition('_')
    return prefix

# Derive the category column by mapping every entity_name through the helper.
train_df['entity_name_category'] = train_df['entity_name'].apply(categorize_entity_name)

# Preprocess the data (with additional handling for missing values)
def preprocess_data(df):
    """Split ``entity_value`` into separate ``value`` and ``unit`` columns.

    The frame is modified in place and also returned. Missing ``entity_value``
    entries are replaced by an empty string first.

    Args:
        df: DataFrame with an ``entity_value`` column, presumably holding
            strings of the form ``"<value> <unit>"`` (verify against the data).

    Returns:
        The same DataFrame with two added columns:
        ``value`` - the first whitespace-separated token, or NaN when the
        entry is empty, whitespace-only or not a string;
        ``unit`` - all remaining tokens joined by single spaces (so multi-word
        units such as "fluid ounce" are kept whole), or NaN when absent.
    """
    def _split_value_unit(text):
        # One split per entry; whitespace-only text yields no tokens at all.
        tokens = text.split() if isinstance(text, str) else []
        value = tokens[0] if tokens else np.nan
        unit = ' '.join(tokens[1:]) if len(tokens) > 1 else np.nan
        return value, unit

    df['entity_value'] = df['entity_value'].fillna('')
    pairs = [_split_value_unit(text) for text in df['entity_value']]
    df['value'] = [value for value, _ in pairs]
    df['unit'] = [unit for _, unit in pairs]
    return df

train_df = preprocess_data(train_df)

# Sample a subset of images
num_images_to_download = 20000  # Specify the number of images to download
# DataFrame.sample raises when n exceeds the number of rows, so cap it.
sample_size = min(num_images_to_download, len(train_df))
sampled_df = train_df.sample(n=sample_size, random_state=1)  # Sample random subset

# Download images
image_links = sampled_df['image_link'].dropna().tolist()  # Extract image links
download_folder = 'data/images'  # Path where images will be saved

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(download_folder, exist_ok=True)

total_links = len(image_links)
success_count = 0
fail_count = 0
# Report roughly once per percent instead of once per image.
progress_every = max(1, total_links // 100)

for idx, image_link in enumerate(image_links):
    if download_image(image_link, download_folder):
        success_count += 1
    else:
        fail_count += 1

    # Report progress
    done = idx + 1
    if done % progress_every == 0 or done == total_links:
        progress_percentage = done / total_links * 100
        logging.info(f"Progress: {progress_percentage:.2f}% - Success: {success_count} - Failed: {fail_count}")

# Save the preprocessed data with index
train_df.to_csv('data/preprocessed_train.csv', index=True, index_label='index')

print("Preprocessed data saved with index.")
print(train_df.head())


## Check the Number of Images in the directory

In [None]:
import os

def count_images(image_dir):
    """Count the entries of ``image_dir`` whose name ends in an image extension.

    The comparison is case-insensitive and looks only at the name, so every
    directory entry with a matching suffix is counted.
    """
    image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff')
    return sum(
        1
        for entry in os.listdir(image_dir)
        if entry.lower().endswith(image_extensions)
    )

# Count the image files in the download directory and report the total.
image_dir = 'data/images'  # Change this to the path of your image directory
num_images = count_images(image_dir)
print(f"Number of images in the directory '{image_dir}': {num_images}")

## Correct the Image Paths

In [None]:
import os

def correct_image_path(path):
    """Reduce an image link or path to a normalised, lower-case file name.

    Leading directories are dropped, the name is lower-cased, and ``.jpg`` is
    appended unless the name already ends in ``.png``, ``.jpg`` or ``.jpeg``.

    NOTE(review): download_image stores files under the original-case
    basename, so lower-casing may prevent matches on a case-sensitive
    filesystem - confirm that this is intended.
    """
    lowered = os.path.basename(path).lower()
    if lowered.endswith(('.png', '.jpg', '.jpeg')):
        return lowered
    return lowered + '.jpg'

import pandas as pd

# Load the test data in this cell: no earlier cell defines test_df, so on a
# fresh kernel run from top to bottom the next line would raise NameError.
# NOTE(review): 'data/test.csv' is the path the prediction cell reads - confirm
# that the test set (not the train set) is the frame meant to be checked here.
test_df = pd.read_csv('data/test.csv')

# Apply the correction to the DataFrame
test_df['corrected_image_path'] = test_df['image_link'].apply(correct_image_path)

# Check the results
print("Original vs Corrected paths:")
print(test_df[['image_link', 'corrected_image_path']].head())

# Check if the corrected paths match the files in the directory
image_dir = 'data/images'
dir_files = set(os.listdir(image_dir))
corrected_paths = set(test_df['corrected_image_path'])

matching_files = dir_files.intersection(corrected_paths)
print(f"\nNumber of matching files after correction: {len(matching_files)}")

if len(matching_files) > 0:
    print("First few matching files:")
    print(list(matching_files)[:5])
else:
    print("Still no matching files. Let's investigate further.")
    print("\nFirst few files in directory:")
    print(list(dir_files)[:5])
    print("\nFirst few corrected paths:")
    print(list(corrected_paths)[:5])

## Define the Model

In [None]:
import torch
import torch.nn as nn
import torchvision.models as models

class EntityValueExtractor(nn.Module):
    """ResNet-50 feature extractor with a value head and a unit head.

    The backbone's final ``fc`` layer is replaced by ``nn.Identity`` so it
    emits raw features. These pass through a shared 512-unit layer (ReLU,
    dropout 0.5) and then two linear heads: one scalar output for the value
    and ``num_units`` outputs for the unit.

    Args:
        num_classes: Accepted but not used by this implementation.
        num_units: Number of outputs of the unit head.
    """

    def __init__(self, num_classes, num_units):
        super().__init__()
        backbone = models.resnet50(pretrained=True)
        feature_dim = backbone.fc.in_features
        backbone.fc = nn.Identity()  # drop the original classification layer
        self.base_model = backbone

        # Shared trunk on top of the backbone features.
        self.fc1 = nn.Linear(feature_dim, 512)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        # Task heads.
        self.fc_value = nn.Linear(512, 1)  # Regression for the value
        self.fc_unit = nn.Linear(512, num_units)  # Classification for the unit

    def forward(self, x):
        """Return ``(value, unit)`` head outputs for the input batch ``x``."""
        hidden = self.dropout(self.relu(self.fc1(self.base_model(x))))
        return self.fc_value(hidden), self.fc_unit(hidden)

## Train the Model

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from src.model import EntityValueExtractor
from data_loader import get_data_loaders
from constants import allowed_units
import os
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import logging
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm  # Import tqdm for progress bars

# Set up logging
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', level=logging.INFO)

# Load preprocessed data
train_df = pd.read_csv('data/preprocessed_train.csv', low_memory=False)

# Data validation: shape, column names and a first look at the rows
print("DataFrame shape:", train_df.shape)
print("Columns:", train_df.columns)
print("Sample data:", train_df.head(), sep="\n")

# Check for missing values (per-column null counts)
print("\nMissing values:", train_df.isnull().sum(), sep="\n")

# Updated EntityValueDataset class
class EntityValueDataset(Dataset):
    """Dataset yielding ``(image, value, unit_index)`` training samples.

    Images are read from ``image_dir`` under the last ``/``-separated
    component of each row's ``image_link``. A sample whose image cannot be
    loaded is returned as ``None`` so that a collate function can drop it.

    Args:
        df: DataFrame with ``image_link``, ``value`` and ``unit`` columns.
        image_dir: Directory containing the downloaded images.
        transform: Optional callable applied to the loaded RGB image.
    """

    def __init__(self, df, image_dir, transform=None):
        self.df = df
        self.image_dir = image_dir
        self.transform = transform
        # Built once rather than per item. Indices follow the iteration order
        # of allowed_units, and the first occurrence wins, as list.index does.
        # NOTE(review): if allowed_units is a set of strings, that order can
        # differ between Python processes (hash randomisation); decoding in a
        # separate session needs a fixed, persisted ordering - confirm.
        self.unit_to_index = {}
        for position, unit in enumerate(allowed_units):
            self.unit_to_index.setdefault(unit, position)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image = self._load_image(row['image_link'])
        if image is None:
            return None
        value = self._encode_value(row['value'], idx)
        unit = self._encode_unit(row['unit'], idx)
        return image, value, unit

    def _load_image(self, image_link):
        """Return the (transformed) RGB image, or None when it is unusable."""
        if not isinstance(image_link, str):
            # e.g. NaN from a missing cell; .split would raise on it.
            logging.warning(f"Warning: Missing image link: {image_link!r}")
            return None

        image_path = os.path.join(self.image_dir, image_link.split('/')[-1])
        if not os.path.isfile(image_path):
            logging.warning(f"Warning: Image not found: {image_path}")
            return None

        try:
            # The context manager closes the file handle once decoded.
            with Image.open(image_path) as handle:
                image = handle.convert('RGB')
        except OSError as e:
            # Unreadable or corrupt files are skipped like missing ones.
            logging.warning(f"Warning: Could not read image {image_path}: {e}")
            return None

        if self.transform:
            image = self.transform(image)
        return image

    def _encode_value(self, raw_value, idx):
        """Return the value as a float tensor, or 0.0 when it is invalid."""
        try:
            value = torch.tensor(float(raw_value), dtype=torch.float)
            # float() accepts NaN/inf; either would turn the loss into NaN.
            if not torch.isfinite(value):
                raise ValueError("non-finite value")
            return value
        except (TypeError, ValueError, OverflowError):
            logging.warning(f"Invalid value encountered at index {idx}, using default value.")
            return torch.tensor(0.0, dtype=torch.float)

    def _encode_unit(self, raw_unit, idx):
        """Return the unit's class index as a long tensor, or 0 when unknown."""
        index = self.unit_to_index.get(raw_unit)
        if index is None:
            logging.warning(f"Invalid unit encountered at index {idx}, using default unit.")
            index = 0
        return torch.tensor(index, dtype=torch.long)

# Create data loader
def get_data_loaders(df, image_dir, batch_size, num_workers):
    """Build the shuffled training DataLoader over ``EntityValueDataset``.

    Images are resized to 224x224 and converted to tensors. Samples the
    dataset returned as ``None`` are dropped at collate time; a batch with no
    remaining samples is delivered as ``None``.

    Note: this definition replaces the ``get_data_loaders`` imported from
    ``data_loader`` earlier in the same cell.
    """
    def collate_fn(batch):
        # Keep only samples whose image could be loaded.
        usable = [sample for sample in batch if sample is not None]
        if not usable:
            return None
        return torch.utils.data.dataloader.default_collate(usable)

    preprocessing = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor()
    ])

    return DataLoader(
        EntityValueDataset(df, image_dir, transform=preprocessing),
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        collate_fn=collate_fn,
    )

train_loader = get_data_loaders(train_df, 'data/images', batch_size=64, num_workers=4)  # Increased batch size and num_workers

# Initialize model, loss, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# CUDA mixed precision only applies on a GPU; disable it explicitly on CPU.
use_amp = device.type == "cuda"
model = EntityValueExtractor(num_classes=1, num_units=len(allowed_units)).to(device)
value_criterion = nn.MSELoss()
unit_criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scaler = GradScaler(enabled=use_amp)  # Mixed precision scaler

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    num_batches = 0

    # Using tqdm for progress bar
    for i, batch in enumerate(tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}', ncols=100)):
        # The collate function yields None when every image of a batch was missing.
        if batch is None or len(batch) == 0:
            continue

        images, values, units = batch
        images, values, units = images.to(device), values.to(device), units.to(device)

        optimizer.zero_grad()

        with autocast(enabled=use_amp):  # Mixed precision context
            value_pred, unit_pred = model(images)
            # squeeze(-1) removes only the trailing size-1 dimension. A bare
            # squeeze() would also collapse a batch of one to a scalar, which
            # no longer matches the (1,)-shaped target.
            value_loss = value_criterion(value_pred.squeeze(-1), values)
            unit_loss = unit_criterion(unit_pred, units)
            loss = value_loss + unit_loss

        scaler.scale(loss).backward()  # Scale loss and backward pass
        scaler.step(optimizer)  # Update parameters
        scaler.update()  # Update scaler

        train_loss += loss.item()
        num_batches += 1

        if i % 1000 == 0:  # Log every 1000 batches
            print(f"Epoch {epoch+1}/{num_epochs}, Batch {i}/{len(train_loader)}, Loss: {loss.item():.4f}")

    # Average loss for the epoch (over the batches that were actually trained on)
    avg_loss = train_loss / num_batches if num_batches > 0 else 0
    print(f"Epoch {epoch+1}/{num_epochs}, Average Train Loss: {avg_loss:.4f}")



## Save the trained model

In [None]:
# Save the model
# Only the state_dict (the parameter tensors) is written, so loading requires
# constructing an EntityValueExtractor first and calling load_state_dict on it.
torch.save(model.state_dict(), 'model.pth')

## Make Predictions

In [None]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import requests
from io import BytesIO
import pandas as pd
from src.model import EntityValueExtractor
from data_loader import val_transform
from constants import allowed_units

# Define the dataset class to handle both local paths and URLs
class ProductImageDataset(Dataset):
    """Inference dataset over the ``image_link`` column of a DataFrame.

    Each link is either an HTTP(S) URL or a path relative to ``img_dir``.
    Construction checks every link once and records the row positions of
    those that are reachable in ``valid_images``; only those rows are served.

    Args:
        df: DataFrame with an ``image_link`` column.
        img_dir: Directory that relative image paths are resolved against.
        transform: Optional callable applied to each loaded RGB image.

    Raises:
        KeyError: If ``df`` has no ``image_link`` column.
    """

    # Seconds to wait on any single HTTP request before giving up.
    REQUEST_TIMEOUT = 10

    def __init__(self, df, img_dir, transform=None):
        self.df = df
        self.img_dir = img_dir
        self.transform = transform
        # Row positions (not index labels), because __getitem__ reads via iloc.
        self.valid_images = []

        print(f"DataFrame shape: {self.df.shape}")

        # Ensure the 'image_link' column exists
        if 'image_link' not in self.df.columns:
            raise KeyError("'image_link' column is missing in the DataFrame")

        print("First few image links:")
        print(self.df['image_link'].head())
        print(f"Image directory: {self.img_dir}")

        for position, img_path in enumerate(self.df['image_link']):
            if self._is_available(img_path):
                self.valid_images.append(position)

            if position % 1000 == 0:
                print(f"Processed {position} images...")

        print(f"Total valid images found: {len(self.valid_images)}")

    def _is_available(self, img_path):
        """Return True when the URL answers with 200 or the local file exists."""
        if not isinstance(img_path, str):
            # e.g. NaN from a missing cell; .startswith would raise on it.
            print(f"Skipping non-string image link: {img_path!r}")
            return False

        # Check if the path is a URL or local path
        if img_path.startswith('http'):
            try:
                # stream=True defers the body: only the status is needed here.
                with requests.get(img_path, stream=True, timeout=self.REQUEST_TIMEOUT) as response:
                    if response.status_code == 200:
                        return True
                print(f"Failed to access URL: {img_path}")
            except requests.exceptions.RequestException as e:
                print(f"Error fetching image from URL {img_path}: {e}")
            return False

        local_img_path = os.path.join(self.img_dir, img_path)
        if os.path.exists(local_img_path):
            return True
        print(f"Image not found: {local_img_path}")
        return False

    def __len__(self):
        return len(self.valid_images)

    def __getitem__(self, idx):
        img_idx = self.valid_images[idx]
        img_path = self.df.iloc[img_idx]['image_link']

        # Check if the path is a URL or local path
        if img_path.startswith('http'):
            try:
                response = requests.get(img_path, timeout=self.REQUEST_TIMEOUT)
                # Fail on an HTTP error rather than decoding an error page.
                response.raise_for_status()
                image = Image.open(BytesIO(response.content)).convert('RGB')
            except Exception as e:
                print(f"Error loading image from URL {img_path}: {e}")
                raise
        else:
            local_img_path = os.path.join(self.img_dir, img_path)
            try:
                # The context manager closes the file handle once decoded.
                with Image.open(local_img_path) as handle:
                    image = handle.convert('RGB')
            except Exception as e:
                print(f"Error loading image at {local_img_path}: {e}")
                raise

        if self.transform:
            image = self.transform(image)

        return image

# Load the model
model = EntityValueExtractor(num_classes=1, num_units=len(allowed_units))
# map_location lets weights saved on a GPU load on a CPU-only machine.
model.load_state_dict(torch.load('model.pth', map_location='cpu'))
model.eval()
# The backbone's fc layer is nn.Identity and has no weight, so the device is
# read from the model's parameters instead.
device = next(model.parameters()).device

# Class index -> unit name. A set cannot be indexed, so materialise a list.
# NOTE(review): this uses the iteration order of allowed_units and must match
# the order used to encode units during training; for a set of strings that
# order is only stable within one Python process - confirm before relying on it.
unit_labels = list(allowed_units)

# Load test data
test_df = pd.read_csv('data/test.csv')

# Create the dataset with image links (both URLs and local paths)
test_dataset = ProductImageDataset(test_df, 'data/images', transform=val_transform)

# Check how many valid images were found
print(f"Length of test_dataset: {len(test_dataset)}")

# Create DataLoader
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=4)

# Check if the DataLoader is empty
if len(test_loader) == 0:
    print("test_loader is empty. Check ProductImageDataset")
    print(f"Length of test_df: {len(test_df)}")
    print(f"First few image links in test_df: {test_df['image_link'].head()}")
else:
    # Make predictions (only if test_loader is not empty)
    predictions = []
    with torch.no_grad():
        for i, images in enumerate(test_loader):
            print(f"Processing batch {i+1}/{len(test_loader)}")
            images = images.to(device)
            value_pred, unit_pred = model(images)

            # reshape(-1) keeps a 1-D array even for a final batch of one,
            # where a bare squeeze() would give a 0-d array zip cannot iterate.
            value_pred = value_pred.reshape(-1).cpu().numpy()
            unit_pred = unit_pred.argmax(dim=1).cpu().numpy()

            for v, u in zip(value_pred, unit_pred):
                predictions.append(f"{v:.2f} {unit_labels[u]}")

            if i % 10 == 0:
                print(f"Current number of predictions: {len(predictions)}")

    print(f"Total number of predictions: {len(predictions)}")
    print(f"Number of rows in test_df: {len(test_df)}")

    # Rows whose image was unavailable have no prediction. Write an empty
    # string for them so both submission columns have one entry per test row.
    # valid_images holds the row positions served by the dataset, in order.
    all_predictions = [''] * len(test_df)
    for position, prediction in zip(test_dataset.valid_images, predictions):
        all_predictions[position] = prediction

    # Create submission file
    submission = pd.DataFrame({
        'index': test_df['index'],
        'prediction': all_predictions
    })
    os.makedirs('output', exist_ok=True)  # to_csv does not create directories
    submission.to_csv('output/test_out.csv', index=False)


### Run Sanity check using src/sanity.py

In [5]:
!python sanity.py --test_filename ./data/sample_test.csv --output_filename ./data/sample_test_out.csv

Parsing successfull for file: ../dataset/sample_test_out.csv


In [6]:
!python sanity.py --test_filename ./data/sample_test.csv --output_filename ./data/sample_test_out.csv

Parsing successfull for file: ../dataset/sample_test_out_fail.csv


In [None]:
# WARNING(review): this cell originally ran `rm -rf ../`, which recursively
# deletes the parent of the working directory - the project itself, including
# the data and the trained model. It is disabled on purpose. If cleanup is
# needed, remove specific generated artifacts by explicit path instead.
# rm -rf ../