### Step 1: Data Preparation

First, let's import the necessary libraries and load the data.

In [None]:
import os
import tqdm
import pandas as pd
import numpy as np
from PIL import Image
from torchvision import models, transforms
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Load the training and test tables.
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

# Directories containing the training and test images.
train_image_dir = 'data/train_images/'
test_image_dir = 'data/test_images/'

# Columns of train_df that hold the traits to predict.
target_columns = ['X4_mean', 'X11_mean', 'X18_mean',
                  'X26_mean', 'X50_mean', 'X3112_mean']

# Ancillary (tabular) feature columns: everything in train_df that is neither
# the id nor a target. Only columns that also exist in test_df are kept,
# because this same list is later used to index test_df -- a train-only
# column would raise a KeyError there and could not be supplied at
# prediction time anyway. When both files share the same feature columns the
# result is unchanged.
excluded_columns = {'id', *target_columns}
test_columns = set(test_df.columns)
ancillary_columns = [
    col for col in train_df.columns
    if col not in excluded_columns and col in test_columns
]

# Image ids, in the row order of the respective table.
train_ids = train_df['id']
test_ids = test_df['id']

### Step 2: Feature Extraction

We'll use a pre-trained ResNet50 model (with its final fully connected layer removed) to extract a feature vector from each image using PyTorch.

In [None]:
# Size every image is resized to, and the number of images per batch.
IMG_SIZE = (128, 128)
BATCH_SIZE = 32

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Per-channel normalisation statistics (the values commonly used with
# torchvision's ImageNet-pretrained models).
NORMALIZE_MEAN = [0.485, 0.456, 0.406]
NORMALIZE_STD = [0.229, 0.224, 0.225]

# Preprocessing pipeline: resize -> tensor -> channel-wise normalisation.
_resize = transforms.Resize(IMG_SIZE)
_to_tensor = transforms.ToTensor()
_normalize = transforms.Normalize(mean=NORMALIZE_MEAN, std=NORMALIZE_STD)
preprocess = transforms.Compose([_resize, _to_tensor, _normalize])


# Custom dataset class
class PlantDataset(Dataset):
    """Dataset yielding one image per id, loaded from ``image_dir``.

    Item ``idx`` is the image stored at ``<image_dir>/<id>.jpeg`` for the
    id at position ``idx``, converted to RGB and passed through
    ``transform`` when one is given. No label is returned.

    Args:
        image_dir: Directory containing the ``.jpeg`` files.
        ids: Iterable of image ids (for example a list or a pandas Series).
        transform: Optional callable applied to the loaded PIL image.
    """

    def __init__(self, image_dir, ids, transform=None):
        self.image_dir = image_dir
        # Materialise the ids as a list so that __getitem__ indexes by
        # position. Integer indexing on a pandas Series is label-based and
        # fails (or picks the wrong row) when its index is not 0..n-1,
        # e.g. after filtering or splitting the source DataFrame.
        self.ids = list(ids)
        self.transform = transform

    def __len__(self):
        return len(self.ids)

    def _image_path(self, idx):
        """Return the file path of the image at position ``idx``."""
        return os.path.join(self.image_dir, f"{self.ids[idx]}.jpeg")

    def __getitem__(self, idx):
        # Open inside a context manager so the file handle is released
        # right away; convert() loads the pixel data and returns a new
        # image that stays valid after the file is closed.
        with Image.open(self._image_path(idx)) as raw_image:
            image = raw_image.convert("RGB")
        if self.transform:
            image = self.transform(image)
        return image


# Both splits use the same loader settings. shuffle=False keeps the batches
# in id order, so the extracted features line up with the rows of the tables.
loader_options = dict(batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

# One dataset per split, both using the shared preprocessing pipeline.
train_dataset = PlantDataset(train_image_dir, train_ids, transform=preprocess)
test_dataset = PlantDataset(test_image_dir, test_ids, transform=preprocess)

train_loader = DataLoader(train_dataset, **loader_options)
test_loader = DataLoader(test_dataset, **loader_options)

# Load a pre-trained ResNet50 to use as an image feature extractor.
# The constructor and the weights enum are imported directly instead of being
# reached through the module-level name `models`: that name is rebound to a
# dict of regressors in the training step further down, so re-running this
# cell afterwards would otherwise fail with an AttributeError.
from torchvision.models import ResNet50_Weights, resnet50

resnet_model = resnet50(weights=ResNet50_Weights.DEFAULT)
# Replace the final fully connected layer with a pass-through so the network
# outputs the features that would have fed that layer.
resnet_model.fc = nn.Identity()
resnet_model = resnet_model.to(device)


# Extract features
def extract_features(dataloader, model, device):
    """Run ``model`` over every batch of ``dataloader`` and stack the outputs.

    The model is switched to evaluation mode and gradients are disabled.
    Each batch is moved to ``device`` before the forward pass and its output
    is moved back to the CPU.

    Args:
        dataloader: Iterable yielding batches of image tensors.
        model: Module called on each batch.
        device: Device the batches are moved to before the forward pass.

    Returns:
        A NumPy array with the outputs of all batches concatenated in
        iteration order.
    """
    model.eval()
    batches = tqdm.tqdm(dataloader, desc="Extracting Features")
    with torch.no_grad():
        # Outputs are collected on the CPU so they do not pile up on `device`.
        per_batch = [model(batch.to(device)).cpu() for batch in batches]
    return torch.cat(per_batch).numpy()


# Extract image features for the training split first, then the test split.
train_features, test_features = (
    extract_features(loader, resnet_model, device)
    for loader in (train_loader, test_loader)
)

### Step 3: Model Building

Combine the image features with the ancillary data, then split the result into training and validation sets.

In [None]:
# Tabular (ancillary) features as arrays, one row per sample.
train_ancillary_data = train_df[ancillary_columns].values
test_ancillary_data = test_df[ancillary_columns].values

# Append the ancillary columns to the right of the image features.
train_combined_features = np.concatenate(
    [train_features, train_ancillary_data], axis=1)
test_combined_features = np.concatenate(
    [test_features, test_ancillary_data], axis=1)

# Hold out 20% of the training rows for validation (fixed seed so the split
# is reproducible).
train_targets = train_df[target_columns].values
X_train, X_val, y_train, y_val = train_test_split(
    train_combined_features, train_targets, test_size=0.2, random_state=42)

### Step 4: Training

Train one Random Forest Regressor per target trait on the combined features and report its R² score on the validation set.

In [None]:
# Fit one Random Forest per target trait and report its validation R2.
# NOTE: `models` rebinds the name imported from torchvision at the top of the
# file; the name is kept because the prediction step looks the fitted
# regressors up under it.
models = {}
training_progress = tqdm.tqdm(
    enumerate(target_columns),
    total=len(target_columns),
    desc="Training Models",
)
for column_index, trait in training_progress:
    # Column `column_index` of the target arrays holds the values for `trait`.
    regressor = RandomForestRegressor(n_estimators=100, random_state=42)
    regressor.fit(X_train, y_train[:, column_index])
    models[trait] = regressor

    # Score the fitted regressor on the held-out validation rows.
    validation_r2 = r2_score(y_val[:, column_index], regressor.predict(X_val))
    print(f"R2 score for {trait}: {validation_r2}")

### Step 5: Prediction

Use the trained models to predict the traits for the test data and write the predictions to `submission.csv`.

In [None]:
# Predict every trait for the test rows with its own fitted regressor.
test_predictions = {
    trait: models[trait].predict(test_combined_features)
    for trait in tqdm.tqdm(target_columns, desc="Predicting Traits")
}

# Submission column name -> trait column whose predictions fill it.
submission_layout = [
    ('X4', 'X4_mean'),
    ('X11', 'X11_mean'),
    ('X18', 'X18_mean'),
    ('X26', 'X26_mean'),
    ('X50', 'X50_mean'),
    ('X3112', 'X3112_mean'),
]

# Assemble the submission table: the id column first, then one column per
# trait in the order listed above.
submission_data = {'id': test_ids}
for output_name, trait in submission_layout:
    submission_data[output_name] = test_predictions[trait]
submission = pd.DataFrame(submission_data)

submission.to_csv('submission.csv', index=False)