In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from google.colab import drive
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the data
# paths configured in the next cell all live under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Single root for the dataset on the mounted Drive, so relocating the data
# means editing one line instead of four.
DATA_ROOT = "/content/drive/MyDrive/open (4)"

# CSV files listing the samples, and the directories holding the image files.
TRAIN_CSV_PATH = f"{DATA_ROOT}/train.csv"
TEST_CSV_PATH = f"{DATA_ROOT}/test.csv"
TRAIN_DIR = f"{DATA_ROOT}/train"
TEST_DIR = f"{DATA_ROOT}/test"

# Relative path: the submission file is written to the current working directory.
SUBMISSION_PATH = "submission.csv"

In [None]:
# Load the sample tables for the train and test splits.
train_df = pd.read_csv(TRAIN_CSV_PATH)
test_df = pd.read_csv(TEST_CSV_PATH)

# Rewrite each "path" entry to point into the Drive image directories:
# the "./train/" / "./test/" fragment is removed and the remainder is joined
# onto the corresponding directory.
train_df["path"] = train_df["path"].apply(lambda x: os.path.join(TRAIN_DIR, x.replace("./train/", "")))
test_df["path"] = test_df["path"].apply(lambda x: os.path.join(TEST_DIR, x.replace("./test/", "")))

# Quick sanity check of the first rows and the overall shapes.
print(train_df.head())
print(test_df.head())

print(f"Train DataFrame Shape: {train_df.shape}")
print(f"Test DataFrame Shape: {test_df.shape}")

           ID                                               path  AL645608.7  \
0  TRAIN_0000  /content/drive/MyDrive/open (4)/train/TRAIN_00...    0.000506   
1  TRAIN_0001  /content/drive/MyDrive/open (4)/train/TRAIN_00...    0.000506   
2  TRAIN_0002  /content/drive/MyDrive/open (4)/train/TRAIN_00...   -0.000415   
3  TRAIN_0003  /content/drive/MyDrive/open (4)/train/TRAIN_00...   -0.000855   
4  TRAIN_0004  /content/drive/MyDrive/open (4)/train/TRAIN_00...    0.000506   

       HES4  TNFRSF18   TNFRSF4      SDF4     ACAP3    INTS11     MXRA8  ...  \
0  0.010635 -0.000213 -0.000846  1.512467  0.021131  0.024409  0.004109  ...   
1  0.010635 -0.000213 -0.000846  1.508787  0.021131  0.024409  0.004109  ...   
2  0.005658 -0.000413  0.003148  0.109204  0.013978  0.049823  0.005327  ...   
3  0.004366  0.000684  0.000865  0.503090  0.295115  0.303922 -0.004290  ...   
4  0.010635 -0.000213 -0.000846  0.905195  0.021131  1.597454  0.004109  ...   

    MT-ATP8   MT-ATP6    MT-CO3    MT-

In [None]:
# Preprocessing applied to every image before it reaches the model:
# resize to 224x224, convert to a tensor, then normalise each channel.
# NOTE(review): the mean/std values look like the usual ImageNet channel
# statistics, which would match the pretrained backbone -- confirm.
transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

In [None]:
def safe_load_image(image_path):
    """Load an image as RGB, falling back to a blank image on any failure.

    Args:
        image_path: Path of the image file to open.

    Returns:
        The image converted to RGB mode. If opening or converting raises, the
        error is printed and a new 224x224 RGB image is returned instead, so
        the caller never sees the exception.
    """
    try:
        # The context manager closes the underlying file as soon as the
        # converted copy has been produced, instead of leaving that to PIL
        # internals or garbage collection.
        with Image.open(image_path) as source_image:
            return source_image.convert("RGB")
    except Exception as e:
        # Deliberate best-effort: report the problem and keep the pipeline running.
        print(f"Error loading image {image_path}: {e}")
        return Image.new("RGB", (224, 224))

In [None]:
class GeneExpressionDataset(Dataset):
    """Dataset pairing each image listed in a DataFrame with its row of targets.

    The DataFrame is read positionally: column 1 holds the image path and,
    when ``is_train`` is true, every column from index 2 onward is used as a
    float32 label vector.
    """

    def __init__(self, df, image_dir, transform=None, is_train=True):
        """Store the configuration and snapshot paths (and labels) from ``df``.

        Args:
            df: DataFrame whose column 1 is the image path and whose columns
                2+ are numeric targets (targets are only read if ``is_train``).
            image_dir: Directory joined in front of each path entry.
            transform: Optional callable applied to the loaded image.
            is_train: When true, ``__getitem__`` returns ``(image, label)``;
                otherwise it returns only the image.
        """
        self.df = df
        self.image_dir = image_dir
        self.transform = transform
        self.is_train = is_train
        # Snapshot the paths once, like the labels below, so both stay in sync
        # and each item lookup avoids a scalar DataFrame.iloc access.
        self.image_paths = df.iloc[:, 1].tolist()
        self.labels = df.iloc[:, 2:].values.astype(np.float32) if is_train else None

    def __len__(self):
        """Return the number of samples."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load sample ``idx``; returns ``(image, label)`` when training, else ``image``."""
        # os.path.join keeps an absolute entry unchanged, so this works both
        # for relative entries and for entries that are already full paths.
        img_path = os.path.join(self.image_dir, self.image_paths[idx])
        image = safe_load_image(img_path)
        if self.transform is not None:
            image = self.transform(image)
        if self.is_train:
            label = torch.tensor(self.labels[idx])
            return image, label
        return image

In [None]:
# Wrap both frames in datasets that share the same preprocessing; only the
# training dataset yields labels.
train_dataset = GeneExpressionDataset(
    df=train_df, image_dir=TRAIN_DIR, transform=transform, is_train=True
)
test_dataset = GeneExpressionDataset(
    df=test_df, image_dir=TEST_DIR, transform=transform, is_train=False
)

# Batches of 16 loaded by 2 worker processes. Training batches are shuffled;
# test batches keep the DataFrame order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, num_workers=2, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, num_workers=2, shuffle=False)

In [None]:
class GeneExpressionModel(nn.Module):
    """EfficientNet-B0 backbone whose final classifier layer is replaced by a regression head."""

    def __init__(self, num_outputs=3467):
        """Build the pretrained backbone and swap in a new output layer.

        Args:
            num_outputs: Width of the replacement output layer. Defaults to
                3467, the value this notebook was written with.
        """
        super().__init__()
        # `weights=` replaces the deprecated `pretrained=True` flag;
        # IMAGENET1K_V1 is the checkpoint that flag used to select.
        self.model = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.IMAGENET1K_V1)
        # Read the input width from the layer being replaced rather than
        # hard-coding it.
        in_features = self.model.classifier[1].in_features
        self.model.classifier[1] = nn.Linear(in_features, num_outputs)

    def forward(self, x):
        """Run ``x`` through the backbone and return its output."""
        return self.model(x)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build the network and move its parameters onto the chosen device.
model = GeneExpressionModel()
model.to(device)

# Mean-squared-error objective optimised with Adam at a learning rate of 1e-4.
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-4)



In [None]:
def train_model(model, train_loader, criterion, optimizer, epochs=10, checkpoint_interval=5, checkpoint_path="checkpoint.pth"):
    """Train ``model`` over ``train_loader`` and save periodic checkpoints.

    Args:
        model: Network to optimise; batches are moved to the device its
            parameters live on.
        train_loader: Iterable yielding ``(images, labels)`` batches.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepping ``model``'s parameters.
        epochs: Number of passes over ``train_loader``.
        checkpoint_interval: A checkpoint is written every this many epochs.
        checkpoint_path: Prefix for checkpoint files. A trailing ``.pth`` is
            dropped before ``_epoch{N}.pth`` is appended.

    Raises:
        ValueError: If an epoch yields no batches, so no average loss exists.
    """
    # Follow the model's own device instead of relying on a notebook global.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    # Without this, the default "checkpoint.pth" gave "checkpoint.pth_epoch5.pth".
    if checkpoint_path.endswith(".pth"):
        checkpoint_prefix = checkpoint_path[: -len(".pth")]
    else:
        checkpoint_prefix = checkpoint_path

    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        batches_seen = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=False)

        for images, labels in progress_bar:
            images, labels = images.to(model_device), labels.to(model_device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            batches_seen += 1
            # Average over the batches processed so far. Dividing by the full
            # epoch length under-reported the loss until the last batch.
            progress_bar.set_postfix(loss=total_loss / batches_seen)

        if batches_seen == 0:
            raise ValueError("train_loader produced no batches; cannot compute an epoch loss")

        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss / batches_seen:.4f}")

        if (epoch + 1) % checkpoint_interval == 0:
            torch.save(model.state_dict(), f"{checkpoint_prefix}_epoch{epoch+1}.pth")
            print(f"Checkpoint saved at epoch {epoch+1}")

In [None]:
# Run 30 training epochs, writing a checkpoint every 5 epochs under the
# "checkpoint" prefix.
train_model(
    model=model,
    train_loader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    epochs=30,
    checkpoint_interval=5,
    checkpoint_path="checkpoint",
)



Epoch 1/30, Loss: 0.0668




Epoch 2/30, Loss: 0.0493




Epoch 3/30, Loss: 0.0482




Epoch 4/30, Loss: 0.0476




Epoch 5/30, Loss: 0.0471
Checkpoint saved at epoch 5




Epoch 6/30, Loss: 0.0467




Epoch 7/30, Loss: 0.0464




Epoch 8/30, Loss: 0.0462




Epoch 9/30, Loss: 0.0459




Epoch 10/30, Loss: 0.0457
Checkpoint saved at epoch 10




Epoch 11/30, Loss: 0.0455




Epoch 12/30, Loss: 0.0453




Epoch 13/30, Loss: 0.0452




Epoch 14/30, Loss: 0.0451




Epoch 15/30, Loss: 0.0450
Checkpoint saved at epoch 15




Epoch 16/30, Loss: 0.0449




Epoch 17/30, Loss: 0.0448




Epoch 18/30, Loss: 0.0447




Epoch 19/30, Loss: 0.0446




Epoch 20/30, Loss: 0.0445
Checkpoint saved at epoch 20




Epoch 21/30, Loss: 0.0444




Epoch 22/30, Loss: 0.0443




Epoch 23/30, Loss: 0.0442




Epoch 24/30, Loss: 0.0441




Epoch 25/30, Loss: 0.0440
Checkpoint saved at epoch 25




Epoch 26/30, Loss: 0.0438




Epoch 27/30, Loss: 0.0436




Epoch 28/30, Loss: 0.0435




Epoch 29/30, Loss: 0.0434


                                                                           

Epoch 30/30, Loss: 0.0433
Checkpoint saved at epoch 30




In [None]:
# Switch the model to evaluation mode and run it over the test loader with
# gradient tracking disabled, collecting one NumPy array of outputs per batch
# in the module-level `predictions` list.
# NOTE(review): `predict_and_submit` below repeats this same inference pass
# into its own local list, and nothing visible in this notebook reads this
# module-level `predictions` afterwards. The loop looks redundant (it doubles
# inference time) -- confirm nothing else depends on it before removing.
model.eval()
predictions = []
with torch.no_grad():
    for images in tqdm(test_loader, desc="Generating Predictions"):
        images = images.to(device)
        outputs = model(images)
        predictions.append(outputs.cpu().numpy())

def predict_and_submit(model, test_loader, submission_path):
    """Run ``model`` over ``test_loader`` and write the predictions as a CSV.

    The output has an ``ID`` column taken from the notebook-level ``test_df``,
    followed by one column per target, named after ``train_df.columns[2:]``.

    Args:
        model: Trained network; batches are moved to the device its
            parameters live on.
        test_loader: Iterable yielding image batches in ``test_df`` row order.
        submission_path: Destination path of the CSV file.
    """
    # Follow the model's own device instead of relying on a notebook global.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    batch_outputs = []
    with torch.no_grad():
        for images in tqdm(test_loader, desc="Predicting for Submission"):
            images = images.to(model_device)
            batch_outputs.append(model(images).cpu().numpy())
    predictions = np.vstack(batch_outputs)

    submission = pd.DataFrame(predictions, columns=train_df.columns[2:])
    # Insert IDs by position. Passing the Series itself would align on index,
    # silently producing NaN or misplaced IDs when test_df's index is not
    # 0..n-1. With an array, a row-count mismatch raises instead.
    submission.insert(0, "ID", test_df["ID"].to_numpy())
    submission.to_csv(submission_path, index=False)
    print(f"Submission file saved: {submission_path}")

# Write the submission to the path configured at the top of the notebook
# rather than repeating the file name as a literal here.
predict_and_submit(model, test_loader, SUBMISSION_PATH)

Generating Predictions: 100%|██████████| 143/143 [00:38<00:00,  3.76it/s]
Predicting for Submission: 100%|██████████| 143/143 [00:37<00:00,  3.79it/s]


Submission file saved: submission.csv
