In [None]:
!pip install pandas numpy scikit-learn tqdm tensorboard

In [1]:
import torch
import numpy as np
import pandas as pd
import glob
import os
from concurrent.futures import ThreadPoolExecutor
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
from torch.utils.tensorboard import SummaryWriter
from tqdm.auto import tqdm
import torch.nn as nn
import torch.nn.functional as F

In [2]:
NUM_TAGS = 256

In [3]:
def get_track_idx(filename):
    return int(filename.split('/')[-1].split('.')[0])

def load_and_count_embeds(fn):
    embeds = np.load(fn)
    track_idx = get_track_idx(fn)
    return track_idx, embeds.shape[0], embeds

In [4]:
import torch.nn as nn
import torch.nn.functional as F

class TransformerModel(nn.Module):
    def __init__(self, input_dim, num_tags, num_layers=8, nhead=8, dim_feedforward=2048):
        super(TransformerModel, self).__init__()
        encoder_layer = nn.TransformerEncoderLayer(d_model=input_dim, nhead=nhead, dim_feedforward=dim_feedforward)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(input_dim, num_tags)
        self.lin = nn.Sequential(
            nn.Linear(input_dim, input_dim),
            nn.ReLU(),
            nn.Linear(input_dim, input_dim),
            nn.LayerNorm(input_dim)
        )

    def forward(self, src, src_mask=None):
        src = src.permute(1, 0, 2)
        if src_mask is not None:
            src_key_padding_mask = ~src_mask
        else:
            src_key_padding_mask = None
        transformer_output = self.transformer_encoder(src, src_key_padding_mask=src_key_padding_mask)
        output = self.fc(self.lin(transformer_output[-1]))
        return output

class FocalLoss(nn.Module):
    def __init__(self, alpha, gamma):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        BCE_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        pt = torch.exp(-BCE_loss)
        F_loss = self.alpha * ((1 - pt) ** self.gamma) * BCE_loss
        return F_loss.sum()

In [5]:
def calculate_alpha_gamma(df, num_tags):
    label_counts = np.zeros(num_tags)
    for tags in df['tags']:
        indices = list(map(int, tags.split(',')))
        label_counts[indices] += 1

    alpha = 1 - (label_counts / np.sum(label_counts))
    gamma = 2 * np.ones(num_tags)

    return torch.tensor(alpha, dtype=torch.float32), torch.tensor(gamma, dtype=torch.float32)

In [6]:
def load_data():
    df_train = pd.read_csv('train.csv')
    df_test = pd.read_csv('test.csv')
    return df_train, df_test

In [7]:
def prepare_data(df, track_idx2embeds):
    X = []
    Y = []
    for index, row in df.iterrows():
        track_idx = row['track']
        embed = np.array(track_idx2embeds[track_idx], dtype=np.float32)
        tags = list(map(int, row['tags'].split(','))) if pd.notnull(row['tags']) else []
        y = np.zeros(NUM_TAGS)
        y[tags] = 1
        X.append(torch.tensor(embed, dtype=torch.float32))
        Y.append(torch.tensor(y, dtype=torch.float32))
    return list(zip(X, Y))

In [8]:
def collate_batch(batch):
    x_batch, y_batch = zip(*batch)
    lengths = torch.tensor([x.size(0) for x in x_batch], dtype=torch.long)
    x_padded = pad_sequence(x_batch, batch_first=True)
    y_padded = pad_sequence(y_batch, batch_first=True, padding_value=0)

    mask = torch.zeros(x_padded.size(0), x_padded.size(1), dtype=torch.bool)
    for i, length in enumerate(lengths):
        mask[i, :length] = 1

    return x_padded, y_padded, mask

In [9]:
def save_checkpoint(state, filename="checkpoint.pth"):
    torch.save(state, filename)

def load_checkpoint(filename):
    if os.path.isfile(filename):
        return torch.load(filename)
    else:
        return None

In [26]:
def train_model(model, train_loader, optimizer, criterion, writer, start_epoch, device, checkpoint_path=None):
    step = 0
    for epoch in tqdm(range(start_epoch, 5), desc="Epoch"):  # Assuming 10 is the total number of epochs
        model.train()
        pbar = tqdm(enumerate(train_loader), desc=f"Training Epoch {epoch}", total=len(train_loader))
        for i, (x, y, mask) in pbar:
            x, y, mask = x.to(device), y.to(device), mask.to(device)
            optimizer.zero_grad()
            outputs = model(x, mask)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()

            writer.add_scalar('Loss/train', loss.item(), epoch * len(train_loader) + i)
            pbar.set_postfix({'loss' : loss.item()})
            step += 1

            # Save checkpoint every 100 steps
            if step % 500 == 0:
                save_checkpoint({
                    'epoch': epoch,
                    'step': step,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': loss,
                }, filename=checkpoint_path if checkpoint_path is not None else f"checkpoint{i}.pth")

        validate_model(model, val_loader, criterion, writer, epoch, device)


In [11]:
from sklearn.metrics import average_precision_score

def validate_model(model, val_loader, criterion, writer, epoch, device):
    model.eval()
    val_loss = 0.0
    all_average_precisions = []

    with torch.no_grad():
        for x, y, mask in tqdm(val_loader, desc="Validating"):
            x, y, mask = x.to(device), y.to(device), mask.to(device)
            outputs = model(x, mask)
            loss = criterion(outputs, y)
            val_loss += loss.item()

            # Calculate average precision score
            y_true = y.cpu().detach().numpy()
            y_pred = torch.sigmoid(outputs).cpu().detach().numpy()
            average_precision = average_precision_score(y_true, y_pred)
            all_average_precisions.append(average_precision)

    avg_val_loss = val_loss / len(val_loader)
    avg_precision = sum(all_average_precisions) / len(all_average_precisions)

    writer.add_scalar('Loss/val', avg_val_loss, epoch)
    writer.add_scalar('AveragePrecision/val', avg_precision, epoch)
    print(f"{avg_val_loss=}, {avg_precision=}")
    return avg_val_loss, avg_precision

In [12]:
def generate_predictions(model, df_test, track_idx2embeds, checkpoint_path=None):
    if checkpoint_path:
        checkpoint = load_checkpoint(checkpoint_path)
        if checkpoint:
            model.load_state_dict(checkpoint['model_state_dict'])

    predictions_list = []
    model.eval()
    with torch.no_grad():
        for index, row in tqdm(df_test.iterrows(), total=len(df_test), desc="Generating predictions", leave=True):
            track_idx = row['track']
            embed = np.array(track_idx2embeds[track_idx], dtype=np.float32)
            x = torch.tensor(embed, dtype=torch.float32).unsqueeze(0).to(device)
            outputs = model(x)
            predictions = torch.sigmoid(outputs).squeeze().tolist()
            predictions_list.append([track_idx, ",".join(map(str, predictions))])
    return predictions_list

In [13]:
# Main Execution
df_train, df_test = load_data()

In [14]:
from skmultilearn.model_selection import iterative_train_test_split

# Convert tags to a binary matrix
Y = np.zeros((df_train.shape[0], NUM_TAGS), dtype=int)
for idx, tags in enumerate(df_train['tags']):
    indices = list(map(int, tags.split(',')))
    Y[idx, indices] = 1

# Splitting the data
X_train, y_train, X_val, y_val = iterative_train_test_split(df_train.values, Y, test_size=0.1)

# Convert back to DataFrame
df_train = pd.DataFrame(X_train, columns=df_train.columns)
df_train['tags'] = [','.join(map(str, np.where(row == 1)[0])) for row in y_train]

df_val = pd.DataFrame(X_val, columns=df_train.columns)
df_val['tags'] = [','.join(map(str, np.where(row == 1)[0])) for row in y_val]

In [15]:
alpha, gamma = calculate_alpha_gamma(df_train, NUM_TAGS)

In [16]:
file_list = glob.glob('track_embeddings/*.npy')
track_idx2embeds = {}
with ThreadPoolExecutor() as executor:
    for track_idx, num_embeds, embeds in tqdm(executor.map(load_and_count_embeds, file_list), total=len(file_list)):
        track_idx2embeds[track_idx] = embeds

  0%|          | 0/76714 [00:00<?, ?it/s]

In [17]:
train_data = prepare_data(df_train, track_idx2embeds)
val_data = prepare_data(df_val, track_idx2embeds)

In [18]:
train_loader = DataLoader(train_data, batch_size=128, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_data, batch_size=128, collate_fn=collate_batch)

In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerModel(input_dim=768, num_tags=NUM_TAGS).to(device)



In [20]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.00001)
criterion = FocalLoss(alpha=alpha.to(device), gamma=gamma.to(device))
writer = SummaryWriter()

In [21]:
checkpoint_path = None #"checkpoint.pth"
checkpoint = load_checkpoint(checkpoint_path) if checkpoint_path is not None else None

if checkpoint is not None:
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    start_epoch = checkpoint['epoch'] + 1
    step = checkpoint['step']
else:
    start_epoch = 0
    step = 0

In [22]:
import warnings 
warnings.filterwarnings("ignore") # shutup sklearn

In [23]:
import gc
torch.cuda.empty_cache()
gc.collect()

6

In [27]:
train_model(model, train_loader, optimizer, criterion, writer, start_epoch, device)
validate_model(model, val_loader, criterion, writer, start_epoch, device)

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Training Epoch 0:   0%|          | 0/360 [00:00<?, ?it/s]

Validating:   0%|          | 0/41 [00:00<?, ?it/s]

avg_val_loss=430.11662999595086, avg_precision=0.2477226595272815


Training Epoch 1:   0%|          | 0/360 [00:00<?, ?it/s]

Validating:   0%|          | 0/41 [00:00<?, ?it/s]

avg_val_loss=425.66731150557354, avg_precision=0.24978967571473462


Training Epoch 2:   0%|          | 0/360 [00:00<?, ?it/s]

Validating:   0%|          | 0/41 [00:00<?, ?it/s]

avg_val_loss=422.8435694996904, avg_precision=0.25323879912449315


Training Epoch 3:   0%|          | 0/360 [00:00<?, ?it/s]

Validating:   0%|          | 0/41 [00:00<?, ?it/s]

avg_val_loss=422.12120316668256, avg_precision=0.25516544807983904


Training Epoch 4:   0%|          | 0/360 [00:00<?, ?it/s]

Validating:   0%|          | 0/41 [00:00<?, ?it/s]

avg_val_loss=417.11289978027344, avg_precision=0.2590898119911534


Validating:   0%|          | 0/41 [00:00<?, ?it/s]

avg_val_loss=417.11289978027344, avg_precision=0.2590898119911534


(417.11289978027344, 0.2590898119911534)

In [None]:
writer.close()

In [28]:
predictions_list = generate_predictions(model, df_test, track_idx2embeds, checkpoint_path)
df_predictions = pd.DataFrame(predictions_list, columns=["track", "prediction"])
df_predictions.to_csv("prediction_09_40_2.csv", index=False)

Generating predictions:   0%|          | 0/25580 [00:00<?, ?it/s]