In [8]:
from torch.utils.data import Dataset
from transformers import AutoTokenizer
from MTGpred.utils.mtgjson import simplify_name, parse_mana_cost
import re
import torch
import pandas as pd


class AllPickedCardsDataset(Dataset):
    def __init__(
        self,
        cards_df: pd.DataFrame,
        picked_cards: pd.DataFrame,
        model_name: str = "xlnet-base-cased",
        cased: bool = True,
        join_tokens: bool = True,
        max_length: int = 4096,
        truncation: bool = True,
        include_name: bool = True,
    ):
        """
        Dataset for decks classification

        Parameters
        ----------
        cards_df : pd.DataFrame
            Dataframe with all the cards
        picked_cards : pd.DataFrame
            Dataframe with all the picked cards
        model_name : str, optional
            Name of the transformer model, by default "xlnet-base-cased"
        cased : bool, optional
            Cased the card output or not, by default True
        join_tokens : bool, optional
            Join all texts and tokenize all or not, by default False
        max_length : int, optional
            Max length of the tokenizer, by default 256
        truncation : bool, optional
            Truncate the text or not, by default True
        """

        assert not join_tokens or (
            join_tokens and truncation
        ), "If join_tokens is True, truncation must be True"

        self.cards_df = cards_df
        self.data = picked_cards
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.cased = cased
        self.join_tokens = join_tokens
        self.max_length = max_length
        self.truncation = truncation
        self.include_name = include_name

    def __len__(self):
        return len(self.data)

    def preprocess_card(self, name: str):
        simplified_name = simplify_name(name)
        all_variations = []

        selected_card = self.cards_df[
            (self.cards_df["faceName"] == simplified_name)
            | (self.cards_df["name"] == simplified_name)
        ]
        if len(selected_card) == 0:
            print(
                f"WARNING: {name} cant be found in the database. Will be removed from the deck."
            )
            return []

        for index, variations in selected_card.iterrows():
            mana_cost = (
                parse_mana_cost(variations["manaCost"])
                if not pd.isna(variations["manaCost"])
                else ""
            )

            card_type = variations["type"]

            text = variations["text"] if not pd.isna(variations["text"]) else ""
            mana_in_text = re.findall(r"\{.*\}", text)
            for mana in mana_in_text:
                text = text.replace(mana, parse_mana_cost(mana))

            stats = f"{variations['power']} power, {variations['power']} power"

            if self.include_name:
                input_text = ". ".join([name, mana_cost, card_type, text, stats])
            else:
                input_text = ". ".join([mana_cost, card_type, text, stats])

            if not self.cased:
                input_text = input_text.lower()

            all_variations.append(input_text)

        return all_variations

    def get_tokenized_text(self, picks_data):
        all_cards = []

        for card in picks_data:
            all_cards.extend(self.preprocess_card(card))

        if self.join_tokens:
            all_cards = self.tokenizer.sep_token.join(all_cards)

            tokenized_deck = self.tokenizer(
                all_cards,
                padding="max_length",
                max_length=self.max_length,
                truncation=self.truncation,
            )
        else:
            tokenized_deck = []
            for card in all_cards:
                tokenized_deck.append(
                    self.tokenizer(
                        card,
                        return_tensors="pt",
                        padding="max_length",
                        max_length=self.max_length,
                        truncation=self.truncation,
                    )
                )

        return tokenized_deck

    def __getitem__(self, idx):
        picks_data = self.data.iloc[idx]

        picks = self.get_tokenized_text(picks_data["picks"])

        if self.join_tokens:
            picks["labels"] = torch.tensor(picks_data["wins"])

            return picks
        else:
            return {
                "input": picks,
                "label": picks_data["wins"],
                "mask_length": len(picks),
            }

In [9]:
import os
import pandas as pd


def get_drafted_data(data_folder: str, num_drafts: int = 10000):
    files = os.listdir(data_folder)
    total_drafts = 0
    drafts = []

    for f in files:
        df = pd.read_csv(os.path.join(data_folder, f))
        groups = df.groupby("draft_id")

        for draft_id, group in groups:
            drafts.append(
                pd.DataFrame(
                    {
                        "picks": [list(group["pick"])],
                        "wins": group["event_match_wins"].iloc[0],
                    }
                )
            )

        total_drafts += len(df["draft_id"].unique())

        if total_drafts >= num_drafts:
            break

    return pd.concat(drafts)


df = get_drafted_data(
    "C:/Users/javij/OneDrive/Escritorio/Proyectos/MTG_predictions/data/draft/draft_data/"
)

df

Unnamed: 0,picks,wins
0,"[Trostani, Three Whispers, Shock, Dog Walker, ...",4
0,"[Commercial District, Outrageous Robbery, Tunn...",6
0,"[Leering Onlooker, Deadly Cover-Up, Harried Dr...",1
0,"[Surveillance Monitor, Cold Case Cracker, Ceas...",6
0,"[Fugitive Codebreaker, Galvanize, Shock, Mistw...",4
...,...,...
0,"[Neighborhood Guardian, Seasoned Consultant, F...",1
0,"[Soul Enervation, Nightdrinker Moroii, Surveil...",2
0,"[Evidence Examiner, Axebane Ferox, Repulsive M...",1
0,"[Doppelgang, Hard Evidence, Tunnel Tipster, Vi...",7


In [10]:
df["wins"].value_counts()

1    2051
2    1926
0    1598
3    1566
4    1165
7    1061
5     806
6     565
Name: wins, dtype: int64

In [11]:
import json
from MTGpred.utils.mtgjson import load_cards_df
from sklearn.model_selection import train_test_split

cards_df = load_cards_df(data_path="../../data/AtomicCards.json")
train_cards, test_cards = train_test_split(df, test_size=0.2)

train_dataset = AllPickedCardsDataset(
    cards_df, train_cards, model_name="allenai/longformer-base-4096", cased=False
)
test_dataset = AllPickedCardsDataset(
    cards_df, test_cards, model_name="allenai/longformer-base-4096", cased=False
)

In [12]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
import wandb

accuracy = evaluate.load("accuracy")

# Cargar métricas
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")
precision = evaluate.load("precision")
recall = evaluate.load("recall")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    # Calcular métricas
    accuracy_result = accuracy.compute(predictions=predictions, references=labels)
    f1_result = f1.compute(predictions=predictions, references=labels, average="macro")
    precision_result = precision.compute(
        predictions=predictions, references=labels, average="macro"
    )
    recall_result = recall.compute(
        predictions=predictions, references=labels, average="macro"
    )

    wandb.log(
        {
            "confusion_matrix": wandb.plot.confusion_matrix(
                probs=None, y_true=labels, preds=predictions
            )
        }
    )

    return {
        "accuracy": accuracy_result["accuracy"],
        "f1": f1_result["f1"],
        "precision": precision_result["precision"],
        "recall": recall_result["recall"],
    }


model = AutoModelForSequenceClassification.from_pretrained(
    "allenai/longformer-base-4096", num_labels=df["wins"].nunique()
)

training_args = TrainingArguments(
    output_dir="../../models/draft-longformer",
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    load_best_model_at_end=True,
    gradient_checkpointing=True,
    logging_steps=10,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

wandb.init(project="MTGpred-Drafts", entity="javier-jimenez99")
trainer.train()
wandb.finish()

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', '

***** Running training *****
  Num examples = 8590
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 268
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


  0%|          | 0/268 [00:00<?, ?it/s]

Initializing global attention on CLS token...


RuntimeError: CUDA out of memory. Tried to allocate 1.13 GiB (GPU 0; 8.00 GiB total capacity; 3.85 GiB already allocated; 0 bytes free; 6.25 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF