In [1]:
import pandas as pd
import numpy as np
import re
import os
import torch
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder, MultiLabelBinarizer, FunctionTransformer, StandardScaler
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from gensim.models import Word2Vec
from transformers import DistilBertTokenizer, DistilBertModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('/Users/louishagenbucher/Documents/GitHub/mtg_nlp_price_prediction/data/clean_cards.csv')
df.sample(5)

Unnamed: 0,cmc,colors,legendary,types,sub_types,oracle_text,keywords,power,toughness,rarity,reserved,released_at,booster,usd
14419,3.0,['B'],True,['Creature'],"['Rat', 'Ninja']","Ninjutsu {3}{B} ({3}{B}, Return an unblocked a...",['Ninjutsu'],3,2,mythic,False,2022-02-18,True,2.07
18897,3.0,['C'],False,['Artifact'],['none'],"{T}: Add {G} or {W}.\n{G/W}{G/W}{G/W}{G/W}, {T...",['no keywords'],0,0,common,False,2018-10-05,True,0.07
6598,2.0,['C'],False,['Artifact'],['none'],"{2}, {T}: Draw a card. Activate only if you co...",['no keywords'],0,0,rare,False,2023-08-04,False,0.72
9447,2.0,['G'],False,['Instant'],['none'],Target creature gets +3/+3 until end of turn. ...,['no keywords'],0,0,common,False,2019-07-12,True,0.02
18591,4.0,['R'],False,['Creature'],['Elemental'],"Bloodrush — {1}{R}{R}, Discard Scorchwalker: T...",['Bloodrush'],5,1,common,False,2013-02-01,True,0.03


#### Functions and models I am going to use in my pipeline

##### Models

In [3]:
subtype_model = Word2Vec(sentences=df['sub_types'], vector_size=100, window=5, min_count=1, workers=4)
type_model = Word2Vec(sentences=df['types'], vector_size=100, window=5, min_count=1, workers=4)
keyword_model = Word2Vec(sentences=df['keywords'], vector_size=100, window=5, min_count=1, workers=4)

oracle_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
oracle_model = DistilBertModel.from_pretrained('distilbert-base-uncased')

##### word2vec functions

In [4]:
def word2vec_subtype_transformer(subtypes):
    return np.array([subtype_model.wv[subtype] if subtype in subtype_model.wv else np.zeros(100) for subtype in subtypes])

def word2vec_type_transformer(types):
    return np.array([type_model.wv[type] if type in type_model.wv else np.zeros(100) for type in types])

def word2vec_keywords_transformer(keywords):
    return np.array([keyword_model.wv[keyword] if keyword in keyword_model.wv else np.zeros(100) for keyword in keywords])


##### bert model and oracle_text functions

In [5]:
def process_oracle_text(oracle_text):

    color_mapping = {'{R}': 'Red ', '{U}': 'Blue ', '{G}': 'Green ', '{B}': 'Black ', '{W}': 'White ',
    '{C}': 'Colorless '}

    for token, color in color_mapping.items():
        oracle_text = oracle_text.replace(token, color)

    oracle_text = re.sub(r'\.', '. ', oracle_text)
    oracle_text = re.sub(r'{(\d+)}', lambda x:f'{x.group(1)} ', oracle_text)
    oracle_text = re.sub(r'{T}', 'Tap', oracle_text)
    oracle_text = re.sub(r'{X}', 'X ', oracle_text)
    oracle_text = re.sub("[\(\[].*?[\)\]]", "", oracle_text)
    oracle_text = oracle_text.replace('\n', ' ')

    return oracle_text

def distilbert_transformer(texts):
    embeddings = []
    for text in texts:
        inputs = oracle_tokenizer(text, return_tensors='pt', truncation=True, padding=True)
        with torch.no_grad():
            outputs = oracle_model(**inputs)
        embedding = outputs.last_hidden_state[:, 0, :].tolist()
        embeddings.append(embedding[0])
    return np.array(embeddings)

##### Post-embedding function

In [6]:
def reshape_array(arr):
    return arr.reshape(768,)
def concatenate_embeddings(embeddings):
    return np.concatenate(embeddings, axis=1)

#### The Pipeline

In [15]:
numeric_transformer = make_pipeline(
    FunctionTransformer(lambda x: x.astype(int), validate=False),
    FunctionTransformer(lambda x: x.fillna(-1), validate=False)
)
date_transformer = make_pipeline(
    FunctionTransformer(lambda x: pd.to_datetime(x), validate=False),
    FunctionTransformer(lambda x: (x - x.min()).dt.days, validate=False),
    StandardScaler()
)
numeric_preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, ['power', 'toughness']),
        ('date', date_transformer, ['released_at'])
    ]
)
numeric_preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, ['power', 'toughness'])
    ]
)
numeric_preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, ['power', 'toughness', 'cmc']),
        ('date', date_transformer, ['released_at'])
    ]
)
non_text_preprocessor = ColumnTransformer(
    transformers=[
        ('categorical', OneHotEncoder(), ['rarity']),
        ('colors', MultiLabelBinarizer(), ['colors']),
         ('legendary', LabelBinarizer(), ['legendary']),
        ('reserved', LabelBinarizer(), ['reserved']),
        ('booster', LabelBinarizer(), ['booster'])
    ],
    remainder='passthrough'
)
text_preprocessor = make_pipeline(
    FunctionTransformer(process_oracle_text, validate=False),
    ColumnTransformer([
        ('word2vec_subtype', FunctionTransformer(word2vec_subtype_transformer, validate=False), ['sub_types']),
        ('word2vec_type', FunctionTransformer(word2vec_type_transformer, validate=False), ['types']),
        ('word2vec_keywords', FunctionTransformer(word2vec_keywords_transformer, validate=False), ['keywords']),
        ('distilbert', FunctionTransformer(distilbert_transformer, validate=False), ['oracle_text']),
        ('reshape_distilbert', FunctionTransformer(reshape_array, validate=False), 'distilbert')
    ], remainder='passthrough')
)

combined_pipeline = make_pipeline(
    ColumnTransformer(
        transformers=[
            ('non_text',non_text_preprocessor, ['rarity', 'colors', 'legendary', 'reserved', 'booster']),
            ('num',numeric_preprocessor, ['power', 'toughness', 'released_at']),
            ('text', text_preprocessor, ['sub_types', 'types', 'keywords', 'oracle_text']),
            ('dummy', 'passthrough', ['sub_types', 'types', 'keywords', 'oracle_text'])
        ]
    ),
    FeatureUnion(
        transformer_list=[
            ('concatenate_embeddings', FunctionTransformer(concatenate_embeddings, validate=False), ['sub_types', 'types', 'keywords', 'oracle_text'])
        ]
    )
)

In [16]:
processed_df = combined_pipeline.fit_transform(df)

TypeError: MultiLabelBinarizer.fit_transform() takes 2 positional arguments but 3 were given