In [1]:
import pandas as pd
import numpy as np
import re
import os
import torch
from ast import literal_eval
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder, MultiLabelBinarizer, FunctionTransformer, StandardScaler
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from gensim.models import Word2Vec
from transformers import DistilBertTokenizer, DistilBertModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('/Users/louishagenbucher/Documents/GitHub/mtg_nlp_price_prediction/data/clean_cards.csv')
df.sample(5)

Unnamed: 0,cmc,colors,legendary,types,sub_types,oracle_text,keywords,power,toughness,rarity,reserved,released_at,booster,usd
9300,1,['B'],False,['Creature'],"['Human', 'Pirate']","When Greedy Freebooter dies, scry 1 and create...","['Treasure', 'Scry']",1,1,common,False,10943,True,0.1
6834,3,['C'],False,"['Artifact', 'Creature']",['Soldier'],Metalcraft — Etched Champion has protection fr...,['Metalcraft'],2,2,rare,False,7842,True,0.67
17254,4,['R'],False,['Sorcery'],['none'],Choose one — • Destroy target Plains and targe...,['no keywords'],0,0,uncommon,False,1042,True,0.24
4554,3,['U'],False,"['Enchantment', 'Creature']",['Nautilus'],Bestow 3 Blue Blue When Crystalline Nautilus...,['Bestow'],4,4,uncommon,False,7457,True,0.07
20619,4,['R'],False,['Enchantment'],['Aura'],"Enchant creature Enchanted creature has ""Tap: ...",['Enchant'],0,0,rare,False,7842,True,8.42


#### Functions and models I am going to use in my pipeline

##### Models

In [3]:
subtype_model = Word2Vec(sentences=df['sub_types'], vector_size=100, window=5, min_count=1, workers=4)
type_model = Word2Vec(sentences=df['types'], vector_size=100, window=5, min_count=1, workers=4)
keyword_model = Word2Vec(sentences=df['keywords'], vector_size=100, window=5, min_count=1, workers=4)

oracle_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
oracle_model = DistilBertModel.from_pretrained('distilbert-base-uncased')

##### embedding extrators

In [4]:
def word2vec_subtype_transformer(subtypes):
    return np.array([subtype_model.wv[subtype] if subtype in subtype_model.wv else np.zeros(100) for subtype in subtypes])

def word2vec_type_transformer(types):
    return np.array([type_model.wv[type] if type in type_model.wv else np.zeros(100) for type in types])

def word2vec_keywords_transformer(keywords):
    return np.array([keyword_model.wv[keyword] if keyword in keyword_model.wv else np.zeros(100) for keyword in keywords])


In [5]:
def distilbert_transformer(texts):
    embeddings = []
    for text in texts:
        inputs = oracle_tokenizer(text, return_tensors='pt', truncation=True, padding=True)
        with torch.no_grad():
            outputs = oracle_model(**inputs)
        embedding = outputs.last_hidden_state[:, 0, :].tolist()
        embeddings.append(embedding[0])
    return np.array(embeddings)

##### Post-embedding function

In [6]:
def reshape_array(arr):
    return arr.reshape(768,)
def concatenate_embeddings(embeddings):
    return np.concatenate(embeddings, axis=1)

##### Label Binarizer does not like cooperating with the pipeline, importing as csv also chnages some of the formatting

In [7]:
df['colors'] = df['colors'].apply(literal_eval)
df['colors'].head(3)

0          [W]
1          [U]
2    [B, R, U]
Name: colors, dtype: object

In [8]:
mlb = MultiLabelBinarizer()
df= df.join(pd.DataFrame(mlb.fit_transform(df['colors']),
                                      columns=mlb.classes_,
                                      index=df.index))
df.drop(columns='colors', inplace=True)
df.head(3)

Unnamed: 0,cmc,legendary,types,sub_types,oracle_text,keywords,power,toughness,rarity,reserved,released_at,booster,usd,B,C,G,N,R,U,W
0,2,False,['Artifact'],['Equipment'],Equipped creature gets +2/+2. Equip 3,['Equip'],0,0,common,False,10096,True,0.02,0,0,0,0,0,0,1
1,4,False,['Creature'],"['Bird', 'Rogue']",Flying When Aarakocra Sneak enters the battlef...,['Flying'],1,4,common,False,10418,True,0.06,0,0,0,0,0,1,0
2,5,True,['Creature'],"['Astartes', 'Warrior']",Trample Mark of Chaos Ascendant — During your ...,"['Mark of Chaos Ascendant', 'Trample']",5,5,mythic,False,10537,False,2.81,1,0,0,0,1,1,0


In [9]:
legend_lb= LabelBinarizer()
df = df.join(pd.DataFrame(legend_lb.fit_transform(df['legendary']), columns=['legendary'])
             .add_prefix('_'))
df = df.drop('legendary', axis=1)
df.head(3)

Unnamed: 0,cmc,types,sub_types,oracle_text,keywords,power,toughness,rarity,reserved,released_at,booster,usd,B,C,G,N,R,U,W,_legendary
0,2,['Artifact'],['Equipment'],Equipped creature gets +2/+2. Equip 3,['Equip'],0,0,common,False,10096,True,0.02,0,0,0,0,0,0,1,0
1,4,['Creature'],"['Bird', 'Rogue']",Flying When Aarakocra Sneak enters the battlef...,['Flying'],1,4,common,False,10418,True,0.06,0,0,0,0,0,1,0,0
2,5,['Creature'],"['Astartes', 'Warrior']",Trample Mark of Chaos Ascendant — During your ...,"['Mark of Chaos Ascendant', 'Trample']",5,5,mythic,False,10537,False,2.81,1,0,0,0,1,1,0,1


In [10]:
booster_lb= LabelBinarizer()
df = df.join(pd.DataFrame(legend_lb.fit_transform(df['booster']), columns=['booster'])
             .add_prefix('_'))
df = df.drop('booster', axis=1)
df.head(3)

Unnamed: 0,cmc,types,sub_types,oracle_text,keywords,power,toughness,rarity,reserved,released_at,usd,B,C,G,N,R,U,W,_legendary,_booster
0,2,['Artifact'],['Equipment'],Equipped creature gets +2/+2. Equip 3,['Equip'],0,0,common,False,10096,0.02,0,0,0,0,0,0,1,0,1
1,4,['Creature'],"['Bird', 'Rogue']",Flying When Aarakocra Sneak enters the battlef...,['Flying'],1,4,common,False,10418,0.06,0,0,0,0,0,1,0,0,1
2,5,['Creature'],"['Astartes', 'Warrior']",Trample Mark of Chaos Ascendant — During your ...,"['Mark of Chaos Ascendant', 'Trample']",5,5,mythic,False,10537,2.81,1,0,0,0,1,1,0,1,0


In [11]:
reserved_lb= LabelBinarizer()
df = df.join(pd.DataFrame(legend_lb.fit_transform(df['reserved']), columns=['resrved'])
             .add_prefix('_'))
df = df.drop('reserved', axis=1)
df.head(3)

Unnamed: 0,cmc,types,sub_types,oracle_text,keywords,power,toughness,rarity,released_at,usd,B,C,G,N,R,U,W,_legendary,_booster,_resrved
0,2,['Artifact'],['Equipment'],Equipped creature gets +2/+2. Equip 3,['Equip'],0,0,common,10096,0.02,0,0,0,0,0,0,1,0,1,0
1,4,['Creature'],"['Bird', 'Rogue']",Flying When Aarakocra Sneak enters the battlef...,['Flying'],1,4,common,10418,0.06,0,0,0,0,0,1,0,0,1,0
2,5,['Creature'],"['Astartes', 'Warrior']",Trample Mark of Chaos Ascendant — During your ...,"['Mark of Chaos Ascendant', 'Trample']",5,5,mythic,10537,2.81,1,0,0,0,1,1,0,1,0,0


In [48]:
df.columns

Index(['cmc', 'types', 'sub_types', 'oracle_text', 'keywords', 'power',
       'toughness', 'rarity', 'released_at', 'usd', 'B', 'C', 'G', 'N', 'R',
       'U', 'W', '_legendary', '_booster', '_resrved'],
      dtype='object')

#### The Pipeline

In [16]:
rarity_preprocessor = ColumnTransformer([
    ('categorical', OneHotEncoder(), ['rarity'])],
    remainder='passthrough'
)
numeric_preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', StandardScaler(), ['power', 'toughness', 'released_at']),
    ]
)
text_preprocessor = make_pipeline(
    ColumnTransformer([
        ('word2vec_subtype', FunctionTransformer(word2vec_subtype_transformer, validate=False), ['sub_types']),
        ('word2vec_type', FunctionTransformer(word2vec_type_transformer, validate=False), ['types']),
        ('word2vec_keywords', FunctionTransformer(word2vec_keywords_transformer, validate=False), ['keywords']),
        ('distilbert', FunctionTransformer(distilbert_transformer, validate=False), ['oracle_text']),
    ], remainder='passthrough')
)
combined_pipeline = make_pipeline(
    ColumnTransformer(
        transformers=[
            ('num',numeric_preprocessor, ['power', 'toughness', 'released_at']),
            ('text', text_preprocessor, ['sub_types', 'types', 'keywords', 'oracle_text']),
            ('rarity', rarity_preprocessor,['rarity']),
            ('dummy', 'passthrough', ['sub_types', 'types', 'keywords', 'oracle_text'])
        ]
    ),
    FeatureUnion(
        transformer_list=[
            ('concatenate_embeddings', FunctionTransformer(concatenate_embeddings, validate=False), ['sub_types', 'types', 'keywords', 'oracle_text'])
        ]
    )
)

In [17]:
processed_df = combined_pipeline.fit_transform(df)

ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 25424 and the array at index 1 has size 1