In [1]:
import pandas as pd
import numpy as np
import re
import os
import torch
from ast import literal_eval
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder, MultiLabelBinarizer, FunctionTransformer, StandardScaler
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from gensim.models import Word2Vec
from transformers import DistilBertTokenizer, DistilBertModel
from concurrent.futures import ThreadPoolExecutor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('/Users/louishagenbucher/Documents/GitHub/mtg_nlp_price_prediction/data/clean_cards.csv')
df.sample(5)

Unnamed: 0,name,cmc,colors,legendary,types,sub_types,oracle_text,keywords,power,toughness,rarity,reserved,released_at,booster,usd
5527,Diregraf Ghoul,1,['B'],False,['Creature'],['Zombie'],Diregraf Ghoul enters the battlefield tapped.,['no keywords'],2,2,uncommon,False,8990,True,0.06
21759,Take Up Arms,5,['W'],False,['Instant'],['none'],Create three 1/1 white Warrior creature tokens.,['no keywords'],0,0,uncommon,False,8955,True,0.03
3852,Cloud Crusader,4,['W'],False,['Creature'],"['Human', 'Knight']",Flying\nFirst strike (This creature deals comb...,"['Flying', 'First strike']",2,3,common,False,6071,True,0.13
14394,Mystic Melting,4,['G'],False,['Instant'],['none'],Destroy target artifact or enchantment.\nDraw ...,['no keywords'],0,0,uncommon,False,4615,True,0.23
21479,Surestrike Trident,2,['C'],False,['Artifact'],['Equipment'],"Equipped creature has first strike and ""{T}, U...",['Equip'],0,0,uncommon,False,3719,True,0.74


#### Functions and models used for data processing

In [3]:
df['colors'] = df['colors'].apply(literal_eval)

In [4]:
mlb = MultiLabelBinarizer()
df= df.join(pd.DataFrame(mlb.fit_transform(df['colors']),
                                      columns=mlb.classes_,
                                      index=df.index))
legend_lb= LabelBinarizer()
df = df.join(pd.DataFrame(legend_lb.fit_transform(df['legendary']), columns=['legendary'])
             .add_prefix('_'))
booster_lb= LabelBinarizer()
df = df.join(pd.DataFrame(legend_lb.fit_transform(df['booster']), columns=['booster'])
             .add_prefix('_'))
reserved_lb= LabelBinarizer()
df = df.join(pd.DataFrame(legend_lb.fit_transform(df['reserved']), columns=['resrved'])
             .add_prefix('_'))

df = df.drop(['legendary', 'booster', 'reserved', 'colors'], axis=1)
df.head(3)

Unnamed: 0,name,cmc,types,sub_types,oracle_text,keywords,power,toughness,rarity,released_at,...,B,C,G,N,R,U,W,_legendary,_booster,_resrved
0,+2 Mace,2,['Artifact'],['Equipment'],Equipped creature gets +2/+2.\nEquip {3} ({3}:...,['Equip'],0,0,common,10096,...,0,0,0,0,0,0,1,0,1,0
1,Aarakocra Sneak,4,['Creature'],"['Bird', 'Rogue']",Flying\nWhen Aarakocra Sneak enters the battle...,['Flying'],1,4,common,10418,...,0,0,0,0,0,1,0,0,1,0
2,Abaddon the Despoiler,5,['Creature'],"['Astartes', 'Warrior']",Trample\nMark of Chaos Ascendant — During your...,"['Mark of Chaos Ascendant', 'Trample']",5,5,mythic,10537,...,1,0,0,0,1,1,0,1,0,0


##### Word 2 Vec Models

In [5]:
subtype_model = Word2Vec(sentences=df['sub_types'], vector_size=100, window=5, min_count=1, workers=4)
type_model = Word2Vec(sentences=df['types'], vector_size=100, window=5, min_count=1, workers=4)
keyword_model = Word2Vec(sentences=df['keywords'], vector_size=100, window=5, min_count=1, workers=4)

In [6]:
df['sub_types']=df['sub_types'].apply(lambda subtypes: np.sum([subtype_model.wv[word] for word in subtypes], axis=0))
df['types']=df['types'].apply(lambda types: np.sum([type_model.wv[word] for word in types], axis=0))
df['keywords']=df['keywords'].apply(lambda keywords: np.sum([keyword_model.wv[word] for word in keywords], axis=0))

#### Oracle text processing

In [7]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

def get_embeddings(sentence):
    inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].tolist()

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25545 entries, 0 to 25544
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   name         25545 non-null  object 
 1   cmc          25545 non-null  int64  
 2   types        25545 non-null  object 
 3   sub_types    25545 non-null  object 
 4   oracle_text  25545 non-null  object 
 5   keywords     25545 non-null  object 
 6   power        25545 non-null  int64  
 7   toughness    25545 non-null  int64  
 8   rarity       25545 non-null  object 
 9   released_at  25545 non-null  int64  
 10  usd          25489 non-null  float64
 11  B            25545 non-null  int64  
 12  C            25545 non-null  int64  
 13  G            25545 non-null  int64  
 14  N            25545 non-null  int64  
 15  R            25545 non-null  int64  
 16  U            25545 non-null  int64  
 17  W            25545 non-null  int64  
 18  _legendary   25545 non-null  int64  
 19  _boo

In [9]:
df['text_embedding'] = df['oracle_text'].apply(get_embeddings)

In [None]:
df= df[['cmc', 'B', 'C', 'G', 'N', 'R', 'U', 'W', 'legendary_',
       'power','toughness', 'reserved_', 'rarity_common', 'rarity_mythic',
       'rarity_rare', 'rarity_special', 'rarity_uncommon', 'released_at',
       'booster_','types', 'sub_type', 'keywords',
       'text_embedding', 'usd']]

#### The Pipeline

In [None]:
rarity_preprocessor = ColumnTransformer([
    ('categorical', OneHotEncoder(), ['rarity'])],
    remainder='passthrough'
)
numeric_preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', StandardScaler(), ['cmc', 'power', 'toughness', 'released_at']),
    ]
)
combined_pipeline = make_pipeline(
    ColumnTransformer(
        transformers=[
            ('num',numeric_preprocessor, ['power', 'toughness', 'released_at']),
            ('rarity', rarity_preprocessor,['rarity']),
        ]
    )
)


In [None]:
##### Post-embedding function
def reshape_array(arr):
    return arr.reshape(768,)
def concatenate_embeddings(embeddings):
    return np.column_stack(embeddings, axis=1)

In [None]:
X = df.drop(columns='usd')
X_processed = combined_pipeline.fit_transform(X)

ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 25424 and the array at index 1 has size 1