In [1]:
import pandas as pd
import numpy as np
import re
import os
import torch
from ast import literal_eval
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder, MultiLabelBinarizer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import MeanSquaredError
from tensorflow.keras import models, layers
from gensim.models import Word2Vec
from transformers import DistilBertTokenizer, DistilBertModel


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('/Users/louishagenbucher/Documents/GitHub/mtg_nlp_price_prediction/data/clean_cards.csv')
df.sample(5)

Unnamed: 0,cmc,colors,legendary,types,sub_types,oracle_text,keywords,power,toughness,rarity,reserved,released_at,booster,usd
148,5,"['B', 'R', 'U']",True,['Creature'],"['Human', 'Pirate']","When Admiral Brass, Unsinkable enters the batt...",['Mill'],3,3,mythic,False,10943,False,0.48
10392,3,['G'],False,['Enchantment'],['none'],"At the beginning of combat on your turn, targe...",['no keywords'],0,0,rare,False,10215,True,0.31
22468,3,['R'],False,['Creature'],"['Bird', 'Beast']","Flying\nAt the beginning of your upkeep, if a ...",['Flying'],2,3,uncommon,False,3232,True,0.1
761,3,['G'],False,['Creature'],"['Frog', 'Beast']",Threshold — Anurid Barkripper gets +2/+2 as lo...,['Threshold'],2,2,common,False,3099,True,0.14
1268,2,['W'],False,['Instant'],['none'],Put target enchantment on top of its owner's l...,['Cycling'],0,0,uncommon,False,3232,True,0.15


#### Functions and models used for data processing

In [3]:
df['colors'] = df['colors'].apply(literal_eval)

In [4]:
mlb = MultiLabelBinarizer()
df= df.join(pd.DataFrame(mlb.fit_transform(df['colors']),
                                      columns=mlb.classes_,
                                      index=df.index))
legend_lb= LabelBinarizer()
df = df.join(pd.DataFrame(legend_lb.fit_transform(df['legendary']), columns=['legendary'])
             .add_prefix('_'))
booster_lb= LabelBinarizer()
df = df.join(pd.DataFrame(legend_lb.fit_transform(df['booster']), columns=['booster'])
             .add_prefix('_'))
reserved_lb= LabelBinarizer()
df = df.join(pd.DataFrame(legend_lb.fit_transform(df['reserved']), columns=['resrved'])
             .add_prefix('_'))

df = df.drop(['legendary', 'booster', 'reserved', 'colors'], axis=1)
df.head(3)

Unnamed: 0,cmc,types,sub_types,oracle_text,keywords,power,toughness,rarity,released_at,usd,B,C,G,N,R,U,W,_legendary,_booster,_resrved
0,2,['Artifact'],['Equipment'],Equipped creature gets +2/+2.\nEquip {3} ({3}:...,['Equip'],0,0,common,10096,0.02,0,0,0,0,0,0,1,0,1,0
1,4,['Creature'],"['Bird', 'Rogue']",Flying\nWhen Aarakocra Sneak enters the battle...,['Flying'],1,4,common,10418,0.06,0,0,0,0,0,1,0,0,1,0
2,5,['Creature'],"['Astartes', 'Warrior']",Trample\nMark of Chaos Ascendant — During your...,"['Mark of Chaos Ascendant', 'Trample']",5,5,mythic,10537,2.81,1,0,0,0,1,1,0,1,0,0


##### Word 2 Vec Models

In [5]:
subtype_model = Word2Vec(sentences=df['sub_types'], vector_size=100, window=5, min_count=1, workers=4)
type_model = Word2Vec(sentences=df['types'], vector_size=100, window=5, min_count=1, workers=4)
keyword_model = Word2Vec(sentences=df['keywords'], vector_size=100, window=5, min_count=1, workers=4)

In [6]:
df['sub_types']=df['sub_types'].apply(lambda subtypes: np.sum([subtype_model.wv[word] for word in subtypes], axis=0))
df['types']=df['types'].apply(lambda types: np.sum([type_model.wv[word] for word in types], axis=0))
df['keywords']=df['keywords'].apply(lambda keywords: np.sum([keyword_model.wv[word] for word in keywords], axis=0))

#### Oracle text processing

In [7]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

def get_embeddings(sentence):
    inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].tolist()

In [8]:
df['text_embedding'] = df['oracle_text'].apply(get_embeddings)
df.drop(columns='oracle_text', inplace=True)

In [9]:
def reshape_array(arr):
    return arr.reshape(768,)

In [10]:
df['text_embedding']=df['text_embedding'].apply(np.array).apply(reshape_array)
embedding_columns = ['types', 'sub_types', 'keywords', 'text_embedding']
df['combined_embedding'] = df[embedding_columns].apply(np.concatenate, axis=1)
df.drop(columns=embedding_columns, inplace=True)
df.head(3)

  df['combined_embedding'] = df[embedding_columns].apply(np.concatenate, axis=1)


Unnamed: 0,cmc,power,toughness,rarity,released_at,usd,B,C,G,N,R,U,W,_legendary,_booster,_resrved,combined_embedding
0,2,0,0,common,10096,0.02,0,0,0,0,0,0,1,0,1,0,"[1.6690466403961182, -1.7796576023101807, -1.3..."
1,4,1,4,common,10418,0.06,0,0,0,0,0,1,0,0,1,0,"[-1.5730993747711182, 2.6151950359344482, -2.9..."
2,5,5,5,mythic,10537,2.81,1,0,0,0,1,1,0,1,0,0,"[-1.5730993747711182, 2.6151950359344482, -2.9..."


#### The Pipeline and neural network

In [11]:
X = df.drop(columns='usd')
y = df['usd']

In [12]:
numerical_features = ['power', 'toughness', 'released_at', 'cmc']
categorical_features = ['rarity']
remaining_features = X.drop(columns=['rarity', 'power', 'toughness', 'released_at', 'cmc'])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ],
    remainder='passthrough'
)

pipeline = make_pipeline(preprocessor)

In [13]:
X_transformed = pipeline.fit_transform(X)

In [14]:
column_names = pipeline.get_feature_names_out()
column_names

array(['num__power', 'num__toughness', 'num__released_at', 'num__cmc',
       'cat__rarity_common', 'cat__rarity_mythic', 'cat__rarity_rare',
       'cat__rarity_special', 'cat__rarity_uncommon', 'remainder__B',
       'remainder__C', 'remainder__G', 'remainder__N', 'remainder__R',
       'remainder__U', 'remainder__W', 'remainder___legendary',
       'remainder___booster', 'remainder___resrved',
       'remainder__combined_embedding'], dtype=object)

In [19]:
processed_df = pd.DataFrame(X_transformed, columns=column_names)
processed_df.head()

Unnamed: 0,num__power,num__toughness,num__released_at,num__cmc,cat__rarity_common,cat__rarity_mythic,cat__rarity_rare,cat__rarity_special,cat__rarity_uncommon,remainder__B,remainder__C,remainder__G,remainder__N,remainder__R,remainder__U,remainder__W,remainder___legendary,remainder___booster,remainder___resrved,remainder__combined_embedding
0,-0.787117,-0.817482,0.832342,-0.743662,1.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,1,0,1,0,"[1.6690466403961182, -1.7796576023101807, -1.3..."
1,-0.237207,1.302307,0.931314,0.397975,1.0,0.0,0.0,0.0,0.0,0,0,0,0,0,1,0,0,1,0,"[-1.5730993747711182, 2.6151950359344482, -2.9..."
2,1.962433,1.832254,0.967891,0.968794,0.0,1.0,0.0,0.0,0.0,1,0,0,0,1,1,0,1,0,0,"[-1.5730993747711182, 2.6151950359344482, -2.9..."
3,-0.787117,-0.817482,-1.391463,-1.885299,1.0,0.0,0.0,0.0,0.0,0,0,0,1,0,0,0,0,1,0,"[3.3952622413635254, -1.18755304813385, -0.136..."
4,-0.787117,-0.817482,0.690338,-0.172843,0.0,0.0,1.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,"[1.6690466403961182, -1.7796576023101807, -1.3..."


In [20]:
shapes = np.array([np.shape(item) for item in processed_df['remainder__combined_embedding']])
shapes[0]

array([1068])

In [21]:
X_numerical = processed_df.drop(columns='remainder__combined_embedding')
X_text =processed_df['remainder__combined_embedding']

In [22]:
X_numerical_train, X_numerical_test, X_text_train, X_text_test, y_train, y_test = train_test_split(
    X_numerical, X_text, y, test_size=0.30, random_state=42

)

In [23]:
X_text_train[0].shape

(1068,)

In [25]:
X_numerical_train.shape

(17881, 19)

In [27]:
def initialize_model(num_numerical_features, text_feature_shape):
    metrics = [metrics.MeanSquaredError(name='mse')]

    numerical_input = layers.Input(shape=(num_numerical_features,), name='numerical_input')
    x_numerical = layers.Dense(64, activation='relu')(numerical_input)
    x_numerical = layers.Dense(32, activation='relu')(x_numerical)

    text_input = layers.Input(shape=(text_feature_shape,), name='text_input')
    x_text = layers.Dense(32, activation='relu')(text_input)

    concatenated = layers.concatenate([x_numerical, x_text])

    x = layers.Dense(32, activation='relu')(concatenated)

    output = layers.Dense(1, activation='relu')(x)

    model = models.Model(inputs=[numerical_input, text_input], outputs=output)

    model.compile(loss='mean_squared_error',
                  optimizer='adam',
                  metrics=metrics)

    return model


In [None]:
def get_history(model):

        es = EarlyStopping(patience = 10, restore_best_weights=False)

        history = model.fit([X_numerical_train, X_text_train],
            y_train,
            epochs=100,
            batch_size=4,
            validation_split = 0.2,
            callbacks=[es],
            verbose=1)

        return history

In [None]:
model = initialize_model(19, 1068)

In [None]:
prediction = get_history(model)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type numpy.ndarray).