In [203]:
import pandas as pd
import numpy as np
import re
import os
import torch
from ast import literal_eval
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder, MultiLabelBinarizer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import MeanSquaredError, Accuracy
from tensorflow.keras import models, layers
from tensorflow.keras.layers import Concatenate
from keras.models import Sequential
from gensim.models import Word2Vec
from transformers import DistilBertTokenizer, DistilBertModel


In [34]:
df = pd.read_csv('/Users/louishagenbucher/Documents/GitHub/mtg_nlp_price_prediction/data/clean_cards.csv')
df.sample(5)

Unnamed: 0,cmc,colors,legendary,types,sub_types,oracle_text,keywords,power,toughness,rarity,reserved,released_at,booster,usd
16437,2,['W'],False,['Enchantment'],['Aura'],Enchant creature\nAs Prismatic Ward enters the...,['Enchant'],0,0,common,False,1209,True,0.25
4577,4,['C'],False,"['Artifact', 'Creature']",['Construct'],Converge — Crystalline Crawler enters the batt...,['Converge'],1,1,rare,False,8955,False,6.96
11236,3,['C'],False,['Artifact'],['none'],"Whenever a creature you control dies, put two ...",['no keywords'],0,0,rare,False,7275,False,0.15
24350,4,['W'],False,['Creature'],"['Phyrexian', 'Fox']",Whenever one or more Phyrexians you control at...,['no keywords'],3,3,rare,False,10733,False,0.11
193,3,['W'],False,['Artifact'],['Vehicle'],"Flying\nWhenever Aerial Surveyor attacks, if d...","['Flying', 'Crew']",3,4,rare,False,10306,False,0.31


In [35]:
no_prices_df = df[df['usd'].isna()]

In [36]:
df.drop(df[df['usd'].isna()].index, inplace=True)
df.reset_index(drop=True, inplace=True)

#### Functions and models used for data processing

In [37]:
df['colors'] = df['colors'].apply(literal_eval)

In [38]:
mlb = MultiLabelBinarizer()
df= df.join(pd.DataFrame(mlb.fit_transform(df['colors']),
                                      columns=mlb.classes_,
                                      index=df.index))
legend_lb= LabelBinarizer()
df = df.join(pd.DataFrame(legend_lb.fit_transform(df['legendary']), columns=['legendary'])
             .add_prefix('_'))
booster_lb= LabelBinarizer()
df = df.join(pd.DataFrame(legend_lb.fit_transform(df['booster']), columns=['booster'])
             .add_prefix('_'))
reserved_lb= LabelBinarizer()
df = df.join(pd.DataFrame(legend_lb.fit_transform(df['reserved']), columns=['resrved'])
             .add_prefix('_'))

df = df.drop(['legendary', 'booster', 'reserved', 'colors'], axis=1)
df.head(3)

Unnamed: 0,cmc,types,sub_types,oracle_text,keywords,power,toughness,rarity,released_at,usd,B,C,G,N,R,U,W,_legendary,_booster,_resrved
0,2,['Artifact'],['Equipment'],Equipped creature gets +2/+2.\nEquip {3} ({3}:...,['Equip'],0,0,common,10096,0.02,0,0,0,0,0,0,1,0,1,0
1,4,['Creature'],"['Bird', 'Rogue']",Flying\nWhen Aarakocra Sneak enters the battle...,['Flying'],1,4,common,10418,0.06,0,0,0,0,0,1,0,0,1,0
2,5,['Creature'],"['Astartes', 'Warrior']",Trample\nMark of Chaos Ascendant — During your...,"['Mark of Chaos Ascendant', 'Trample']",5,5,mythic,10537,2.81,1,0,0,0,1,1,0,1,0,0


In [39]:
columns_to_array = ['B', 'C', 'G', 'N', 'R', 'U', 'W']
result_array = df[columns_to_array].values
df['color_array'] = result_array.tolist()
df = df.drop(columns=columns_to_array)
df.head(3)

Unnamed: 0,cmc,types,sub_types,oracle_text,keywords,power,toughness,rarity,released_at,usd,_legendary,_booster,_resrved,color_array
0,2,['Artifact'],['Equipment'],Equipped creature gets +2/+2.\nEquip {3} ({3}:...,['Equip'],0,0,common,10096,0.02,0,1,0,"[0, 0, 0, 0, 0, 0, 1]"
1,4,['Creature'],"['Bird', 'Rogue']",Flying\nWhen Aarakocra Sneak enters the battle...,['Flying'],1,4,common,10418,0.06,0,1,0,"[0, 0, 0, 0, 0, 1, 0]"
2,5,['Creature'],"['Astartes', 'Warrior']",Trample\nMark of Chaos Ascendant — During your...,"['Mark of Chaos Ascendant', 'Trample']",5,5,mythic,10537,2.81,1,0,0,"[1, 0, 0, 0, 1, 1, 0]"


##### Word 2 Vec Models

In [40]:
subtype_model = Word2Vec(sentences=df['sub_types'], vector_size=100, window=5, min_count=1, workers=4)
type_model = Word2Vec(sentences=df['types'], vector_size=100, window=5, min_count=1, workers=4)
keyword_model = Word2Vec(sentences=df['keywords'], vector_size=100, window=5, min_count=1, workers=4)

In [41]:
df['sub_types']=df['sub_types'].apply(lambda subtypes: np.sum([subtype_model.wv[word] for word in subtypes], axis=0))
df['types']=df['types'].apply(lambda types: np.sum([type_model.wv[word] for word in types], axis=0))
df['keywords']=df['keywords'].apply(lambda keywords: np.sum([keyword_model.wv[word] for word in keywords], axis=0))

In [177]:
len(subtype_model.wv.index_to_key), len(type_model.wv.index_to_key), len(keyword_model.wv.index_to_key)

(61, 34, 60)

#### Oracle text processing

In [42]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

def get_embeddings(sentence):
    inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].tolist()

In [43]:
df['text_embedding'] = df['oracle_text'].apply(get_embeddings)
df.drop(columns='oracle_text', inplace=True)

In [175]:
vocab_size = tokenizer.vocab_size
vocab_size

30522

In [44]:
def reshape_array(arr):
    return arr.reshape(768,)

In [45]:
df['text_embedding']=df['text_embedding'].apply(np.array).apply(reshape_array)
embedding_columns = ['types', 'sub_types', 'keywords', 'text_embedding']
df['combined_embedding'] = df[embedding_columns].apply(np.concatenate, axis=1)
df.drop(columns=embedding_columns, inplace=True)
df.head(3)

  df['combined_embedding'] = df[embedding_columns].apply(np.concatenate, axis=1)


Unnamed: 0,cmc,power,toughness,rarity,released_at,usd,_legendary,_booster,_resrved,color_array,combined_embedding
0,2,0,0,common,10096,0.02,0,1,0,"[0, 0, 0, 0, 0, 0, 1]","[2.2472825050354004, -2.729654312133789, -0.41..."
1,4,1,4,common,10418,0.06,0,1,0,"[0, 0, 0, 0, 0, 1, 0]","[-1.462926983833313, 0.47497642040252686, -3.7..."
2,5,5,5,mythic,10537,2.81,1,0,0,"[1, 0, 0, 0, 1, 1, 0]","[-1.462926983833313, 0.47497642040252686, -3.7..."


In [46]:
df['combined_embedding'][0].shape

(1068,)

In [47]:
def reshape_embedding(arr):
    return arr.reshape(-1, 1)

In [48]:
df['combined_embedding'] = df['combined_embedding'].apply(reshape_embedding)
df['combined_embedding'][0].shape

(1068, 1)

In [49]:
df['combined_embedding'][0]

array([[ 2.24728251],
       [-2.72965431],
       [-0.41337436],
       ...,
       [-0.12849715],
       [ 0.0057382 ],
       [ 0.44860914]])

#### The Pipeline and neural network

In [50]:
df = df[['cmc', 'power', 'toughness', 'released_at',
       '_legendary', '_booster', '_resrved', 'rarity', 'color_array',
       'combined_embedding', 'usd']]
df.head(3)

Unnamed: 0,cmc,power,toughness,released_at,_legendary,_booster,_resrved,rarity,color_array,combined_embedding,usd
0,2,0,0,10096,0,1,0,common,"[0, 0, 0, 0, 0, 0, 1]","[[2.2472825050354004], [-2.729654312133789], [...",0.02
1,4,1,4,10418,0,1,0,common,"[0, 0, 0, 0, 0, 1, 0]","[[-1.462926983833313], [0.47497642040252686], ...",0.06
2,5,5,5,10537,1,0,0,mythic,"[1, 0, 0, 0, 1, 1, 0]","[[-1.462926983833313], [0.47497642040252686], ...",2.81


In [178]:
df

Unnamed: 0,cmc,power,toughness,released_at,_legendary,_booster,_resrved,rarity,color_array,combined_embedding,usd
0,2,0,0,10096,0,1,0,common,"[0, 0, 0, 0, 0, 0, 1]","[[2.2472825050354004], [-2.729654312133789], [...",0.02
1,4,1,4,10418,0,1,0,common,"[0, 0, 0, 0, 0, 1, 0]","[[-1.462926983833313], [0.47497642040252686], ...",0.06
2,5,5,5,10537,1,0,0,mythic,"[1, 0, 0, 0, 1, 1, 0]","[[-1.462926983833313], [0.47497642040252686], ...",2.81
3,0,0,0,2861,0,1,0,common,"[0, 0, 0, 1, 0, 0, 0]","[[3.1191296577453613], [-1.3161295652389526], ...",0.14
4,3,0,0,9634,0,0,0,rare,"[0, 1, 0, 0, 0, 0, 0]","[[2.2472825050354004], [-2.729654312133789], [...",0.09
...,...,...,...,...,...,...,...,...,...,...,...
25484,5,7,2,7604,1,1,0,mythic,"[1, 0, 0, 0, 1, 0, 1]","[[-1.462926983833313], [0.47497642040252686], ...",0.42
25485,4,0,0,4258,0,1,0,rare,"[0, 0, 0, 0, 0, 1, 0]","[[5.839674472808838], [-4.085742950439453], [-...",0.82
25486,4,1,4,10446,1,1,0,rare,"[1, 0, 0, 0, 0, 1, 1]","[[-1.462926983833313], [0.47497642040252686], ...",0.53
25487,3,2,3,10376,1,0,0,rare,"[0, 0, 0, 0, 1, 0, 0]","[[-1.462926983833313], [0.47497642040252686], ...",0.25


In [179]:
def reshape_embedding(arr):
    return arr.reshape(-1, 1)

In [180]:
X = df.drop(columns='usd')
y = df['usd']

In [181]:
numerical_features = ['power', 'toughness', 'released_at', 'cmc']
categorical_features = ['rarity']
remaining_features = X.drop(columns=['rarity', 'power', 'toughness', 'released_at', 'cmc'])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ],
    remainder='passthrough'
)

pipeline = make_pipeline(preprocessor)

In [182]:
X_preprocessed= pipeline.fit_transform(X)

In [183]:
column_names = pipeline.get_feature_names_out()
column_names

array(['num__power', 'num__toughness', 'num__released_at', 'num__cmc',
       'cat__rarity_common', 'cat__rarity_mythic', 'cat__rarity_rare',
       'cat__rarity_special', 'cat__rarity_uncommon',
       'remainder___legendary', 'remainder___booster',
       'remainder___resrved', 'remainder__color_array',
       'remainder__combined_embedding'], dtype=object)

In [184]:
processed_df = pd.DataFrame(X_preprocessed, columns=column_names)
processed_df.head()

Unnamed: 0,num__power,num__toughness,num__released_at,num__cmc,cat__rarity_common,cat__rarity_mythic,cat__rarity_rare,cat__rarity_special,cat__rarity_uncommon,remainder___legendary,remainder___booster,remainder___resrved,remainder__color_array,remainder__combined_embedding
0,-0.787786,-0.818064,0.83164,-0.74473,1.0,0.0,0.0,0.0,0.0,0,1,0,"[0, 0, 0, 0, 0, 0, 1]","[[2.2472825050354004], [-2.729654312133789], [..."
1,-0.237532,1.302998,0.930864,0.397826,1.0,0.0,0.0,0.0,0.0,0,1,0,"[0, 0, 0, 0, 0, 1, 0]","[[-1.462926983833313], [0.47497642040252686], ..."
2,1.963486,1.833263,0.967534,0.969104,0.0,1.0,0.0,0.0,0.0,1,0,0,"[1, 0, 0, 0, 1, 1, 0]","[[-1.462926983833313], [0.47497642040252686], ..."
3,-0.787786,-0.818064,-1.397816,-1.887285,1.0,0.0,0.0,0.0,0.0,0,1,0,"[0, 0, 0, 1, 0, 0, 0]","[[3.1191296577453613], [-1.3161295652389526], ..."
4,-0.787786,-0.818064,0.689275,-0.173452,0.0,0.0,1.0,0.0,0.0,0,0,0,"[0, 1, 0, 0, 0, 0, 0]","[[2.2472825050354004], [-2.729654312133789], [..."


In [185]:
columns_to_array = ['cat__rarity_common', 'cat__rarity_mythic', 'cat__rarity_rare',
'cat__rarity_special', 'cat__rarity_uncommon']
result_array = processed_df[columns_to_array].values
processed_df['rarity_array'] = result_array.tolist()
processed_df = processed_df.drop(columns=columns_to_array)
processed_df = processed_df[['num__power', 'num__toughness', 'num__released_at', 'num__cmc',
       'remainder___legendary', 'remainder___booster', 'remainder___resrved',
       'remainder__color_array', 'rarity_array', 'remainder__combined_embedding'
       ]]

In [186]:
processed_df.head(3)

Unnamed: 0,num__power,num__toughness,num__released_at,num__cmc,remainder___legendary,remainder___booster,remainder___resrved,remainder__color_array,rarity_array,remainder__combined_embedding
0,-0.787786,-0.818064,0.83164,-0.74473,0,1,0,"[0, 0, 0, 0, 0, 0, 1]","[1.0, 0.0, 0.0, 0.0, 0.0]","[[2.2472825050354004], [-2.729654312133789], [..."
1,-0.237532,1.302998,0.930864,0.397826,0,1,0,"[0, 0, 0, 0, 0, 1, 0]","[1.0, 0.0, 0.0, 0.0, 0.0]","[[-1.462926983833313], [0.47497642040252686], ..."
2,1.963486,1.833263,0.967534,0.969104,1,0,0,"[1, 0, 0, 0, 1, 1, 0]","[0.0, 1.0, 0.0, 0.0, 0.0]","[[-1.462926983833313], [0.47497642040252686], ..."


In [187]:
processed_df[['num__power', 'num__toughness', 'num__released_at', 'num__cmc']] = processed_df[['num__power', 'num__toughness', 'num__released_at', 'num__cmc']].astype(float)

In [188]:
processed_df[['remainder___legendary', 'remainder___booster', 'remainder___resrved',]] = processed_df[['remainder___legendary', 'remainder___booster', 'remainder___resrved',]].astype(int)

In [189]:
processed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25489 entries, 0 to 25488
Data columns (total 10 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   num__power                     25489 non-null  float64
 1   num__toughness                 25489 non-null  float64
 2   num__released_at               25489 non-null  float64
 3   num__cmc                       25489 non-null  float64
 4   remainder___legendary          25489 non-null  int64  
 5   remainder___booster            25489 non-null  int64  
 6   remainder___resrved            25489 non-null  int64  
 7   remainder__color_array         25489 non-null  object 
 8   rarity_array                   25489 non-null  object 
 9   remainder__combined_embedding  25489 non-null  object 
dtypes: float64(4), int64(3), object(3)
memory usage: 1.9+ MB


In [190]:
X_numerical = processed_df[['num__power','num__toughness', 'num__released_at', 'num__cmc']]
X_color = processed_df[['remainder__color_array']]
X_rarity = processed_df[['rarity_array']]
X_boolean= processed_df[['remainder___legendary', 'remainder___booster', 'remainder___resrved']]
X_text = processed_df['remainder__combined_embedding']

In [191]:
X_numerical.shape, X_color.shape, X_rarity.shape, X_boolean.shape, X_text.shape

((25489, 4), (25489, 1), (25489, 1), (25489, 3), (25489,))

In [None]:
reshape(X_text)

In [192]:
X_numerical_train, X_numerical_test, X_color_train, X_color_test, X_rarity_train, X_rarity_test, X_boolean_train, X_boolean_test, X_text_train, X_text_test, y_train, y_test = train_test_split(
    X_numerical, X_color, X_rarity, X_boolean, X_text, y, test_size=0.30, random_state=42
)

In [193]:
X_text_train = np.array(X_text_train.tolist())
X_text_test = np.array(X_text_test.tolist())
X_color_train= np.array(X_text_test.tolist())
X_color_test= np.array(X_text_test.tolist())
X_rarity_train= np.array(X_text_test.tolist())
X_rarity_test= np.array(X_text_test.tolist())

y_train = y_train.astype('float32')
y_test = y_test.astype('float32')

#### Test Text Model

In [194]:
model = Sequential()

model.add(layers.Dense(64, activation='relu', input_shape=(1068,)))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(1, activation='linear'))

model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_18 (Dense)            (None, 64)                68416     
                                                                 
 dense_19 (Dense)            (None, 32)                2080      
                                                                 
 dense_20 (Dense)            (None, 1)                 33        
                                                                 
Total params: 70529 (275.50 KB)
Trainable params: 70529 (275.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [195]:
model.fit(X_text_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2d974c5e0>

In [196]:
loss, accuracy = model.evaluate(X_text_test, y_test)



In [206]:
def initialize_model():
    metrics_list = [MeanSquaredError(name='mse'), Accuracy(name='accuracy')]

    model = models.Sequential()

    model.add(layers.InputLayer(input_shape=(4,), name='input_numerical'))
    model.add(layers.Dense(64, activation='relu'))

    model.add(layers.InputLayer(input_shape=(1068,), name='Input_text'))
    model.add(layers.Dense(64, activation='relu'))

    model.add(layers.InputLayer(input_shape=(3,), name='input_boolean'))
    model.add(layers.Dense(16, activation='relu'))

    model.add(layers.InputLayer(input_shape=(1,), name='input_color'))
    model.add(layers.Dense(32, activation='relu'))

    model.add(layers.InputLayer(input_shape=(1,), name='input_rarity'))
    model.add(layers.Dense(32, activation='relu'))

    model.add(layers.Concatenate())

    model.add(layers.Dense(1, activation='linear'))

    model.compile(loss='mean_squared_error',
                  optimizer='adam',
                  metrics=metrics_list)


In [207]:
model = initialize_model()

ValueError: A `Concatenate` layer should be called on a list of at least 1 input. Received: input_shape=(None, 32)

In [198]:
def get_history():
    es = EarlyStopping(patience=10, restore_best_weights=False)

    history = model.fit(
        [X_numerical_train,  X_color_train, X_rarity_train, X_boolean_train, X_text_train],
        y_train,
        epochs=100,
        batch_size=4,
        validation_split=0.2,
        callbacks=[es],
        verbose=1
    )

    return history


In [199]:
model = initialize_model()
history = get_history()

UnboundLocalError: local variable 'dense_text' referenced before assignment