In [212]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from ast import literal_eval
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder, MultiLabelBinarizer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import MeanSquaredError, Accuracy
from tensorflow.keras import models, layers
from tensorflow.keras.layers import Concatenate
from keras.models import Sequential
from transformers import DistilBertTokenizer, DistilBertModel


In [213]:
df = pd.read_csv('/Users/louishagenbucher/Documents/GitHub/mtg_nlp_price_prediction/data/clean_cards.csv')
df.sample(5)

Unnamed: 0,cmc,colors,legendary,types,sub_types,oracle_text,keywords,power,toughness,rarity,reserved,released_at,booster,usd
21412,1,['C'],False,['Artifact'],['none'],"{2}, {T}, Sacrifice Sungrass Egg: Add {G}{W}. ...",['no keywords'],0,0,uncommon,False,2861,True,0.25
24292,2,['W'],False,['Creature'],"['Human', 'Soldier']","Banding (Any creatures with banding, and up to...","['Banding', 'Cumulative upkeep']",2,4,uncommon,False,1286,True,0.14
13647,2,['G'],False,['Instant'],['none'],Kicker {2}{G} (You may pay an additional {2}{G...,['Kicker'],0,0,common,False,9795,True,0.01
13006,4,"['G', 'R', 'U']",True,['Creature'],"['Human', 'Tyranid', 'Wizard']",Spiritual Leader — At the beginning of combat ...,"['Spiritual Leader', 'Psychic Stimulus']",1,1,mythic,False,10537,False,3.07
9284,3,['B'],False,['Creature'],"['Human', 'Rogue']","{B}, {T}: Exile target artifact card from a gr...",['no keywords'],1,1,rare,True,243,True,7.63


In [214]:
no_prices_df = df[df['usd'].isna()]

In [215]:
df.drop(df[df['usd'].isna()].index, inplace=True)
df.reset_index(drop=True, inplace=True)

#### Functions and models used for data processing

In [216]:
df['colors'] = df['colors'].apply(literal_eval)

In [217]:
mlb = MultiLabelBinarizer()
df= df.join(pd.DataFrame(mlb.fit_transform(df['colors']),
                                      columns=mlb.classes_,
                                      index=df.index))
legend_lb= LabelBinarizer()
df = df.join(pd.DataFrame(legend_lb.fit_transform(df['legendary']), columns=['legendary'])
             .add_prefix('_'))
booster_lb= LabelBinarizer()
df = df.join(pd.DataFrame(legend_lb.fit_transform(df['booster']), columns=['booster'])
             .add_prefix('_'))
reserved_lb= LabelBinarizer()
df = df.join(pd.DataFrame(legend_lb.fit_transform(df['reserved']), columns=['resrved'])
             .add_prefix('_'))

df = df.drop(['legendary', 'booster', 'reserved', 'colors'], axis=1)
df.head(3)

Unnamed: 0,cmc,types,sub_types,oracle_text,keywords,power,toughness,rarity,released_at,usd,B,C,G,N,R,U,W,_legendary,_booster,_resrved
0,2,['Artifact'],['Equipment'],Equipped creature gets +2/+2.\nEquip {3} ({3}:...,['Equip'],0,0,common,10096,0.02,0,0,0,0,0,0,1,0,1,0
1,4,['Creature'],"['Bird', 'Rogue']",Flying\nWhen Aarakocra Sneak enters the battle...,['Flying'],1,4,common,10418,0.06,0,0,0,0,0,1,0,0,1,0
2,5,['Creature'],"['Astartes', 'Warrior']",Trample\nMark of Chaos Ascendant — During your...,"['Mark of Chaos Ascendant', 'Trample']",5,5,mythic,10537,2.81,1,0,0,0,1,1,0,1,0,0


#### Text processing

In [218]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

def get_embeddings(sentence):
    inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].tolist()

In [219]:
df['sub_types_embedding']=df['sub_types'].apply(get_embeddings)

In [220]:
df['types_embedding']=df['types'].apply(get_embeddings)

In [221]:
df['keywords_embedding']=df['keywords'].apply(get_embeddings)

In [222]:
df['text_embedding'] = df['oracle_text'].apply(get_embeddings)

In [223]:
df.drop(columns=['sub_types','types', 'keywords', 'oracle_text'],  inplace=True)

In [224]:
df.head(3)

Unnamed: 0,cmc,power,toughness,rarity,released_at,usd,B,C,G,N,R,U,W,_legendary,_booster,_resrved,sub_types_embedding,types_embedding,keywords_embedding,text_embedding
0,2,0,0,common,10096,0.02,0,0,0,0,0,0,1,0,1,0,"[[0.048333920538425446, 0.09606622159481049, -...","[[-0.05527622252702713, 0.07099153101444244, -...","[[-0.039147913455963135, 0.0037941206246614456...","[[-0.36550939083099365, -0.362487256526947, -0..."
1,4,1,4,common,10418,0.06,0,0,0,0,0,1,0,0,1,0,"[[-0.007577039301395416, -0.013978387229144573...","[[-0.004934143275022507, 0.021525928750634193,...","[[-0.011417698115110397, 0.05888311564922333, ...","[[-0.16684836149215698, -0.23930270969867706, ..."
2,5,5,5,mythic,10537,2.81,1,0,0,0,1,1,0,1,0,0,"[[-0.05285697057843208, 0.10382992774248123, -...","[[-0.004934143275022507, 0.021525928750634193,...","[[0.08564460277557373, 0.04099284112453461, -0...","[[-0.23658572137355804, -0.27686968445777893, ..."


#### The Pipeline and neural network

In [358]:
df

Unnamed: 0,cmc,power,toughness,rarity,released_at,usd,B,C,G,N,R,U,W,_legendary,_booster,_resrved,sub_types_embedding,types_embedding,keywords_embedding,text_embedding
0,2,0,0,common,10096,0.02,0,0,0,0,0,0,1,0,1,0,"[[0.048333920538425446, 0.09606622159481049, -...","[[-0.05527622252702713, 0.07099153101444244, -...","[[-0.039147913455963135, 0.0037941206246614456...","[[-0.36550939083099365, -0.362487256526947, -0..."
1,4,1,4,common,10418,0.06,0,0,0,0,0,1,0,0,1,0,"[[-0.007577039301395416, -0.013978387229144573...","[[-0.004934143275022507, 0.021525928750634193,...","[[-0.011417698115110397, 0.05888311564922333, ...","[[-0.16684836149215698, -0.23930270969867706, ..."
2,5,5,5,mythic,10537,2.81,1,0,0,0,1,1,0,1,0,0,"[[-0.05285697057843208, 0.10382992774248123, -...","[[-0.004934143275022507, 0.021525928750634193,...","[[0.08564460277557373, 0.04099284112453461, -0...","[[-0.23658572137355804, -0.27686968445777893, ..."
3,0,0,0,common,2861,0.14,0,0,0,1,0,0,0,0,1,0,"[[-0.03981915861368179, -0.03859994560480118, ...","[[0.03863392025232315, -0.0012220675125718117,...","[[-0.0466962531208992, -0.020105689764022827, ...","[[-0.3683808147907257, -0.36620354652404785, -..."
4,3,0,0,rare,9634,0.09,0,1,0,0,0,0,0,0,0,0,"[[-0.03981915861368179, -0.03859994560480118, ...","[[-0.05527622252702713, 0.07099153101444244, -...","[[-0.0466962531208992, -0.020105689764022827, ...","[[-0.037254445254802704, -0.20066097378730774,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25484,5,7,2,mythic,7604,0.42,1,0,0,0,1,0,1,1,1,0,"[[-0.1253373622894287, 0.10831902921199799, -0...","[[-0.004934143275022507, 0.021525928750634193,...","[[-0.09200353920459747, 0.007305992767214775, ...","[[-0.2274055927991867, -0.26141682267189026, -..."
25485,4,0,0,rare,4258,0.82,0,0,0,0,0,1,0,0,1,0,"[[-0.03981915861368179, -0.03859994560480118, ...","[[0.00986480712890625, 0.047369033098220825, -...","[[-0.0466962531208992, -0.020105689764022827, ...","[[-0.33752167224884033, -0.2600155174732208, 0..."
25486,4,1,4,rare,10446,0.53,1,0,0,0,0,1,1,1,1,0,"[[-0.02251829020678997, 0.07062771171331406, -...","[[-0.004934143275022507, 0.021525928750634193,...","[[-0.011417698115110397, 0.05888311564922333, ...","[[-0.2219531089067459, -0.318474143743515, 0.0..."
25487,3,2,3,rare,10376,0.25,0,0,0,0,1,0,0,1,0,0,"[[-0.022037478163838387, 0.062315717339515686,...","[[-0.004934143275022507, 0.021525928750634193,...","[[-0.0466962531208992, -0.020105689764022827, ...","[[-0.3002524673938751, -0.3068389892578125, -0..."


In [359]:
X = df.drop(columns='usd')
y = df['usd']

In [360]:
numerical_features = ['power', 'toughness', 'released_at', 'cmc']
categorical_features = ['rarity']
remaining_features = X.drop(columns=['rarity', 'power', 'toughness', 'released_at', 'cmc'])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ],
    remainder='passthrough'
)

pipeline = make_pipeline(preprocessor)

In [361]:
X_preprocessed= pipeline.fit_transform(X)

In [362]:
column_names = pipeline.get_feature_names_out()
column_names

array(['num__power', 'num__toughness', 'num__released_at', 'num__cmc',
       'cat__rarity_common', 'cat__rarity_mythic', 'cat__rarity_rare',
       'cat__rarity_special', 'cat__rarity_uncommon', 'remainder__B',
       'remainder__C', 'remainder__G', 'remainder__N', 'remainder__R',
       'remainder__U', 'remainder__W', 'remainder___legendary',
       'remainder___booster', 'remainder___resrved',
       'remainder__sub_types_embedding', 'remainder__types_embedding',
       'remainder__keywords_embedding', 'remainder__text_embedding'],
      dtype=object)

In [363]:
processed_df = pd.DataFrame(X_preprocessed, columns=column_names)

In [364]:
processed_df.head(3)

Unnamed: 0,num__power,num__toughness,num__released_at,num__cmc,cat__rarity_common,cat__rarity_mythic,cat__rarity_rare,cat__rarity_special,cat__rarity_uncommon,remainder__B,...,remainder__R,remainder__U,remainder__W,remainder___legendary,remainder___booster,remainder___resrved,remainder__sub_types_embedding,remainder__types_embedding,remainder__keywords_embedding,remainder__text_embedding
0,-0.787786,-0.818064,0.83164,-0.74473,1.0,0.0,0.0,0.0,0.0,0,...,0,0,1,0,1,0,"[[0.048333920538425446, 0.09606622159481049, -...","[[-0.05527622252702713, 0.07099153101444244, -...","[[-0.039147913455963135, 0.0037941206246614456...","[[-0.36550939083099365, -0.362487256526947, -0..."
1,-0.237532,1.302998,0.930864,0.397826,1.0,0.0,0.0,0.0,0.0,0,...,0,1,0,0,1,0,"[[-0.007577039301395416, -0.013978387229144573...","[[-0.004934143275022507, 0.021525928750634193,...","[[-0.011417698115110397, 0.05888311564922333, ...","[[-0.16684836149215698, -0.23930270969867706, ..."
2,1.963486,1.833263,0.967534,0.969104,0.0,1.0,0.0,0.0,0.0,1,...,1,1,0,1,0,0,"[[-0.05285697057843208, 0.10382992774248123, -...","[[-0.004934143275022507, 0.021525928750634193,...","[[0.08564460277557373, 0.04099284112453461, -0...","[[-0.23658572137355804, -0.27686968445777893, ..."


In [365]:
processed_df =processed_df.astype({
    'num__power': float,
    'num__toughness': float,
    'num__released_at': float,
    'num__cmc': float
})

In [366]:
processed_df = processed_df.astype({
    "cat__rarity_common": int,
    "cat__rarity_mythic": int,
    "cat__rarity_rare": int,
    "cat__rarity_special": int,
    "cat__rarity_uncommon": int,
    "remainder__B": int,
    "remainder__C": int,
    "remainder__G": int,
    "remainder__N": int,
    "remainder__R": int,
    "remainder__U": int,
    "remainder__W": int,
    "remainder___legendary": int,
    "remainder___booster": int,
    "remainder___resrved": int
})

In [367]:
processed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25489 entries, 0 to 25488
Data columns (total 23 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   num__power                      25489 non-null  float64
 1   num__toughness                  25489 non-null  float64
 2   num__released_at                25489 non-null  float64
 3   num__cmc                        25489 non-null  float64
 4   cat__rarity_common              25489 non-null  int64  
 5   cat__rarity_mythic              25489 non-null  int64  
 6   cat__rarity_rare                25489 non-null  int64  
 7   cat__rarity_special             25489 non-null  int64  
 8   cat__rarity_uncommon            25489 non-null  int64  
 9   remainder__B                    25489 non-null  int64  
 10  remainder__C                    25489 non-null  int64  
 11  remainder__G                    25489 non-null  int64  
 12  remainder__N                    

In [368]:
X_numerical = processed_df[['num__power', 'num__toughness', 'num__released_at', 'num__cmc']]
X_categorical = processed_df[['remainder___legendary', 'remainder___booster', 'remainder___resrved',
                              'cat__rarity_common', 'cat__rarity_mythic', 'cat__rarity_rare', 'cat__rarity_special', 'cat__rarity_uncommon',
                              'remainder__B', 'remainder__C', 'remainder__G', 'remainder__N', 'remainder__R', 'remainder__U', 'remainder__W']]
X_text = processed_df[['remainder__text_embedding']]
X_subtypes= processed_df[['remainder__sub_types_embedding']]
X_types = processed_df[['remainder__types_embedding']]
X_keys = processed_df[['remainder__keywords_embedding']]

In [369]:
X_numerical.shape, X_categorical.shape, X_text.shape, X_subtypes.shape, X_types.shape, X_keys.shape

((25489, 4), (25489, 15), (25489, 1), (25489, 1), (25489, 1), (25489, 1))

In [370]:
X_text['remainder__text_embedding']=X_text['remainder__text_embedding'].apply(np.array)
X_subtypes['remainder__sub_types_embedding']=X_subtypes['remainder__sub_types_embedding'].apply(np.array)
X_types['remainder__types_embedding']=X_types['remainder__types_embedding'].apply(np.array)
X_keys['remainder__keywords_embedding']=X_keys['remainder__keywords_embedding'].apply(np.array)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_text['remainder__text_embedding']=X_text['remainder__text_embedding'].apply(np.array)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_subtypes['remainder__sub_types_embedding']=X_subtypes['remainder__sub_types_embedding'].apply(np.array)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_types['re

In [371]:
np.unique(X_text['remainder__text_embedding'].apply(lambda x: x.shape)), np.unique(X_subtypes['remainder__sub_types_embedding'].apply(lambda x: x.shape)), np.unique(X_types['remainder__types_embedding'].apply(lambda x: x.shape)), np.unique(X_keys['remainder__keywords_embedding'].apply(lambda x: x.shape))


(array([(1, 768)], dtype=object),
 array([(1, 768)], dtype=object),
 array([(1, 768)], dtype=object),
 array([(1, 768)], dtype=object))

In [372]:
X_text_embeddings = np.concatenate(X_text['remainder__text_embedding'].values).reshape(-1, 768)
X_subtypes_embedding = np.concatenate(X_subtypes['remainder__sub_types_embedding'].values).reshape(-1, 768)
X_types_embedding = np.concatenate(X_types['remainder__types_embedding'].values).reshape(-1, 768)
X_keys_embedding = np.concatenate(X_keys['remainder__keywords_embedding'].values).reshape(-1, 768)

In [373]:
X_text_embeddings[0].shape

(768,)

In [374]:
X_numerical_train, X_numerical_test, X_categorical_train, X_categorical_test, X_text_train, X_text_test,X_subtypes_train, X_subtypes_test, X_types_train, X_types_test, X_keys_train, X_keys_test, y_train, y_test = train_test_split(
    X_numerical, X_categorical, X_text_embeddings, X_subtypes_embedding, X_types_embedding, X_keys_embedding, y, test_size=0.30, random_state=42
)

In [375]:
def initialize_model():
    metrics_list = [MeanSquaredError(name='mse'), Accuracy(name='accuracy')]

    # Numerical input 4 columns
    input_numerical = layers.Input(shape=(4,), name='input_numerical')
    dense_numerical = layers.Dense(32, activation='relu')(input_numerical)

    # Categorical input 15 columns
    input_categorical = layers.Input(shape=(15,), name='input_categorical')
    dense_categorical = layers.Dense(64, activation='relu')(input_categorical)

    # Original Text input
    input_text = layers.Input(shape=(768,), name='input_text')
    dense_text = layers.Dense(64, activation='relu')(input_text)

    # Text input for 'type'
    input_type = layers.Input(shape=(768,), name='input_type')
    dense_type = layers.Dense(64, activation='relu')(input_type)

    # Text input for 'subtype'
    input_subtype = layers.Input(shape=(768,), name='input_subtype')
    dense_subtype = layers.Dense(64, activation='relu')(input_subtype)

    input_keys = layers.Input(shape=(768,), name='input_keys')
    dense_keys = layers.Dense(64, activation='relu')(input_keys)

    concatenated = layers.Concatenate()([dense_numerical, dense_categorical, dense_text, dense_type, dense_keys, dense_subtype])

    output = layers.Dense(1, activation='linear')(concatenated)

    model = models.Model(inputs=[input_numerical, input_categorical, input_text, input_type, input_keys, input_subtype], outputs=output)

    model.compile(loss='mean_squared_error',
                  optimizer='adam',
                  metrics=metrics_list)

    return model


In [376]:
def get_history():
    es = EarlyStopping(patience=10, restore_best_weights=False)

    train_data = {
        'input_numerical': X_numerical_train,
        'input_categorical': X_categorical_train,
        'input_text': X_text_train,
        'input_type': X_types_train,
        'input_subtype': X_subtypes_train,
        'input_keys': X_keys_train
    }

    test_data = {
        'input_numerical': X_numerical_test,
        'input_categorical': X_categorical_test,
        'input_text': X_text_test,
        'input_type': X_types_test,
        'input_subtype': X_subtypes_test,
        'input_keys': X_keys_test
    }

    history = model.fit(
        train_data,
        y_train,
        epochs=20,
        batch_size=4,
        validation_data=(test_data, y_test),
        callbacks=[es],
        verbose=1
    )

    return history


In [377]:
model = initialize_model()

In [378]:
model.summary()

Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_numerical (InputLaye  [(None, 4)]                  0         []                            
 r)                                                                                               
                                                                                                  
 input_categorical (InputLa  [(None, 15)]                 0         []                            
 yer)                                                                                             
                                                                                                  
 input_text (InputLayer)     [(None, 768)]                0         []                            
                                                                                            

In [379]:
get_history()

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x3fe2bfa00>

In [380]:
test_data = {
        'input_numerical': X_numerical_test,
        'input_categorical': X_categorical_test,
        'input_text': X_text_test,
        'input_type': X_types_test,
        'input_subtype': X_subtypes_test,
        'input_keys': X_keys_test
    }
model.evaluate(test_data, y_test)



[1899.6484375, 1899.6484375, 0.0]