In [1]:
import pandas as pd
import numpy as np
import re
import os
import torch
from ast import literal_eval
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder, MultiLabelBinarizer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import MeanSquaredError
from tensorflow.keras import models, layers
from gensim.models import Word2Vec
from transformers import DistilBertTokenizer, DistilBertModel


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('/Users/louishagenbucher/Documents/GitHub/mtg_nlp_price_prediction/data/clean_cards.csv')
df.sample(5)

Unnamed: 0,cmc,colors,legendary,types,sub_types,oracle_text,keywords,power,toughness,rarity,reserved,released_at,booster,usd
1666,4,['R'],False,['Creature'],"['Giant', 'Wizard']","When Basalt Ravager enters the battlefield, it...",['no keywords'],4,2,uncommon,False,9928,True,0.08
21020,2,['U'],False,['Creature'],['Licid'],"{1}{U}, {T}: Stinging Licid loses this ability...",['no keywords'],1,1,uncommon,False,1413,True,0.18
6986,5,['U'],False,['Sorcery'],['none'],Converge — Gain control of target creature if ...,['Converge'],0,0,rare,False,7975,True,0.07
16052,6,['R'],False,['Sorcery'],['none'],Pinnacle of Rage deals 3 damage to each of two...,['no keywords'],0,0,uncommon,False,7373,True,0.02
14339,8,['R'],True,['Creature'],['Spirit'],Myojin of Roaring Blades enters the battlefiel...,['no keywords'],7,4,rare,False,10306,False,0.09


In [3]:
no_prices_df = df[df['usd'].isna()]

In [4]:
df.drop(df[df['usd'].isna()].index, inplace=True)
df.reset_index(drop=True, inplace=True)

#### Functions and models used for data processing

In [5]:
df['colors'] = df['colors'].apply(literal_eval)

In [6]:
mlb = MultiLabelBinarizer()
df= df.join(pd.DataFrame(mlb.fit_transform(df['colors']),
                                      columns=mlb.classes_,
                                      index=df.index))
legend_lb= LabelBinarizer()
df = df.join(pd.DataFrame(legend_lb.fit_transform(df['legendary']), columns=['legendary'])
             .add_prefix('_'))
booster_lb= LabelBinarizer()
df = df.join(pd.DataFrame(legend_lb.fit_transform(df['booster']), columns=['booster'])
             .add_prefix('_'))
reserved_lb= LabelBinarizer()
df = df.join(pd.DataFrame(legend_lb.fit_transform(df['reserved']), columns=['resrved'])
             .add_prefix('_'))

df = df.drop(['legendary', 'booster', 'reserved', 'colors'], axis=1)
df.head(3)

Unnamed: 0,cmc,types,sub_types,oracle_text,keywords,power,toughness,rarity,released_at,usd,B,C,G,N,R,U,W,_legendary,_booster,_resrved
0,2,['Artifact'],['Equipment'],Equipped creature gets +2/+2.\nEquip {3} ({3}:...,['Equip'],0,0,common,10096,0.02,0,0,0,0,0,0,1,0,1,0
1,4,['Creature'],"['Bird', 'Rogue']",Flying\nWhen Aarakocra Sneak enters the battle...,['Flying'],1,4,common,10418,0.06,0,0,0,0,0,1,0,0,1,0
2,5,['Creature'],"['Astartes', 'Warrior']",Trample\nMark of Chaos Ascendant — During your...,"['Mark of Chaos Ascendant', 'Trample']",5,5,mythic,10537,2.81,1,0,0,0,1,1,0,1,0,0


##### Word 2 Vec Models

In [7]:
subtype_model = Word2Vec(sentences=df['sub_types'], vector_size=100, window=5, min_count=1, workers=4)
type_model = Word2Vec(sentences=df['types'], vector_size=100, window=5, min_count=1, workers=4)
keyword_model = Word2Vec(sentences=df['keywords'], vector_size=100, window=5, min_count=1, workers=4)

In [8]:
df['sub_types']=df['sub_types'].apply(lambda subtypes: np.sum([subtype_model.wv[word] for word in subtypes], axis=0))
df['types']=df['types'].apply(lambda types: np.sum([type_model.wv[word] for word in types], axis=0))
df['keywords']=df['keywords'].apply(lambda keywords: np.sum([keyword_model.wv[word] for word in keywords], axis=0))

#### Oracle text processing

In [9]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

def get_embeddings(sentence):
    inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].tolist()

In [10]:
df['text_embedding'] = df['oracle_text'].apply(get_embeddings)
df.drop(columns='oracle_text', inplace=True)

In [11]:
def reshape_array(arr):
    return arr.reshape(768,)

In [12]:
df['text_embedding']=df['text_embedding'].apply(np.array).apply(reshape_array)
embedding_columns = ['types', 'sub_types', 'keywords', 'text_embedding']
df['combined_embedding'] = df[embedding_columns].apply(np.concatenate, axis=1)
df.drop(columns=embedding_columns, inplace=True)
df.head(3)

  df['combined_embedding'] = df[embedding_columns].apply(np.concatenate, axis=1)


Unnamed: 0,cmc,power,toughness,rarity,released_at,usd,B,C,G,N,R,U,W,_legendary,_booster,_resrved,combined_embedding
0,2,0,0,common,10096,0.02,0,0,0,0,0,0,1,0,1,0,"[2.7962656021118164, 1.8265407085418701, -0.16..."
1,4,1,4,common,10418,0.06,0,0,0,0,0,1,0,0,1,0,"[-2.0402467250823975, 3.7331454753875732, -2.9..."
2,5,5,5,mythic,10537,2.81,1,0,0,0,1,1,0,1,0,0,"[-2.0402467250823975, 3.7331454753875732, -2.9..."


#### The Pipeline and neural network

In [13]:
X = df.drop(columns='usd')
y = df['usd']

In [14]:
numerical_features = ['power', 'toughness', 'released_at', 'cmc']
categorical_features = ['rarity']
remaining_features = X.drop(columns=['rarity', 'power', 'toughness', 'released_at', 'cmc'])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ],
    remainder='passthrough'
)

pipeline = make_pipeline(preprocessor)

In [15]:
X_transformed = pipeline.fit_transform(X)

In [16]:
column_names = pipeline.get_feature_names_out()
column_names

array(['num__power', 'num__toughness', 'num__released_at', 'num__cmc',
       'cat__rarity_common', 'cat__rarity_mythic', 'cat__rarity_rare',
       'cat__rarity_special', 'cat__rarity_uncommon', 'remainder__B',
       'remainder__C', 'remainder__G', 'remainder__N', 'remainder__R',
       'remainder__U', 'remainder__W', 'remainder___legendary',
       'remainder___booster', 'remainder___resrved',
       'remainder__combined_embedding'], dtype=object)

In [17]:
processed_df = pd.DataFrame(X_transformed, columns=column_names)
processed_df.head()

Unnamed: 0,num__power,num__toughness,num__released_at,num__cmc,cat__rarity_common,cat__rarity_mythic,cat__rarity_rare,cat__rarity_special,cat__rarity_uncommon,remainder__B,remainder__C,remainder__G,remainder__N,remainder__R,remainder__U,remainder__W,remainder___legendary,remainder___booster,remainder___resrved,remainder__combined_embedding
0,-0.787786,-0.818064,0.83164,-0.74473,1.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,1,0,1,0,"[2.7962656021118164, 1.8265407085418701, -0.16..."
1,-0.237532,1.302998,0.930864,0.397826,1.0,0.0,0.0,0.0,0.0,0,0,0,0,0,1,0,0,1,0,"[-2.0402467250823975, 3.7331454753875732, -2.9..."
2,1.963486,1.833263,0.967534,0.969104,0.0,1.0,0.0,0.0,0.0,1,0,0,0,1,1,0,1,0,0,"[-2.0402467250823975, 3.7331454753875732, -2.9..."
3,-0.787786,-0.818064,-1.397816,-1.887285,1.0,0.0,0.0,0.0,0.0,0,0,0,1,0,0,0,0,1,0,"[3.1340062618255615, 1.0089235305786133, 0.303..."
4,-0.787786,-0.818064,0.689275,-0.173452,0.0,0.0,1.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,"[2.7962656021118164, 1.8265407085418701, -0.16..."


In [18]:
shapes = np.array([np.shape(item) for item in processed_df['remainder__combined_embedding']])
shapes[0]

array([1068])

In [20]:
X_numerical = processed_df.drop(columns='remainder__combined_embedding')
X_text = processed_df['remainder__combined_embedding']

In [22]:
X_numerical = np.asarray(X_numerical).astype(np.float32)

In [23]:
X_numerical_train, X_numerical_test, X_text_train, X_text_test, y_train, y_test = train_test_split(
    X_numerical, X_text, y, test_size=0.30, random_state=42

)

In [24]:
def initialize_model():
    metrics_list = [MeanSquaredError(name='mse')]

    input_numerical = layers.Input(shape=(X_numerical_train.shape[1],))
    dense1 = layers.Dense(32, activation='relu')(input_numerical)

    input_text = layers.Input(shape=(1068,))
    dense2 = layers.Dense(15, activation='relu')(input_text)

    # Concatenate the branches
    concatenated = layers.concatenate([dense1, dense2])

    # Output layer
    output = layers.Dense(1, activation='relu')(concatenated)

    # Build the model
    model = models.Model(inputs=[input_numerical, input_text], outputs=output)

    model.compile(loss='mean_squared_error',
                  optimizer='adam',
                  metrics=metrics_list)

    return model


In [25]:
def get_history(model, X_numerical_train, X_text_train, y_train):
    es = EarlyStopping(patience=10, restore_best_weights=False)

    history = model.fit(
        [X_numerical_train, X_text_train],
        y_train,
        epochs=100,
        batch_size=4,
        validation_split=0.2,
        callbacks=[es],
        verbose=1
    )

    return history


In [26]:
model = initialize_model()
history = get_history(model, X_numerical_train, X_text_train, y_train)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type numpy.ndarray).