In [26]:
import pandas as pd
import numpy as np
import re
import os
from ast import literal_eval
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split


In [27]:
pickle_location = r'/Users/louishagenbucher/Documents/GitHub/mtg_nlp_price_prediction/d:\temp\pickle_file'
df = pd.read_pickle(pickle_location)


In [28]:
df.head()

Unnamed: 0,name,cmc,B,C,G,N,R,U,W,legendary,...,rarity_rare,rarity_special,rarity_uncommon,released_at,booster,types_embedding,sub_type_embedding,keywords_embedding,text_embeddings,usd
0,+2 Mace,2.0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,2021-07-23,1,"[-0.007598841, 0.0013788732, -0.006743228, -0....","[-0.027093768, 0.054746334, -0.01797754, -0.07...","[-0.08516706, 0.13195656, -0.026486708, -0.013...","[[-0.43040189146995544, -0.4970492124557495, 0...",0.02
1,Aarakocra Sneak,4.0,0,0,0,0,0,1,0,0,...,0.0,0.0,0.0,2022-06-10,1,"[-0.0017277271, 0.0008729596, 0.0071197986, 0....","[-0.08840044, 0.24980876, -0.044583265, -0.380...","[-0.20399831, 0.28901207, -0.048621196, -0.025...","[[-0.16684836149215698, -0.23930270969867706, ...",0.06
2,Abaddon the Despoiler,5.0,1,0,0,0,1,1,0,1,...,0.0,0.0,0.0,2022-10-07,0,"[-0.0017277271, 0.0008729596, 0.0071197986, 0....","[-0.090877846, 0.22575285, -0.036203165, -0.32...","[-0.16338894, 0.25493696, -0.046598416, -0.027...","[[-0.23658572137355804, -0.27686968445777893, ...",2.81
3,Abandoned Outpost,0.0,0,0,0,1,0,0,0,0,...,0.0,0.0,0.0,2001-10-01,1,"[-0.010446079, 0.0029960414, 0.0013583362, -0....","[-0.00053622725, 0.00023643136, 0.0051033497, ...","[-0.00053622725, 0.00023643136, 0.0051033497, ...","[[-0.34767621755599976, -0.5046898722648621, -...",0.14
4,Abandoned Sarcophagus,3.0,0,1,0,0,0,0,0,0,...,1.0,0.0,0.0,2020-04-17,0,"[-0.007598841, 0.0013788732, -0.006743228, -0....","[-0.00053622725, 0.00023643136, 0.0051033497, ...","[-0.00053622725, 0.00023643136, 0.0051033497, ...","[[-0.037254445254802704, -0.20066097378730774,...",0.09


In [29]:
df['types_embedding'][0].shape

(100,)

In [30]:
df['sub_type_embedding'][0].shape

(100,)

In [31]:
df['keywords_embedding'][0].shape

(100,)

In [32]:
df['text_embeddings']= df['text_embeddings'].apply(np.array)

In [33]:
df['text_embeddings'][0].shape

(1, 768)

In [34]:
def reshape_array(arr):
    return arr.reshape(768,)

df['text_embeddings'] = df['text_embeddings'].apply(reshape_array)
df['text_embeddings'][0].shape

(768,)

In [35]:
embedding_columns = ['types_embedding', 'sub_type_embedding', 'keywords_embedding', 'text_embeddings']
df['combined_embedding'] = df[embedding_columns].apply(np.concatenate, axis=1)

  df['combined_embedding'] = df[embedding_columns].apply(np.concatenate, axis=1)


In [36]:
df.combined_embedding[0].shape

(1068,)

In [37]:
df.columns

Index(['name', 'cmc', 'B', 'C', 'G', 'N', 'R', 'U', 'W', 'legendary', 'power',
       'toughness', 'reserved', 'rarity_common', 'rarity_mythic',
       'rarity_rare', 'rarity_special', 'rarity_uncommon', 'released_at',
       'booster', 'types_embedding', 'sub_type_embedding',
       'keywords_embedding', 'text_embeddings', 'usd', 'combined_embedding'],
      dtype='object')

In [38]:
df = df[['name','B', 'C', 'G', 'N', 'R', 'U', 'W', 'legendary',
         'reserved', 'rarity_common', 'rarity_mythic',
       'rarity_rare', 'rarity_special', 'rarity_uncommon',
       'booster', 'cmc' ,'power', 'toughness',
       'combined_embedding', 'released_at', 'usd']]

In [39]:
df.sample(1)

Unnamed: 0,name,B,C,G,N,R,U,W,legendary,reserved,...,rarity_rare,rarity_special,rarity_uncommon,booster,cmc,power,toughness,combined_embedding,released_at,usd
5345,Desecrator Hag,1,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0,4.0,2,2,"[-0.0017277271253988147, 0.000872959615662694,...",2018-06-08,0.12


In [40]:
df['released_at'] = pd.to_datetime(df['released_at'])
reference_date = df['released_at'].min()
df['days_since_release'] = (df['released_at'] - reference_date).dt.days
df.drop(columns='released_at', inplace=True)

In [41]:
df.columns

Index(['name', 'B', 'C', 'G', 'N', 'R', 'U', 'W', 'legendary', 'reserved',
       'rarity_common', 'rarity_mythic', 'rarity_rare', 'rarity_special',
       'rarity_uncommon', 'booster', 'cmc', 'power', 'toughness',
       'combined_embedding', 'usd', 'days_since_release'],
      dtype='object')

In [42]:
df = df[['name', 'B', 'C', 'G', 'N', 'R', 'U', 'W', 'legendary', 'reserved',
       'rarity_common', 'rarity_mythic', 'rarity_rare', 'rarity_special',
       'rarity_uncommon', 'booster', 'cmc', 'power', 'toughness','days_since_release',
       'combined_embedding', 'usd']]
df.sample(1)

Unnamed: 0,name,B,C,G,N,R,U,W,legendary,reserved,...,rarity_rare,rarity_special,rarity_uncommon,booster,cmc,power,toughness,days_since_release,combined_embedding,usd
3153,Carom,0,0,0,0,0,0,1,0,0,...,0.0,0.0,0.0,1,2.0,0,0,4538,"[-0.009292987175285816, 0.00402201758697629, 0...",0.12


#### Fill X power and toughness with -1

In [43]:
df['power'] = pd.to_numeric(df['power'], errors='coerce')
df['toughness'] = pd.to_numeric(df['toughness'], errors='coerce')
index = df[df['power'].isna() | df['toughness'].isna()][['power', 'toughness']].index
df[df['power'].isna() | df['toughness'].isna()][['power', 'toughness']]

Unnamed: 0,power,toughness
24,,
27,,
129,,
143,,4.0
178,,
...,...,...
24860,,3.0
25008,2.0,
25130,,
25321,,


In [44]:
df['power'] = df['power'].fillna(-1).astype(int)
df['toughness'] = df['toughness'].fillna(-1).astype(int)

In [45]:
df.loc[index,['power', 'toughness']].head()

Unnamed: 0,power,toughness
24,-1,-1
27,-1,-1
129,-1,-1
143,-1,4
178,-1,-1


In [46]:
X = df.drop(columns=['name', 'usd'])
y = df.usd
numerical_features = ['cmc', 'power', 'toughness', 'days_since_release']
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])
embedded_features = ['combined_embedding']
# to remember to preprocess when taking an input we would use ohe here
categorical_features = ['rarity_common', 'rarity_mythic', 'rarity_rare', 'rarity_special', 'rarity_uncommon', 'booster']
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', 'passthrough', categorical_features),
        ('embed', 'passthrough', embedded_features),

    ])
X_transformed = preprocessor.fit_transform(X)

In [47]:
data_transformed = pd.DataFrame(X_transformed, columns=numerical_features + categorical_features + embedded_features)
data_transformed.head()

Unnamed: 0,cmc,power,toughness,days_since_release,rarity_common,rarity_mythic,rarity_rare,rarity_special,rarity_uncommon,booster,combined_embedding
0,-0.742891,-0.784331,-0.814794,0.835309,1.0,0.0,0.0,0.0,0.0,1.0,"[-0.007598841097205877, 0.0013788732467219234,..."
1,0.398713,-0.233859,1.30632,0.934263,1.0,0.0,0.0,0.0,0.0,1.0,"[-0.0017277271253988147, 0.000872959615662694,..."
2,0.969514,1.968027,1.836598,0.970833,0.0,1.0,0.0,0.0,0.0,0.0,"[-0.0017277271253988147, 0.000872959615662694,..."
3,-1.884494,-0.784331,-0.814794,-1.388084,1.0,0.0,0.0,0.0,0.0,1.0,"[-0.010446079075336456, 0.002996041439473629, ..."
4,-0.172089,-0.784331,-0.814794,0.693332,0.0,0.0,1.0,0.0,0.0,0.0,"[-0.007598841097205877, 0.0013788732467219234,..."


In [49]:
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.3, random_state=42)