In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder, MultiLabelBinarizer, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import MeanSquaredError, Accuracy, MeanAbsoluteError
from tensorflow.keras import models, layers
from tensorflow.keras.layers import Dense, Concatenate, Input, Dropout,  LSTM, Flatten, Embedding
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.regularizers import l1_l2
from keras.models import Sequential
import keras_nlp
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize


In [68]:
# Load the card table from disk and preview five randomly chosen rows.
df = pd.read_csv(filepath_or_buffer='../data/clean_cards.csv')
df.sample(n=5)

Unnamed: 0,cmc,colors,legendary,types,sub_types,oracle_text,keywords,power,toughness,rarity,reserved,released_at,booster,usd
5078,4,['B'],False,['Creature'],"['Tiefling', 'Cleric']","Skeletons, Vampires, and Zombies you control g...",['no keywords'],2,2,uncommon,False,10096,True,0.12
1246,1,['G'],False,['Sorcery'],['none'],"Search your library for a basic land card, rev...",['no keywords'],0,0,common,False,8339,True,0.05
22579,1,['C'],False,['Artifact'],['none'],"Whenever a player casts a black spell, you may...",['no keywords'],0,0,uncommon,False,3526,True,0.1
18182,6,"['G', 'R']",True,['Creature'],"['Ogre', 'Warrior']","Vigilance, reach Ruric Thar, the Unbowed attac...","['Reach', 'Vigilance']",6,6,rare,False,10446,True,0.25
19001,3,['W'],False,['Creature'],"['Elephant', 'Soldier']",5Green: Regenerate Selesnya Sentry.,['no keywords'],3,2,common,False,6883,True,0.01


In [69]:
# Summarise column dtypes and non-null counts for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25545 entries, 0 to 25544
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   cmc          25545 non-null  int64  
 1   colors       25545 non-null  object 
 2   legendary    25545 non-null  bool   
 3   types        25545 non-null  object 
 4   sub_types    25545 non-null  object 
 5   oracle_text  25523 non-null  object 
 6   keywords     25545 non-null  object 
 7   power        25545 non-null  int64  
 8   toughness    25545 non-null  int64  
 9   rarity       25545 non-null  object 
 10  reserved     25545 non-null  bool   
 11  released_at  25545 non-null  int64  
 12  booster      25545 non-null  bool   
 13  usd          25489 non-null  float64
dtypes: bool(3), float64(1), int64(4), object(6)
memory usage: 2.2+ MB


In [70]:
# Keep only the text input column and the `usd` target column.
df = df.loc[:, ['oracle_text', 'usd']]
df.head(n=3)

Unnamed: 0,oracle_text,usd
0,Equipped creature gets +2/+2. Equip 3,0.02
1,Flying When Aarakocra Sneak enters the battlef...,0.06
2,Trample Mark of Chaos Ascendant — During your ...,2.81


In [71]:
# Discard rows missing either the text or the price, then renumber the index
# from zero so it is contiguous again.
df = (
    df
    .dropna(subset=['oracle_text', 'usd'])
    .reset_index(drop=True)
)

In [72]:
# Sanity check: per-column tally of null vs. non-null values after the drop.
tuple(df[column].isna().value_counts() for column in ('oracle_text', 'usd'))

(oracle_text
 False    25476
 Name: count, dtype: int64,
 usd
 False    25476
 Name: count, dtype: int64)

In [73]:
# Model input (raw text) and regression target (price in USD).
X = df['oracle_text']
y = df['usd']

In [74]:
# Build the word -> integer vocabulary from the text column.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)

# Vocabulary size handed to the Embedding layer: one slot per known word plus
# one extra for index 0, which the tokenizer never assigns to a word.
total_words = 1 + len(tokenizer.word_index)

In [75]:
# Encode from the source column instead of overwriting X with its own encoded
# form, so this cell can be re-run without feeding integer arrays back into
# the tokenizer.
sequences = tokenizer.texts_to_sequences(df['oracle_text'])

# Length (in tokens) of the longest text; every sequence is padded at the
# front up to this length so the model receives a rectangular array.
max_sequence_length = max(len(seq) for seq in sequences)
X = pad_sequences(sequences, maxlen=max_sequence_length, padding='pre')

In [76]:
# Hold out 20% of the rows for final evaluation; the fixed seed keeps the
# partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [77]:
def build_model():
    """Build and compile the LSTM regressor that maps token sequences to a price.

    Architecture: Embedding -> LSTM(100) -> Dense 64 / 32 / 16 (ReLU, with
    dropout after the first two) -> Dense(1, linear).

    Reads the notebook-level ``total_words`` (vocabulary size) and
    ``max_sequence_length`` (padded sequence length) set by the tokenisation
    cells, so those cells must have been run first.

    Returns:
        A compiled Keras ``Sequential`` model using the Adam optimizer, MSE
        loss and mean absolute error as the tracked metric.
    """
    # The target is a continuous value, so only regression metrics are tracked.
    # Accuracy counts exact equality between prediction and label, which is
    # essentially always 0 for a real-valued output and only adds noise.
    metrics = [MeanAbsoluteError()]

    model = Sequential()
    model.add(Embedding(input_dim=total_words, output_dim=100, input_length=max_sequence_length))
    model.add(LSTM(100))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(16, activation='relu'))

    # Single linear unit: unbounded scalar regression output.
    model.add(Dense(1, activation='linear'))

    model.compile(optimizer='adam', loss='mse', metrics=metrics)

    return model


In [78]:
# Instantiate the network once; the training function below fits this
# notebook-level `model` object.
model = build_model()

In [82]:
def get_history():
    """Fit the notebook-level ``model`` on the training split and return its History.

    Early stopping monitors the loss on a validation slice carved out of the
    training data (``validation_split``), not on ``X_test``/``y_test``. The
    test split is therefore never seen during training or model selection and
    stays an unbiased estimate for the later ``model.evaluate`` call.

    When training stops, the weights from the best validation epoch are
    restored; otherwise the model would keep the weights from ``patience``
    epochs after its best point.

    Returns:
        The Keras ``History`` object produced by ``model.fit``; its
        ``history`` dict holds per-epoch training and ``val_`` metrics.
    """
    es = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    history = model.fit(
        X_train,
        # Plain ndarray so Keras can slice the validation fraction positionally.
        np.asarray(y_train),
        epochs=100,
        batch_size=512,
        # Last 20% of the (already shuffled) training rows are held out for
        # early stopping.
        validation_split=0.2,
        callbacks=[es],
        verbose=1,
    )

    return history

In [83]:
# Keep the returned History so the per-epoch curves can be inspected or
# plotted afterwards instead of being discarded.
history = get_history()

Epoch 1/100

In [66]:
# Score the fitted model on the held-out split.
# NOTE(review): this cell's saved execution count (66) is lower than the
# training cell's (83), so the recorded output predates the latest fit;
# re-run after training before trusting the numbers.
model.evaluate(x=X_test, y=np.array(y_test))



[4861.3427734375, 4.4798479080200195]