In [1]:
import pandas as pd
import re
import pickle
import spacy
from spacy import displacy
import nltk
from nltk.util import ngrams
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import MWETokenizer
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from sklearn.decomposition import NMF
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mitch\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# load dataset

card_text = pd.read_json('oracle-cards-20220819210331.json')

In [3]:
# drop 69 unnecessary columns

drop_cols = ['object', 'id', 'oracle_id', 'multiverse_ids', 'mtgo_id', 'mtgo_foil_id', 'tcgplayer_id','cardmarket_id', 'lang', 
             'released_at', 'uri', 'scryfall_uri', 'layout', 'highres_image', 'image_status', 'image_uris', 'colors',
             'legalities', 'games', 'reserved', 'foil', 'nonfoil', 'finishes', 'oversized', 'promo', 'reprint', 'variation',
             'set_id', 'set', 'set_uri', 'set_search_uri', 'scryfall_set_uri', 'rulings_uri', 'prints_search_uri',
             'collector_number', 'digital', 'card_back_id', 'artist', 'artist_ids', 'illustration_id', 'border_color', 'frame',
             'full_art', 'textless', 'booster', 'story_spotlight', 'prices', 'related_uris', 'security_stamp', 'preview',
             'power', 'toughness', 'penny_rank', 'arena_id', 'watermark', 'produced_mana', 'all_parts', 'card_faces',
             'frame_effects', 'tcgplayer_etched_id', 'promo_types', 'loyalty', 'life_modifier', 'hand_modifier',
             'color_indicator', 'content_warning', 'cmc', 'keywords', 'flavor_text', 'edhrec_rank']

card_text = card_text.drop(columns = drop_cols)

In [4]:
# clean

# fill NaNs
card_text['oracle_text'] = card_text['oracle_text'].fillna('None')

# drop useless rows
vangard_mask = ((card_text['set_type'] == 'vanguard') |
                (card_text['type_line'] == 'Vanguard') |
                (card_text['set_name'].str.contains('Vanguard')))
card_text.drop(card_text[vangard_mask].index, inplace = True)
card_text.drop(card_text[card_text['type_line'].str.contains('Emblem')].index, inplace = True)
card_text.reset_index(drop = True, inplace = True)

# emliminate extraneous characters
unwanted_text = "[.,!?()•—\\:;|]"
card_text['oracle_text'] = card_text['oracle_text'].map(lambda x: re.sub(unwanted_text, '', x))

# remove all references of a card's name from it's rules text
card_text['Rules_Text'] = card_text[['name', 'oracle_text']].apply(lambda x: 
                                                                     x['oracle_text'].replace(x['name'], 'This_Card')
                                                                     if x['name'] in x['oracle_text']
                                                                     else x['oracle_text'],
                                                                     axis = 1)

# set rules_text to lower case
card_text['Rules_Text'] = card_text['Rules_Text'].str.lower()

# create new dataframe with which to work
# ct = card_text['Rules_Text'].to_frame()

In [5]:
# # initialize whitespace tokenizer
# whitespace_tokenizer = RegexpTokenizer("\s+", gaps = True)

# # make new column for whitespace tokenizer
# ct['no_space_tokens'] = ct['Rules_Text'].apply(lambda x: whitespace_tokenizer.tokenize(x))

In [6]:
# # load english library with spacy
# nlp = spacy.load('en_core_web_sm')

# # set function to get spacy tokens
# def spacy_on_list(x):
#     spacy_list = [nlp(element) for element in x]
#     return spacy_list

# # make new column for spacy tokens
# ct['spacy_tokens'] = ct['no_space_tokens'].apply(lambda x: spacy_on_list(x))

In [7]:
# # save ct
# with open('ct_df', 'wb') as f:
#     pickle.dump(ct,f)
    
# load ct
with open('ct_df', 'rb') as f:
    ct = pickle.load(f)

In [8]:
# dummy fucntion to pass to custom_tfidf
def dummy_function(doc):
    return doc

# # save spacy tokens as list
# docs = ct['spacy_tokens'].to_list()

# # save spacy tokens as series
# doc_series = pd.Series(docs)

In [9]:
# set function to get lemmetized non-stop-words
def lemmy_no_stop(x):
    core_text = []
    for tuple_ in x:
        for token in tuple_:
            core_text.append(token.lemma_)  
    #core_text = [[token.lemma_ for token in tuple_ if token.is_stop == False] for tuple_ in x]  
    core_text = [ele for ele in core_text if ele != []]
    return core_text

# apply fucntion to create new column in ct
ct['core_text'] = ct['spacy_tokens'].apply(lambda x: lemmy_no_stop(x))

# convert that column to a list
core_docs = ct['core_text'].to_list()

In [10]:
# # initilize custom_tfidf from TfidfVectorizer
# custom_tfidf = TfidfVectorizer(analyzer='word', tokenizer=dummy_function, preprocessor=dummy_function, token_pattern=None)

# # fit transform docs to CX 
# CX = custom_tfidf.fit_transform(core_docs).toarray()

# # create a dataframe from CX and feature_names from custom_tfidf
# custom_tfidf_df = pd.DataFrame(CX, columns=custom_tfidf.get_feature_names())

In [11]:
custom_tfidf_2gram = TfidfVectorizer(analyzer='word', ngram_range=(2, 2), tokenizer=dummy_function,
                                     preprocessor=dummy_function, token_pattern=None)
CX = custom_tfidf_2gram.fit_transform(core_docs).toarray()

In [12]:
# CX = StandardScaler().fit_transform(CX)
CX.shape

(26182, 30891)

In [13]:
pca = PCA(n_components = 1500)
pca.fit(CX)

def evr(array):
    _sum = 0
    for element in array:
        _sum += element
    return _sum

evr(pca.explained_variance_ratio_)

0.7664784564890407

In [14]:
CX = pca.transform(CX)

In [15]:
CX

array([[-4.10828445e-02, -5.22423294e-03, -6.17188071e-02, ...,
        -1.76216394e-02,  1.09819627e-02,  2.01045720e-04],
       [-9.04587180e-02,  6.65098151e-02, -6.14470896e-02, ...,
         2.12191412e-03,  8.76716127e-04,  3.40574646e-03],
       [-1.96656848e-02, -8.06216866e-02, -6.67142744e-03, ...,
        -2.61497953e-03,  9.95155407e-03, -1.67357788e-02],
       ...,
       [-2.61223313e-02, -7.90864718e-03, -3.75161847e-02, ...,
        -2.25631204e-04, -1.19345283e-04,  1.09465978e-04],
       [-3.50442468e-02,  3.48043848e-02,  4.36011825e-02, ...,
        -1.03814324e-02, -3.75289315e-03, -1.07726390e-02],
       [-5.23103600e-02,  9.74725238e-02,  7.82743663e-03, ...,
         2.01604570e-04,  9.36440368e-05, -2.98889045e-03]])

In [16]:
# create new dataframe for results

# card_name = "Kess, Dissident Mage"
card_name = "Merfolk Looter"

index = card_text.index[card_text['name'] == card_name]
distances = pairwise_distances(CX[index].reshape(1,-1), CX, metric='euclidean')

results_df = card_text[['name', 'Rules_Text', 'color_identity', 'mana_cost', 'type_line', 'rarity', 'set_name',
                                'set_type']]
results_df.insert(loc = 0, column = 'distances', value = pd.Series(distances[0]))

top10 = results_df[results_df['name'] != card_name]
top10['Rules_Text'] = top10['Rules_Text'].str.replace('\n', ', ')
card_text['oracle_text'] = card_text['oracle_text'].str.replace('\n', ', ')
top10 = top10.sort_values(by = 'distances', ascending = True).head(10).reset_index(drop=True)

# output results
print(f"Chosen Card:\t{card_text['name'].iloc[index[0]]}")
print(f"Mana Cost:\t{card_text['mana_cost'].iloc[index[0]]}")
print(f"Type:\t\t{card_text['type_line'].iloc[index[0]]}")
print(f"Text:\t\t{card_text['oracle_text'].iloc[index[0]]}\n{'*'*60}\n{'*'*60}")
for i in range(10):
    print(f"{i+1}.\nSimilarity:\t{(1-round(top10['distances'].values[i], 2))*100}%")
    print(f"Name:\t\t{top10['name'].values[i]}")
    print(f"Mana Cost:\t{top10['mana_cost'].values[i]}")
    print(f"Type:\t\t{top10['type_line'].values[i]}")
    print(f"Text:\t\t{top10['Rules_Text'].values[i]}\n{'-'*60}")

Chosen Card:	Merfolk Looter
Mana Cost:	{1}{U}
Type:		Creature — Merfolk Rogue
Text:		{T} Draw a card then discard a card
************************************************************
************************************************************
1.
Similarity:	100.0%
Name:		Thought Courier
Mana Cost:	{1}{U}
Type:		Creature — Human Wizard
Text:		{t} draw a card then discard a card
------------------------------------------------------------
2.
Similarity:	100.0%
Name:		The Harvester
Mana Cost:	
Type:		Hero
Text:		{t} draw a card then discard a card
------------------------------------------------------------
3.
Similarity:	56.00000000000001%
Name:		Facet Reader
Mana Cost:	{1}{U}
Type:		Creature — Human Wizard
Text:		{1} {t} draw a card then discard a card
------------------------------------------------------------
4.
Similarity:	56.00000000000001%
Name:		A-Dragonborn Looter
Mana Cost:	{U}
Type:		Creature — Dragon Rogue
Text:		{1} {t} draw a card then discard a card
-----------------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top10['Rules_Text'] = top10['Rules_Text'].str.replace('\n', ', ')


In [17]:
# Less than perfect matches for: Kess, Dissident Mage

# Karador, Ghost Chieftain