In [1]:
import pandas as pd
import numpy as np
import re
import pickle
import spacy
from spacy import displacy
import nltk
from nltk.util import ngrams
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import MWETokenizer
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from sklearn.decomposition import NMF
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mitch\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# load dataset

card_text = pd.read_json('oracle-cards-20220819210331.json')

In [3]:
# drop 69 unnecessary columns

drop_cols = ['object', 'id', 'oracle_id', 'multiverse_ids', 'mtgo_id', 'mtgo_foil_id', 'tcgplayer_id','cardmarket_id', 'lang', 
             'released_at', 'uri', 'scryfall_uri', 'layout', 'highres_image', 'image_status', 'image_uris', 'colors',
             'legalities', 'games', 'reserved', 'foil', 'nonfoil', 'finishes', 'oversized', 'promo', 'reprint', 'variation',
             'set_id', 'set', 'set_uri', 'set_search_uri', 'scryfall_set_uri', 'rulings_uri', 'prints_search_uri',
             'collector_number', 'digital', 'card_back_id', 'artist', 'artist_ids', 'illustration_id', 'border_color', 'frame',
             'full_art', 'textless', 'booster', 'story_spotlight', 'prices', 'related_uris', 'security_stamp', 'preview',
             'power', 'toughness', 'penny_rank', 'arena_id', 'watermark', 'produced_mana', 'all_parts', 'card_faces',
             'frame_effects', 'tcgplayer_etched_id', 'promo_types', 'loyalty', 'life_modifier', 'hand_modifier',
             'color_indicator', 'content_warning', 'cmc', 'keywords', 'flavor_text', 'edhrec_rank']

card_text = card_text.drop(columns = drop_cols)

In [4]:
# clean

# fill NaNs
card_text['oracle_text'] = card_text['oracle_text'].fillna('None')

# drop useless rows
vangard_mask = ((card_text['set_type'] == 'vanguard') |
                (card_text['type_line'] == 'Vanguard') |
                (card_text['set_name'].str.contains('Vanguard')))
card_text.drop(card_text[vangard_mask].index, inplace = True)
card_text.drop(card_text[card_text['type_line'].str.contains('Emblem')].index, inplace = True)
card_text.reset_index(drop = True, inplace = True)

# emliminate extraneous characters
unwanted_text = "[.,!?()•—\\:;|]"
card_text['oracle_text'] = card_text['oracle_text'].map(lambda x: re.sub(unwanted_text, '', x))

# remove all references of a card's name from it's rules text
card_text['Rules_Text'] = card_text[['name', 'oracle_text']].apply(lambda x: 
                                                                     x['oracle_text'].replace(x['name'], 'This_Card')
                                                                     if x['name'] in x['oracle_text']
                                                                     else x['oracle_text'],
                                                                     axis = 1)

# set rules_text to lower case
card_text['Rules_Text'] = card_text['Rules_Text'].str.lower()

# # create new dataframe with which to work
# ct = card_text['Rules_Text'].to_frame()

In [5]:
# # initialize whitespace tokenizer
# whitespace_tokenizer = RegexpTokenizer("\s+", gaps = True)

# # make new column for whitespace tokenizer
# ct['no_space_tokens'] = ct['Rules_Text'].apply(lambda x: whitespace_tokenizer.tokenize(x))

In [6]:
# # load english library with spacy
# nlp = spacy.load('en_core_web_sm')

# # set function to get spacy tokens
# def spacy_on_list(x):
#     spacy_list = [nlp(element) for element in x]
#     return spacy_list

# # make new column for spacy tokens
# ct['spacy_tokens'] = ct['no_space_tokens'].apply(lambda x: spacy_on_list(x))

In [7]:
# save ct
# with open('ct_df', 'wb') as f:
#     pickle.dump(ct,f)
    
# load ct
with open('ct_df', 'rb') as f:
    ct = pickle.load(f)

In [8]:
# dummy fucntion to pass to custom_tfidf
def dummy_function(doc):
    return doc

# # save spacy tokens as list
# docs = ct['spacy_tokens'].to_list()

# # save spacy tokens as series
# doc_series = pd.Series(docs)

In [9]:
# set function to get lemmetized non-stop-words
def lemmy_no_stop(x):
    core_text = []
    for tuple_ in x:
        for token in tuple_:
            core_text.append(token.lemma_)  
    #core_text = [[token.lemma_ for token in tuple_ if token.is_stop == False] for tuple_ in x]  
    core_text = [ele for ele in core_text if ele != []]
    return core_text

# apply fucntion to create new column in ct
ct['core_text'] = ct['spacy_tokens'].apply(lambda x: lemmy_no_stop(x))

# convert that column to a list
core_docs = ct['core_text'].to_list()

In [10]:
custom_tfidf_2gram = TfidfVectorizer(analyzer='word', tokenizer=dummy_function,
                                     preprocessor=dummy_function, token_pattern=None)
CX_2gram = custom_tfidf_2gram.fit_transform(core_docs).toarray()

In [11]:
# CX_2gram = StandardScaler().fit_transform(CX_2gram)
CX_2gram.shape

(26182, 4681)

In [12]:
pca = PCA(n_components = 150)
pca.fit(CX_2gram)
# pca.explained_variance_ratio_

def evr (array):
    _sum = 0
    for element in array:
        _sum += element
    return _sum

evr(pca.explained_variance_ratio_)

0.7448775355129494

In [13]:
CX_2gram = pca.transform(CX_2gram)
np_CX = np.array(CX_2gram)
non_neg_CX_2gram = np_CX + 1

In [14]:
nmf_2gram = NMF(20)
nmf_2gram = nmf_2gram.fit(non_neg_CX_2gram)
docs_topics_2gram = nmf_2gram.transform(non_neg_CX_2gram)
topics_terms_2gram = nmf_2gram.components_



In [15]:
# nmf_2gram_df = pd.DataFrame(docs_topics_2gram, columns = ['Topic_0', 'Topic_1', 'Topic_2', 'Topic_3', 'Topic_4', 'Topic_5',
#                                                           'Topic_6', 'Topic_7', 'Topic_8', 'Topic_9', 'Topic_10', 'Topic_11',
#                                                           'Topic_12', 'Topic_13', 'Topic_14', 'Topic_15', 'Topic_16',
#                                                           'Topic_17', 'Topic_18', 'Topic_19'])

In [16]:
# results_df = card_text[['name', 'Rules_Text', 'color_identity', 'mana_cost', 'type_line', 'rarity', 'set_name',
#                                 'set_type']]
# topics = nmf_2gram_df.merge(results_df, left_index=True, right_index=True)

In [17]:
# droppers = ['Topic_0', 'Topic_1', 'Topic_2', 'Topic_3', 'Topic_4', 'Topic_5', 'Topic_6', 'Topic_7', 'Topic_8',
#             'Topic_9', 'Topic_19', 'Topic_11', 'Topic_12', 'Topic_13', 'Topic_14', 'Topic_15', 'Topic_16', 'Topic_17',
#             'Topic_18']

# target_topic = 'Topic_10'

# # top10 = top10.sort_values(by = 'distances', ascending = True).head(10).reset_index(drop=True)
# topic_mask = topics[target_topic] != 0
# pd.set_option('display.max_colwidth', None)
# topics[topic_mask].drop(columns = droppers, axis = 1).sort_values(by = target_topic, ascending = False).head(20)

In [18]:
# 0: **garbage**
# 1: tap for mana
# 2: +1/+1 till end of turn
# 3: ETB
# 4: look at top cards of library
# 5: enchant creature
# 6: **garbage**
# 7: enchant creature
# 8: **garbage**
# 9: reanimate
# 10: **garbage**
# 11: upkeep trigger
# 12: face down
# 13: stuff you control
# 14: prevent damage
# 15: bounce
# 16: tutor
# 17: prevent damage
# 18: cost reduction
# 19: discard/draw

In [20]:
# create new dataframe for results

# card_name = "Kess, Dissident Mage"
card_name = "Merfolk Looter"

index = card_text.index[card_text['name'] == card_name]
distances = pairwise_distances(docs_topics_2gram[index].reshape(1,-1), docs_topics_2gram, metric='cosine')

results_df = card_text[['name', 'Rules_Text', 'color_identity', 'mana_cost', 'type_line', 'rarity', 'set_name', 'set_type']]
results_df.insert(loc = 0, column = 'distances', value = pd.Series(distances[0]))

top10 = results_df[results_df['name'] != card_name]
top10['Rules_Text'] = top10['Rules_Text'].str.replace('\n', ', ')
card_text['oracle_text'] = card_text['oracle_text'].str.replace('\n', ', ')
top10 = top10.sort_values(by = 'distances', ascending = True).head(10).reset_index(drop=True)

# output results
print(f"Chosen Card:\t{card_text['name'].iloc[index[0]]}")
print(f"Mana Cost:\t{card_text['mana_cost'].iloc[index[0]]}")
print(f"Type:\t\t{card_text['type_line'].iloc[index[0]]}")
print(f"Text:\t\t{card_text['oracle_text'].iloc[index[0]]}\n{'*'*60}\n{'*'*60}")
for i in range(10):
    print(f"{i+1}.")
#     print(f"Similarity:\t{top10['distances'].values[i]}")
    print(f"Similarity:\t{round((1-top10['distances'].values[i]), 4)*100}%")
    print(f"Name:\t\t{top10['name'].values[i]}")
    print(f"Mana Cost:\t{top10['mana_cost'].values[i]}")
    print(f"Type:\t\t{top10['type_line'].values[i]}")
    print(f"Text:\t\t{top10['Rules_Text'].values[i]}\n{'-'*60}")

Chosen Card:	Merfolk Looter
Mana Cost:	{1}{U}
Type:		Creature — Merfolk Rogue
Text:		{T} Draw a card then discard a card
************************************************************
************************************************************
1.
Similarity:	100.0%
Name:		Thought Courier
Mana Cost:	{1}{U}
Type:		Creature — Human Wizard
Text:		{t} draw a card then discard a card
------------------------------------------------------------
2.
Similarity:	100.0%
Name:		The Harvester
Mana Cost:	
Type:		Hero
Text:		{t} draw a card then discard a card
------------------------------------------------------------
3.
Similarity:	99.96000000000001%
Name:		Rummaging Goblin
Mana Cost:	{2}{R}
Type:		Creature — Goblin Rogue
Text:		{t} discard a card draw a card
------------------------------------------------------------
4.
Similarity:	99.92999999999999%
Name:		Mad Prophet
Mana Cost:	{3}{R}
Type:		Creature — Human Shaman
Text:		haste, {t} discard a card draw a card
-----------------------------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top10['Rules_Text'] = top10['Rules_Text'].str.replace('\n', ', ')


In [None]:
# Less than perfect matches for: Kess, Dissident Mage

# Fury of the Horde
# Into the Fray
# Dead Revels
# Omniscience

In [None]:
# nmf_2gram_10_df = pd.DataFrame(docs_topics_2gram, columns = ['ETB',
#                                                              'until_end_of_turn',
#                                                              'burn',
#                                                              'scry_effect',
#                                                              '+1/+1_counters',
#                                                              'unblockable',
#                                                              'reanimation',
#                                                              'upkeep_trigger',
#                                                              'card_advantage',
#                                                              'enchant_creature',
#                                                              'face_down',
#                                                              'mana_ability',
#                                                              'buff_creatures',
#                                                              'prevent_damage',
#                                                              'tutor',
#                                                              'sacrifice_cost',
#                                                              'destroy_target',
#                                                              'stats_equal_to_X',
#                                                              'bounce',
#                                                              'lifegain'])

In [None]:
# incorporate topic ratio per output
# find size of each topic
# streamlit