In [99]:
import pandas as pd
import re
import pickle
import spacy
import numpy as np
from spacy import displacy
import scattertext as st
import nltk
from nltk.util import ngrams
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import MWETokenizer
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from sklearn.decomposition import NMF
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mitch\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [80]:
# load dataset
# Reads the card JSON export into a DataFrame. The path is relative, so the
# file has to be present in the notebook's working directory.

card_text = pd.read_json('oracle-cards-20220819210331.json')

In [81]:
# drop unnecessary columns (70 of them)

drop_cols = [
    'object', 'id', 'oracle_id', 'multiverse_ids', 'mtgo_id',
    'mtgo_foil_id', 'tcgplayer_id', 'cardmarket_id', 'lang', 'released_at',
    'uri', 'scryfall_uri', 'layout', 'highres_image', 'image_status',
    'image_uris', 'colors', 'legalities', 'games', 'reserved',
    'foil', 'nonfoil', 'finishes', 'oversized', 'promo',
    'reprint', 'variation', 'set_id', 'set', 'set_uri',
    'set_search_uri', 'scryfall_set_uri', 'rulings_uri', 'prints_search_uri', 'collector_number',
    'digital', 'card_back_id', 'artist', 'artist_ids', 'illustration_id',
    'border_color', 'frame', 'full_art', 'textless', 'booster',
    'story_spotlight', 'prices', 'related_uris', 'security_stamp', 'preview',
    'power', 'toughness', 'penny_rank', 'arena_id', 'watermark',
    'produced_mana', 'all_parts', 'card_faces', 'frame_effects', 'tcgplayer_etched_id',
    'promo_types', 'loyalty', 'life_modifier', 'hand_modifier', 'color_indicator',
    'content_warning', 'cmc', 'keywords', 'flavor_text', 'edhrec_rank',
]

# every listed label must exist as a column, otherwise drop raises KeyError
card_text = card_text.drop(labels = drop_cols, axis = 'columns')

In [82]:
# clean

# fill NaNs with the literal string 'None' so the string operations below never see a float NaN
card_text['oracle_text'] = card_text['oracle_text'].fillna('None')

# drop useless rows (Vanguard cards and Emblems)
# na = False makes a missing set_name / type_line count as "no match"; without it a NaN
# in either column would produce a NaN mask, which pandas refuses to index with
vangard_mask = ((card_text['set_type'] == 'vanguard') |
                (card_text['type_line'] == 'Vanguard') |
                (card_text['set_name'].str.contains('Vanguard', na = False)))
emblem_mask = card_text['type_line'].str.contains('Emblem', na = False)
# filter both groups at once and rebuild a contiguous index on the resulting frame
card_text = card_text[~(vangard_mask | emblem_mask)].reset_index(drop = True)

# eliminate extraneous characters
unwanted_text = "[.,!?()•—\\:;|]"
card_text['oracle_text'] = card_text['oracle_text'].map(lambda x: re.sub(unwanted_text, '', x))

# remove all references of a card's name from its rules text
def _mask_own_name(row):
    """Replace a card's own name inside its (already stripped) oracle text with 'This_Card'.

    The oracle text has had the characters in ``unwanted_text`` removed, so the
    name is stripped the same way before searching. Otherwise a name that
    contains one of those characters (a comma, for instance) could never be
    found in the text and its self-references would be left in place.
    """
    stripped_name = re.sub(unwanted_text, '', row['name'])
    # str.replace with an empty search string would insert the placeholder between
    # every character, so leave the text alone if nothing is left of the name
    if not stripped_name:
        return row['oracle_text']
    return row['oracle_text'].replace(stripped_name, 'This_Card')

card_text['Rules_Text'] = card_text[['name', 'oracle_text']].apply(_mask_own_name, axis = 1)

# set rules_text to lower case (this also turns the placeholder into 'this_card')
card_text['Rules_Text'] = card_text['Rules_Text'].str.lower()

# create new dataframe with which to work
# ct = card_text['Rules_Text'].to_frame()

In [83]:
# # initialize whitespace tokenizer
# whitespace_tokenizer = RegexpTokenizer("\s+", gaps = True)

# # make new column for whitespace tokenizer
# ct['no_space_tokens'] = ct['Rules_Text'].apply(lambda x: whitespace_tokenizer.tokenize(x))

In [84]:
# !python -m spacy download en_core_web_sm

In [85]:
# load the English spaCy pipeline 'en_core_web_sm'
# (it is passed as the `nlp` parser when the scattertext corpus is built further down)
nlp = spacy.load('en_core_web_sm')

# # set function to get spacy tokens
# def spacy_on_list(x):
#     spacy_list = [nlp(element) for element in x]
#     return spacy_list

# # make new column for spacy tokens
# ct['spacy_tokens'] = ct['no_space_tokens'].apply(lambda x: spacy_on_list(x))

In [86]:
# # save ct
# with open('ct_df', 'wb') as f:
#     pickle.dump(ct,f)
    
# load ct
# NOTE(review): 'ct_df' is only produced by the commented-out save step, so a fresh
# checkout cannot run this cell unless the file is supplied separately.
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only load
# a 'ct_df' that you created yourself.
with open('ct_df', 'rb') as f:
    ct = pickle.load(f)

In [87]:
# dummy function to pass to custom_tfidf
def dummy_function(doc):
    """Return ``doc`` unchanged (identity function)."""
    return doc

# # save spacy tokens as list
# docs = ct['spacy_tokens'].to_list()

# # save spacy tokens as series
# doc_series = pd.Series(docs)

In [88]:
# collect the lemma of every token of a card into one flat list
def lemmy_no_stop(x):
    """Flatten nested token sequences into a single list of lemmas.

    ``x`` is iterated two levels deep: every element of ``x`` is itself
    iterated, and the ``lemma_`` attribute of each inner item is collected
    in encounter order.

    Despite the name, stop words are NOT removed; every token contributes
    its lemma.
    """
    lemmas = [token.lemma_ for doc in x for token in doc]
    # only a lemma that compares equal to an empty list is discarded
    return [lemma for lemma in lemmas if lemma != []]

# store each card's flat lemma list in a new ct column
ct['core_text'] = ct['spacy_tokens'].apply(lemmy_no_stop)

# keep the same lemma lists as a plain Python list as well
core_docs = ct['core_text'].tolist()

In [89]:
# copy the lemma lists onto card_text; pandas aligns this assignment on the index,
# so it assumes ct and card_text share the same row index — TODO confirm
card_text['Core_Text'] = ct['core_text']

In [None]:
# Removing cards that don't have rules text

In [100]:
# turn empty lemma lists into NaN so those cards can be removed with dropna
card_text['Core_Text'] = card_text['Core_Text'].apply(lambda lemmas: lemmas if lemmas != [] else np.nan)

In [102]:
# drop the rows whose Core_Text is NaN (cards without rules text);
# the drop happens in place and the index is not reset afterwards
card_text.dropna(subset=['Core_Text'], inplace=True)

In [103]:
# number of cards left after removing those without rules text
len(card_text)

25604

In [90]:
# To use scattertext, need to categorize data into two categories
# Since we are evaluating Kess, the two categories will be if it contains the Kess effect and if it does not
# Kess effect: Cast an instant or sorcery spell from your graveyard.
#              If a spell cast this way would be put into your graveyard exile it instead.
# kessLike_condition = ['graveyard', 'exile', 'instead']

In [91]:
# card_text['Kess_Effect'] = card_text['Core_Text'].apply(lambda x: all(elem in x for elem in kessLike_condition)).map({True: "KessLike", False: "Not_KessLike"})

In [117]:
# token that marks a card referring to itself; presumably the lower-cased 'This_Card'
# placeholder substituted during cleaning — verify against the tokens stored in ct
self_ref_condition = 'this_card'

In [118]:
def self_ref_test(x, condition = None):
    """Report whether the self-reference marker occurs in ``x``.

    Parameters
    ----------
    x : container
        Object searched with ``in``. For a list of tokens the marker has to
        equal a whole element; for a string it is a substring test.
    condition : optional
        Marker to look for. When omitted, the module-level
        ``self_ref_condition`` is used (looked up at call time), which keeps
        existing one-argument calls working unchanged.

    Returns
    -------
    bool
        True if the marker is found in ``x``, otherwise False.
    """
    if condition is None:
        condition = self_ref_condition
    return condition in x

In [119]:
# label every card by whether its lemma list contains the self-reference marker
self_ref_labels = {True: "Refs_Self", False: "No_Ref_Self"}
card_text['Self_Ref'] = card_text['Core_Text'].apply(self_ref_test).map(self_ref_labels)

In [123]:
# Count the cards labelled as referencing themselves
refCount = sum(1 for label in card_text['Self_Ref'] if label == 'Refs_Self')
print(refCount)

11325


In [124]:
# Count the cards whose lemma list is still empty (i.e. no rules text)
emptyCount = len([lemmas for lemmas in card_text['Core_Text'] if lemmas == []])
print(emptyCount)

0


In [125]:
# Building the ScatterText corpus:
# documents are taken from Rules_Text, categories from Self_Ref, parsed with the spaCy pipeline
corpus_factory = st.CorpusFromPandas(card_text, category_col = 'Self_Ref', text_col = 'Rules_Text', nlp = nlp)
corpus = corpus_factory.build()

In [126]:
# row count of card_text, for comparison with the number of documents in the corpus
len(card_text)

25604

In [130]:
# show the category labels the corpus was built with
corpus.get_categories()

['Refs_Self', 'No_Ref_Self']

In [136]:
# length of the name column, which is supplied as per-document metadata for the plot
len(card_text['name'])

25604

In [132]:
# Sanity checks on the corpus before plotting:
# isinstance is the idiomatic type check and also accepts subclasses of CorpusDF
assert isinstance(corpus, st.CorpusDF), "Be sure to create your corpus from the dataframe provided.  It should be a scattertext CorpusDF object."
# one corpus document per row of card_text
assert corpus.get_num_docs() == len(card_text), "Your corpus should be constructed from card_text and should have the same number of documents as card_text."
# compared as sets: which two labels exist is what matters here, not the order they are listed in
assert set(corpus.get_categories()) == {'Refs_Self', 'No_Ref_Self'}, "The categories of your corpus should be 'Refs_Self' and 'No_Ref_Self'."

In [137]:
# Creating the ScatterText html:
# The card names are handed over as a plain list rather than a Series. card_text's
# index is not reset after the earlier dropna, so a positional list keeps each name
# lined up with its document no matter what the index labels look like.
html = st.produce_scattertext_explorer(
        corpus,
        category="Refs_Self",
        category_name='Refs_Self',
        not_category_name='No_Ref_Self',
        minimum_term_frequency=10,
        pmi_threshold_coefficient=5,
        width_in_pixels=1000,
        metadata=card_text['name'].tolist(),
        )

In [139]:
# This creates an HTML document in your current Notebook directory. Find it and click on it to view the interactive plot
# The with-block closes the file as soon as the write finishes, so the data is
# flushed to disk and the handle is not left open.
with open('self_reference_cards.html', 'wb') as html_file:
    html_file.write(html.encode('utf-8'))