In [None]:
import numpy as np
import pandas as pd
import json
import re


In [None]:
#pd.options.display.max_columns=None
#pd.options.display.max_rows=None
#pd.options.display.max_colwidth=250
#pd.options.display.max_seq_items=500

# Summarise key steps:


In [None]:
# Import data
# Data sourced from: https://mtgjson.com/api/v5/AllPrintingsCSVFiles.zip
# The builtin `object` is used instead of `np.object`: the alias was deprecated
# in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
card_dtypes = {
    'colors': object,
    'faceConvertedManaCost': object,
    'flavorText': object,
    'frameEffects': object,
    'leadershipSkills': object,
    'name': object,
    'text': object,
}

df_base = pd.read_csv("../resources/cards.csv", dtype=card_dtypes, low_memory=False)

# Remove out-of-scope rows: online-only, oversized and promo printings, and
# cards with the 'vanguard' layout.
df_base = df_base[
    (df_base.isOnlineOnly == 0)
    & (df_base.isOversized == 0)
    & (df_base.isPromo == 0)
    & ~(df_base.layout == 'vanguard')
]


# Restrict the raw frame to the fields that are likely to feed feature building.
# `.copy()` detaches the result from df_base so later column assignments are safe.
feature_source_columns = [
    'index', 'id',
    'colorIdentity', 'colorIndicator', 'colors',
    'convertedManaCost', 'faceConvertedManaCost',
    'faceName', 'flavorText', 'hand', 'hasAlternativeDeckLimit',
    'isOnlineOnly', 'isOversized', 'isPromo', 'isReprint',
    'isReserved', 'isStarter', 'isTextless',
    'keywords', 'layout', 'leadershipSkills', 'life', 'loyalty',
    'manaCost', 'multiverseId', 'name', 'number', 'otherFaceIds',
    'power', 'printings', 'rarity', 'setCode', 'side',
    'subtypes', 'supertypes', 'text', 'toughness',
    'type', 'types', 'uuid', 'variations', 'watermark',
]
df = df_base.loc[:, feature_source_columns].copy()

############################################################
############################################################

# Create unique row per card name / allowing for multiple faces (i.e. names may be duplicated)
# 'side' needs to be filled in because groupby drops rows whose key is NaN.
# Assign the result back instead of `df['side'].fillna(..., inplace=True)`:
# the chained in-place form operates on a temporary and is a no-op under
# pandas copy-on-write.
df['side'] = df['side'].fillna('normal')

# Number the printings of each (name, side) pair in ascending 'id' order and keep the first.
df['name_row'] = df.sort_values(by='id', ascending=True).groupby(['name', 'side']).cumcount() + 1
# `.copy()` so the column assignments below do not write into a slice of the previous frame.
df = df[df['name_row'] == 1].copy()

# Flag double layout cards (1 when the layout is one of the multi-face layouts, else 0)
double_layouts = ['transform', 'split', 'adventure', 'modal_dfc', 'flip', 'aftermath', 'meld']
df['double_layout'] = df['layout'].isin(double_layouts).astype(int)


############################################################
############################################################


# Add in mana cost counts
df['manaCost_NA'] = df['manaCost'].isna() * 1  # Column to flag NA values for manaCost
# Use '{none}' in lower case, since all other manaCost letters are upper case.
df['manaCost'] = df['manaCost'].fillna('{none}')

# Generic mana: the number inside the first purely numeric symbol ('{4}{U}' -> 4),
# 0 when there is none. Stored as int rather than a mix of digit strings and 0.
df['manaCost_Generic_count'] = (
    df['manaCost'].str.extract(r'\{(\d+)\}', expand=False).fillna('0').astype(int)
)

# Output column -> mana symbol counted in manaCost.
# The Phyrexian U/B entries were previously swapped (UP counted '{B/P}' and
# BP counted '{U/P}'); each column now counts its own symbol.
mana_symbol_columns = {
    'manaCost_W_count': '{W}',
    'manaCost_U_count': '{U}',
    'manaCost_B_count': '{B}',
    'manaCost_R_count': '{R}',
    'manaCost_G_count': '{G}',
    'manaCost_C_count': '{C}',
    'manaCost_WP_count': '{W/P}',
    'manaCost_UP_count': '{U/P}',
    'manaCost_BP_count': '{B/P}',
    'manaCost_RP_count': '{R/P}',
    'manaCost_GP_count': '{G/P}',
    'manaCost_H_WU_count': '{W/U}',
    'manaCost_H_UB_count': '{U/B}',
    'manaCost_H_BR_count': '{B/R}',
    'manaCost_H_RG_count': '{R/G}',
    'manaCost_H_GW_count': '{G/W}',
    'manaCost_H_WB_count': '{W/B}',
    'manaCost_H_UR_count': '{U/R}',
    'manaCost_H_BG_count': '{B/G}',
    'manaCost_H_RW_count': '{R/W}',
    'manaCost_H_GU_count': '{G/U}',
    'manaCost_H_2W_count': '{2/W}',
    'manaCost_H_2U_count': '{2/U}',
    'manaCost_H_2B_count': '{2/B}',
    'manaCost_H_2R_count': '{2/R}',
    'manaCost_H_2G_count': '{2/G}',
    'manaCost_X_count': '{X}',
    'manaCost_Y_count': '{Y}',
    'manaCost_Z_count': '{Z}',
    'manaCost_Snow_count': '{S}',
    'manaCost_HW_count': '{HW}',
}
for count_column, symbol in mana_symbol_columns.items():
    # str.count takes a regex; escape the braces so the symbol is matched literally.
    df[count_column] = df['manaCost'].str.count(re.escape(symbol))

############################################################
############################################################

# OneHot Encode all keywords in the data, and add some other info related to keywords
df['keywords_NA'] = df['keywords'].isna() * 1
df['keywords'] = df['keywords'].fillna('{none}')

# One list of keyword strings per card; cards without keywords hold ['{none}'].
keyword_lists = df['keywords'].str.split(',')
df['keywords_count'] = keyword_lists.str.len() * (1 - df['keywords_NA'])

all_keywords = keyword_lists.tolist()
unique_keywords = sorted({kw for kws in all_keywords for kw in kws if kw != '{none}'})

# Exact-match indicator columns. The previous `str.contains(keyword)` was a
# substring (and regex) match, so e.g. 'Flash' also flagged 'Flashback' cards.
# get_dummies splits on ',' and emits one 0/1 column per distinct token.
keyword_flags = df['keywords'].str.get_dummies(sep=',').drop(columns='{none}', errors='ignore')
keyword_flags.columns = ['keyword_' + kw.replace(' ', '_') for kw in keyword_flags.columns]

# Attach all indicator columns in one concat (avoids hundreds of single-column
# inserts); drop same-named columns first so the step can be re-run safely.
df = pd.concat(
    [df.drop(columns=keyword_flags.columns, errors='ignore'), keyword_flags],
    axis=1,
)

############################################################
############################################################


# Other effects ############################################################

# Flag cards without rules text. This must read the `text` column (it previously
# read `keywords`, which is already NA-filled at this point, so the flag was always 0).
df['text_NA'] = df['text'].isna() * 1
df['text'] = df['text'].fillna('{none}')

# Drawing cards (generally a benefit)
# Capture whatever sits between "draw " and "card" (excluding "draw step"),
# e.g. "a ", "two ", "X ". Rows without a match become ''.
df_draw_cards = (
    df['text'].str.extract(r'[Dd]raw(?!\s[Ss]tep)\s(.*?)card?')
    .rename(columns={0: 'extract_text'})
    .fillna('')
)

# (prefix of the captured text, value assigned). Order matters: np.select takes
# the first matching condition, so 'half X' must come before 'X'.
# The earlier fixed-width slices `str[0:5] == 'half X'` and `str[0:0] == 'X'`
# could never be true; prefix matching makes both reachable.
# NOTE(review): 15 and 20 look like placeholder weights for variable draws - confirm.
draw_amounts = [
    ('a', 1),
    ('two', 2),
    ('three', 3),
    ('four', 4),
    ('five', 5),
    ('six', 6),
    ('seven', 7),
    ('eight', 8),
    ('nine', 9),
    ('half X', 15),
    ('X', 20),
]

cond = [df_draw_cards['extract_text'].str.startswith(prefix) for prefix, _ in draw_amounts]
output = [amount for _, amount in draw_amounts]

df['effect_draw_cards'] = np.select(cond, output, default=0)

# Raw strings keep '\s' as a regex escape (non-raw use is an invalid Python string
# escape), and the non-capturing group avoids pandas' "match groups" UserWarning.

# Beneficial discards (i.e. apply to opponent / target player - since generally you choose your opponent, unless you're drawing cards)
df_extract = df['text'].str.contains(r'[Tt]arget\s(?:opponent|player).*[Dd]iscards\s.*?card?', case=False)
df['effect_discard_target_player'] = df_extract * 1

# Discard own cards (cost)
df_extract = df['text'].str.contains(r'Discard\s.*?card?', case=False)
df['effect_discard_own_cards'] = df_extract * 1

# Loot ability dummy variable
df_extract = df['text'].str.contains(r'[Dd]raw a card, then discard a card', case=False)
df['effect_loot'] = df_extract * 1

# Destroy effects / exile effects
# Note - destroy and exile are treated as identical effects for now, for the purpose of getting a model working;
#        ideally these would be split up to allow for more nuance.
# Note - need to go back and check the interaction of nonland and permanent to make sure it is handled properly.
# NOTE(review): each effect_destroy_* column is a sum of up to three 0/1 flags, so it can exceed 1 - confirm intended.
# Patterns are raw strings with non-capturing groups: same matches, without the
# invalid-escape and "match groups" warnings.
df_extract_nonland = df['text'].str.contains(r'(?:[Dd]estroy|[Ee]xile)\s.*target.*?nonland(?=\.|\s)?') * 1
df_extract_permanent = df['text'].str.contains(r'(?:[Dd]estroy|[Ee]xile)\s.*target.*(?<!nonland\s)permanent(?=\.|\s)?') * 1

df_extract = df['text'].str.contains(r'(?:[Dd]estroy|[Ee]xile)\s.*target.*artifact?(?:\.|\s)') * 1
df['effect_destroy_artifact'] = df_extract + df_extract_nonland + df_extract_permanent

df_extract = df['text'].str.contains(r'(?:[Dd]estroy|[Ee]xile)\s.*target.*creature?(?:\.|\s)') * 1
df['effect_destroy_creature'] = df_extract + df_extract_nonland + df_extract_permanent

# 'island', 'nonland' and 'landwalk' are excluded from the land match.
df_extract = df['text'].str.contains(r'(?:[Dd]estroy|[Ee]xile)\s.*target.*?(?<!is)(?<!non)land(?!walk)(?=\.|\s)?') * 1
df['effect_destroy_land'] = df_extract + df_extract_permanent

df_extract = df['text'].str.contains(r'(?:[Dd]estroy|[Ee]xile)\s.*target.*?enchantment(?=\.|\s)?') * 1
df['effect_destroy_enchantment'] = df_extract + df_extract_nonland + df_extract_permanent

df_extract = df['text'].str.contains(r'(?:[Dd]estroy|[Ee]xile)\s.*target.*?planeswalker(?=\.|\s)?') * 1
df['effect_destroy_planeswalker'] = df_extract + df_extract_nonland + df_extract_permanent

# Destroying 'all' creatures
df_extract = df['text'].str.contains(r'(?:[Dd]estroy|[Ee]xile)\s.*all.*creatures(?=\.|\s)?') * 1
df['effect_destroy_all_creatures'] = df_extract

# Deals damage effects
# Focus on damage to others, and exclude "deals combat damage to ..." triggers.
# The exclusion needs a lookbehind: a lookahead placed directly before 'damage'
# can never see 'combat ', so the old `(?!combat\s)` filtered nothing.
df_extract = df['text'].str.contains(r'(?:deals)+\s[\dX]*.*(?<!combat\s)damage\sto(?!\syou)')
df['effect_deals_damage'] = df_extract * 1

# Counter spell effects
df_extract = df['text'].str.contains(r'[Cc]ounter.*spell')
df['effect_counter_target_spell'] = df_extract * 1

# Enters the battlefield effect
df_extract = df['text'].str.contains(r'[Ee]nters?\sthe\sbattlefield')
df['effect_enter_the_battlefield'] = df_extract * 1

# Enters the battlefield followed (on the same line) by "sacrifice it"
df_extract = df['text'].str.contains(r'[Ee]nters?\sthe\sbattlefield.*[Ss]acrifice\sit')
df['effect_enter_the_battlefield_sacrific_it'] = df_extract * 1

# Activate ability as an effect
# '.' does not match newlines, so this counts the lines of rules text that contain a ':'.
df_extract = df['text'].str.count(r'.*:.*')
df['effect_has_activated_ability'] = df_extract * 1

############################################################
############################################################

# Set up base lines for efficiency metrics

# Power + Toughness
# Stand-in value for non-numeric stats: half the converted mana cost, but at least 1.
half_cmc_min_one = np.array([max(cmc / 2, 1) for cmc in df['convertedManaCost'].tolist()], dtype=float)

# Power and toughness are cleaned identically, so one loop replaces the two copied blocks.
for stat in ('power', 'toughness'):
    clean_col = stat + '_clean'
    df[stat] = df[stat].fillna('{none}')
    # True when the value contains a non-digit character other than the
    # characters of the '{none}' placeholder (e.g. '*', 'X', '1+*').
    is_variable = df[stat].str.contains(r'\D(?<![{noe}])')
    # Stand-in for flagged rows, 0 elsewhere.
    df[clean_col] = half_cmc_min_one * is_variable.to_numpy(dtype=float)
    # Unflagged rows fall back to the printed value ...
    df[clean_col] = np.where(df[clean_col] == 0, df[stat], df[clean_col])
    # ... and cards without the stat count as 0.
    df[clean_col] = np.where(df[clean_col] == '{none}', 0, df[clean_col])

# P+T Calculation (the *_clean columns hold digit strings and floats, hence the cast)
df['power_plus_toughness'] = np.float64(df['power_clean']) + np.float64(df['toughness_clean'])

# Count keywords and effects
def sum_columns_starting_with(df, col_name_str):
    """Return the row-wise sum of every column whose name starts with a prefix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to add up.
    col_name_str : str
        Prefix to match against ``str(column_name)``.

    Returns
    -------
    list
        One total per row of ``df``, in row order. A NaN in any matching column
        makes that row's total NaN. When no column matches, a list of zeros of
        length ``len(df)`` is returned so the result can always be assigned
        back as a column.
    """
    col_list = [col for col in df.columns if str(col).startswith(col_name_str)]
    if not col_list:
        return [0] * len(df)
    # skipna=False keeps NaN propagation identical to element-wise addition.
    return df[col_list].sum(axis=1, skipna=False).tolist()

df['keyword_count'] = sum_columns_starting_with(df, 'keyword_')
df['effect_count'] = sum_columns_starting_with(df, 'effect_')

# Calculate efficiency ratings: metric divided by converted mana cost,
# or 0 when the converted mana cost is not greater than zero.
has_mana_cost = df['convertedManaCost'].gt(0)
efficiency_sources = {
    'efficiency_power': 'power_clean',
    'efficiency_toughness': 'toughness_clean',
    'efficiency_p_plus_t': 'power_plus_toughness',
    'efficiency_keywords': 'keyword_count',
    'efficiency_effects': 'effect_count',
}
for efficiency_col, metric_col in efficiency_sources.items():
    per_mana = np.float64(df[metric_col]) / df['convertedManaCost']
    df[efficiency_col] = np.where(has_mana_cost, per_mana, 0)

def max_columns_starting_with(df, col_name_str):
    """Return the row-wise maximum of every column whose name starts with a prefix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to compare.
    col_name_str : str
        Prefix to match against ``str(column_name)``.

    Returns
    -------
    list
        One maximum per row of ``df``, in row order. NaN values are ignored
        (a row is NaN only when every matching column is NaN); the previous
        pairwise ``max(a, b)`` loop gave results that depended on which column
        held the NaN. An empty list is returned when no column matches.
    """
    col_list = [col for col in df.columns if str(col).startswith(col_name_str)]
    if not col_list:
        return []
    return df[col_list].max(axis=1).tolist()

# Cards with a converted mana cost of 0 whose type does not contain "Land"
# get a fixed efficiency of 12 on every metric.
free_nonland = (df.convertedManaCost == 0) & ~(df['type'].str.contains("Land"))
for efficiency_col in (
    'efficiency_power',
    'efficiency_toughness',
    'efficiency_p_plus_t',
    'efficiency_keywords',
    'efficiency_effects',
):
    df[efficiency_col] = np.where(free_nonland, 12, df[efficiency_col])

# Best efficiency across all 'efficiency_' columns
df['efficiency_max'] = max_columns_starting_with(df, 'efficiency_')

print('Card metrics data frame ready')


In [None]:
df.columns.tolist()

In [None]:
# Some checks!
df[(df['name']=='Shivan Dragon')]
# df[df.name=='Wear // Tear']

# Play / workings

In [None]:
# Import data
# The builtin `object` replaces `np.object`, which was removed in NumPy 1.24.
card_dtypes = {
    'colors': object,
    'faceConvertedManaCost': object,
    'flavorText': object,
    'frameEffects': object,
    'leadershipSkills': object,
    'name': object,
    'text': object,
}

df_base = pd.read_csv("../resources/cards.csv", dtype=card_dtypes, low_memory=False)



In [None]:
df_base.info()

# Things to do

## Remove before isolating rows:
**Remove out of scope cards**
* isOnlineOnly == 1
* isOversized == 1
* isPromo == 1
* layout == 'vanguard'

**Address nan issues:**
* side

In [None]:
df_base.shape

In [None]:
df_base = df_base[(df_base.isOnlineOnly == 0)] # Removes online only cards so they can't end up in final data set

In [None]:
df_base = df_base[(df_base.isOversized == 0)] # Removes oversized versions of cards so they can't end up in final data set

In [None]:
df_base = df_base[(df_base.isPromo == 0)] # Removes promo versions of cards so they can't end up in final data set

In [None]:
# Removes vanguard versions of cards so they can't end up in final data set.
# Keep the filtered frame itself: assigning `.shape` here replaced df_base with a
# tuple and broke every later cell that indexes df_base.
df_base = df_base[~(df_base.layout == 'vanguard')]

In [None]:
df_base[(df_base['name']=='Shivan Dragon')]

In [None]:
# Restrict the raw frame to the fields that are likely to feed feature building.
# `.copy()` detaches the result from df_base so later column assignments are safe.
feature_source_columns = [
    'index', 'id',
    'colorIdentity', 'colorIndicator', 'colors',
    'convertedManaCost', 'faceConvertedManaCost',
    'faceName', 'flavorText', 'hand', 'hasAlternativeDeckLimit',
    'isOnlineOnly', 'isOversized', 'isPromo', 'isReprint',
    'isReserved', 'isStarter', 'isTextless',
    'keywords', 'layout', 'leadershipSkills', 'life', 'loyalty',
    'manaCost', 'multiverseId', 'name', 'number', 'otherFaceIds',
    'power', 'printings', 'rarity', 'setCode', 'side',
    'subtypes', 'supertypes', 'text', 'toughness',
    'type', 'types', 'uuid', 'variations', 'watermark',
]
df = df_base.loc[:, feature_source_columns].copy()

In [None]:
df[df.name=='Shivan Dragon']

In [None]:
df[df.name=='Wear // Tear']

In [None]:
df[df.name=="Jace, Vryn's Prodigy // Jace, Telepath Unbound"]

In [None]:
df[df.name=="Jace, Vryn's Prodigy // Jace, Telepath Unbound"].groupby(['name','side']).cumcount()+1

In [None]:
# Assign back instead of a chained in-place fillna, which acts on a temporary
# and is a no-op under pandas copy-on-write.
df['side'] = df['side'].fillna('normal')

In [None]:
# Add an column that add rownumbers based on the combination of name / side
df['name_row'] = df.sort_values(by='id',ascending=True).groupby(['name','side']).cumcount() + 1

In [None]:
df[df.name=="Shivan Dragon"].isnull().sum()

In [None]:
df = df[(df['name_row']==1)].copy()

In [None]:
df.isnull().sum()

In [None]:
df[(df['colorIdentity'].isnull())]

In [None]:
df[(df['isOnlineOnly']==1)]

In [None]:
df['faceConvertedManaCost'].value_counts()

In [None]:
df[(df['faceConvertedManaCost'].notna())]

In [None]:
df['layout'].value_counts()

## Need to find a way to handle double faced cards in a single row

In [None]:
df['side'].value_counts()

In [None]:
df[(df['otherFaceIds'].notna())]

In [None]:
df[(df['uuid'].isin(['71a0621f-32a6-5450-8ad8-6cdae505cf59']))]

In [None]:
df[df['layout']=='transform']

In [None]:
df[df['layout']=='split']

In [None]:
#df[df['layout']=='adventure']
df[df['name']=='Brazen Borrower // Petty Theft']

In [None]:
#df[df['layout']=='modal_dfc']
df[df['name']=='Valakut Awakening // Valakut Stoneforge']

In [None]:
#df[df['layout']=='flip']
df[df['name']=="Sasaya, Orochi Ascendant // Sasaya's Essence"]

In [None]:
#df[df['layout']=='aftermath']
df[df['name']=="Commit // Memory"]

In [None]:
#df[df['layout']=='leveler']
df[df['name']=="Echo Mage"]

In [None]:
df[df['name']=="Echo Mage"]['text'].tolist()

In [None]:
#df[df['layout']=='saga']
df[df['name']=="Elspeth Conquers Death"]

In [None]:
df[df['name']=="Elspeth Conquers Death"]['text'].tolist()

In [None]:
#df[df['layout']=='host']
df[df['name']=="Angelic Rocket"]

In [None]:
#df[df['layout']=='augment']
df[df['name']=="Half-Kitten, Half-"]

In [None]:
#df[df['layout']=='meld']
df[(
    df['name']=="Midnight Scavengers // Chittering Host") 
    | (df['name']=="Graf Rats // Chittering Host")
    | (df['name']=="Chittering Host")
    ]

In [None]:
df[df['layout']=='vanguard']

# Function - Combine two cards to one line

Considered a function that combines two rows into one.

Because the complexity of this increases rapidly, the data will instead be flagged based on layout.

In [None]:
# 1 when the card's layout is one of the multi-face layouts, else 0.
# Built directly from `isin` instead of a chained in-place `.where`, which
# operates on a temporary and is a no-op under pandas copy-on-write.
double_layouts = ['transform', 'split', 'adventure', 'modal_dfc', 'flip', 'aftermath', 'meld']
df['double_layout'] = df['layout'].isin(double_layouts).astype(int)

In [None]:
df['double_layout'].sum() 

In [None]:
df[(df['double_layout'] ==1)]


# Create function to handle mana cost flags

In a set of `{X}`

**Types of mana - General:**
* White / W
* Blue / U
* Black / B
* Red / R
* Green / G
* Colourless / C
* Generic / i (i is a number)

**Types of mana - Special:**
* Phyrexian / P - Alternate cost is to pay 2 life
* Hybrid / of the form `{a/b}` where b is another form of mana
* X / Y - Variable amount of generic mana, sometimes limited to specific colours by rules text



In [None]:
# Distinct manaCost strings, most frequent first. Read them from the
# value_counts index: `reset_index()['index']` raises KeyError on pandas >= 2.0,
# where that column is named after the Series instead.
a = df.manaCost.value_counts().index.tolist()

# Concatenate every distinct cost into one long string for symbol discovery.
mana_cost_string = ''.join(a)

print(mana_cost_string)

In [None]:
# Match one '{...}' symbol: word characters, optionally '/' and more word characters.
# Raw string so '\{' and '\w' reach the regex engine without an invalid-escape warning.
pat = re.compile(r"\{[\w]+/*?[\w]*?}")
# Sorted list of the distinct symbols found across all mana costs
manaCost_items = sorted(set(pat.findall(mana_cost_string)))
manaCost_items

## Mana cost items that needed further checks:
 '{C}' : Confirmed as colourless mana 
 
 '{HW}' : Confirmed as half a white mana used in 'Un' sets
 
 
 '{S}' : Confirmed as Snow mana, e.g. Arcum's Astrolabe
 
 
 '{Y}','{Z}'


In [None]:
df.manaCost.isna().sum()

In [None]:
df['manaCost_NA'] = df.manaCost.isna()*1 # Column to flag NA values for manaCost

In [None]:
# Use '{none}' in lower case, since all other manaCost letters are upper case.
# Assigned back rather than filled in place through a chained selection.
df['manaCost'] = df['manaCost'].fillna('{none}')

In [None]:
df[df.manaCost.str.contains("{C}")] # Confirmed C is definitely for colourless mana

In [None]:
df[df.manaCost.str.contains("{HW}")] # Confirmed HW is half a white mana, used in 'Un' sets

In [None]:
df[df.manaCost.str.contains("{S}")] # Confirmed S is for "snow" mana

In [None]:
df[df.manaCost.str.contains("{S}")] # Confirmed S is for "snow" mana

In [None]:
df[df.manaCost.str.contains("{Y}")] # Confirmed Y and Z is for an 'Un' card
# https://gatherer.wizards.com/Pages/Card/Details.aspx?multiverseid=9757

In [None]:
s = df[df.name.str.match("Fireball")]['text'].iloc[0]
print(s)

In [None]:
df[df.manaCost.str.contains("{2/W}")] # Look at mana cost

### manaCost columns to create:

*Standard Counts*
- manaCost_W_count: Counts each W once (i.e. {W},{W} = 2)
- manaCost_U_count: Counts each U once (i.e. {U},{U} = 2)
- manaCost_B_count: Counts each B once (i.e. {B},{B} = 2)
- manaCost_R_count: Counts each R once (i.e. {R},{R} = 2)
- manaCost_G_count: Counts each G once (i.e. {G},{G} = 2)
- manaCost_C_count: Counts each Colorless once (i.e. {C},{C} = 2)

- manaCost_Generic_count: Counts each generic mana once (i.e. {4} = 4)

***

*Phyrexian Counts:*

- manaCost_WP_count:	Counts each {W/P} once
- manaCost_UP_count:	Counts each {U/P} once
- manaCost_BP_count:	Counts each {B/P} once
- manaCost_RP_count:	Counts each {R/P} once
- manaCost_GP_count:	Counts each {G/P} once

***

*Hybrid Counts:*

- manaCost_H_WU_count:	Counts each {W/U} hybrid mana once
- manaCost_H_UB_count:	Counts each {U/B} hybrid mana once
- manaCost_H_BR_count:	Counts each {B/R} hybrid mana once
- manaCost_H_RG_count:	Counts each {R/G} hybrid mana once
- manaCost_H_GW_count:	Counts each {G/W} hybrid mana once


- manaCost_H_WB_count:	Counts each {W/B} hybrid mana once
- manaCost_H_UR_count:	Counts each {U/R} hybrid mana once
- manaCost_H_RW_count:	Counts each {R/W} hybrid mana once
- manaCost_H_GU_count:	Counts each {G/U} hybrid mana once
- manaCost_H_BG_count:	Counts each {B/G} hybrid mana once


- manaCost_H_2W_count:	Counts each {2/W} once
- manaCost_H_2B_count:	Counts each {2/B} once
- manaCost_H_2U_count:	Counts each {2/U} once
- manaCost_H_2R_count:	Counts each {2/R} once
- manaCost_H_2G_count:	Counts each {2/G} once

***

*Variable Counts:*

- manaCost_X_count:	Flag if X count (i.e. should only ever be 1)
- manaCost_Y_count:	Flag if Y count (i.e. should only ever be 1)
- manaCost_Z_count:	Flag if Z count (i.e. should only ever be 1)

***

*Other Counts:*

- manaCost_Snow_count:  Counts each S once (i.e. {S} = 1)
- manaCost_HW_count:	Flag if HW count (i.e. should only ever be 1)




In [None]:
# manaCost_W_count test
df.manaCost.str.count('{W}').tolist()


In [None]:
# Generic mana: the number inside the first purely numeric symbol ('{4}{U}' -> 4),
# 0 when there is none. Stored as int rather than a mix of digit strings and 0.
df['manaCost_Generic_count'] = (
    df['manaCost'].str.extract(r'\{(\d+)\}', expand=False).fillna('0').astype(int)
)

# Output column -> mana symbol counted in manaCost.
# The Phyrexian U/B entries were previously swapped (UP counted '{B/P}' and
# BP counted '{U/P}'); each column now counts its own symbol.
mana_symbol_columns = {
    'manaCost_W_count': '{W}',
    'manaCost_U_count': '{U}',
    'manaCost_B_count': '{B}',
    'manaCost_R_count': '{R}',
    'manaCost_G_count': '{G}',
    'manaCost_C_count': '{C}',
    'manaCost_WP_count': '{W/P}',
    'manaCost_UP_count': '{U/P}',
    'manaCost_BP_count': '{B/P}',
    'manaCost_RP_count': '{R/P}',
    'manaCost_GP_count': '{G/P}',
    'manaCost_H_WU_count': '{W/U}',
    'manaCost_H_UB_count': '{U/B}',
    'manaCost_H_BR_count': '{B/R}',
    'manaCost_H_RG_count': '{R/G}',
    'manaCost_H_GW_count': '{G/W}',
    'manaCost_H_WB_count': '{W/B}',
    'manaCost_H_UR_count': '{U/R}',
    'manaCost_H_BG_count': '{B/G}',
    'manaCost_H_RW_count': '{R/W}',
    'manaCost_H_GU_count': '{G/U}',
    'manaCost_H_2W_count': '{2/W}',
    'manaCost_H_2U_count': '{2/U}',
    'manaCost_H_2B_count': '{2/B}',
    'manaCost_H_2R_count': '{2/R}',
    'manaCost_H_2G_count': '{2/G}',
    'manaCost_X_count': '{X}',
    'manaCost_Y_count': '{Y}',
    'manaCost_Z_count': '{Z}',
    'manaCost_Snow_count': '{S}',
    'manaCost_HW_count': '{HW}',
}
for count_column, symbol in mana_symbol_columns.items():
    # str.count takes a regex; escape the braces so the symbol is matched literally.
    df[count_column] = df['manaCost'].str.count(re.escape(symbol))



In [None]:
# Mana costs that contain a purely numeric (generic) symbol such as '{3}'.
# Raw string avoids the invalid-escape warning for '\{' and '\d'.
df.loc[df['manaCost'].str.contains(r'\{[\d]+?\}'), 'manaCost']

In [None]:
?np.where

In [None]:
# Generic mana: the number inside the first purely numeric symbol ('{4}{U}' -> 4),
# 0 when there is none. Stored as int rather than a mix of digit strings and 0.
df['manaCost_Generic_count'] = (
    df['manaCost'].str.extract(r'\{(\d+)\}', expand=False).fillna('0').astype(int)
)

In [None]:
df[['manaCost','manaCost_Generic_count']]

# Break up keyword columns

Want to build a function that does this, since new keywords are added every set, as opposed to mana symbols, which only change every 3-5 years.

In [None]:
df[df.keywords_NA==0]['keywords'].str.contains('\{').sum() # Check that there are no bracers in keywords

In [None]:
df['keywords_NA'] = df.keywords.isna()*1

In [None]:
# Placeholder for cards without keywords; assigned back rather than filled in
# place through attribute access, which is a no-op under pandas copy-on-write.
df['keywords'] = df['keywords'].fillna('{none}')

In [None]:
df.keywords.str.split(",").tolist()

In [None]:
# One list of keyword strings per card; cards without keywords hold ['{none}'].
all_keywords = df['keywords'].str.split(",").tolist()
unique_keywords = sorted({kw for kws in all_keywords for kw in kws if kw != '{none}'})

# Exact-match indicator columns. The previous `str.contains(keyword)` was a
# substring (and regex) match, so e.g. 'Flash' also flagged 'Flashback' cards.
# get_dummies splits on ',' and emits one 0/1 column per distinct token.
keyword_flags = df['keywords'].str.get_dummies(sep=',').drop(columns='{none}', errors='ignore')
keyword_flags.columns = ['keyword_' + kw.replace(' ', '_') for kw in keyword_flags.columns]

# Attach all indicator columns in one concat; drop same-named columns first so
# the cell can be re-run without creating duplicates.
df = pd.concat(
    [df.drop(columns=keyword_flags.columns, errors='ignore'), keyword_flags],
    axis=1,
)


In [None]:
print(unique_keywords)

In [None]:
df[df.keywords.str.contains('Adamant')]['name'].tolist()

In [None]:
df[df.keyword_Adamant == 1]['name'].tolist()

In [None]:
df['keywords_count'] = [len(i) for i in df.keywords.str.split(',').tolist()] * np.where(df.keywords_NA,0,1)

In [None]:
df['keywords_count']

In [None]:
df.loc[55968]

# Effects / Non-keyword abilities

Common spell effects to consider and capture:
<br><br>

**Typically on spells**
<br><br>
* Draw n cards / draw a card *done*
* Discard n cards (differentiate between target player and the words by themselves) *done*
* Destroy target 'type' (Noting that non-land means all types != land) *done*
* Destroy all 'type' - limit flag to creatures as this is generally the effect you care about most *done*
* Exile target 'type' (Noting that non-land means all types != land); In most cases effectively the same as destroy *roll into logic for destroy* *done*
* n damage to target ... (Damage is only relevant to players, creatures, and planeswalkers) *done - replaced by damage effects*
<br><br>

**Typically on creatures / artifacts / enchantments:**
* Enter the battlefield (can be a advantage or disadvantage effect)
* Activated ability (i.e. format of {cost}:{effect})

<br><br>

**Will leave the below for the future after trying some initial modelling with features so far**
* Scry n cards (scry means look at n cards, and choose to put them on top or bottom of your library)
* Pump power or toughness / reduce power or toughness ... target creature / multiple creatures
* Target creature gains a keyword
* Copy
* Return from graveyard to hand
* Return from graveyard to battlefield
* Counter target 'type' spell
* Gain life / lose life
<br><br>



In [None]:
# Flag cards without rules text, then replace missing text with a placeholder.
# The fill is assigned back rather than done in place through attribute access.
df['text_NA'] = df['text'].isna() * 1
df['text'] = df['text'].fillna('{none}')

## Drawing cards
Don't try and consider special circumstances. 
Consider just: 
* the words 'draw' and 'card' appearing together.
* number of cards (if can readily extract from the text)

In [None]:
df[(df.text.str.contains('draw.*?cards',case=False))][['name','text']]

In [None]:
# Other effects
# Drawing cards
# Capture whatever sits between "draw " and "card" (excluding "draw step"),
# e.g. "a ", "two ", "X ". Rows without a match become ''.
df_draw_cards = (
    df['text'].str.extract(r'[Dd]raw(?!\s[Ss]tep)\s(.*?)card?')
    .rename(columns={0: 'extract_text'})
    .fillna('')
)

# (prefix of the captured text, value assigned). Order matters: np.select takes
# the first matching condition, so 'half X' must come before 'X'.
# The earlier fixed-width slices `str[0:5] == 'half X'` and `str[0:0] == 'X'`
# could never be true; prefix matching makes both reachable.
draw_amounts = [
    ('a', 1),
    ('two', 2),
    ('three', 3),
    ('four', 4),
    ('five', 5),
    ('six', 6),
    ('seven', 7),
    ('eight', 8),
    ('nine', 9),
    ('half X', 15),
    ('X', 20),
]

cond = [df_draw_cards['extract_text'].str.startswith(prefix) for prefix, _ in draw_amounts]
output = [amount for _, amount in draw_amounts]

df['effect_draw_cards'] = np.select(cond, output, default=0)

In [None]:
df_draw_cards.value_counts()

In [None]:
df_draw_cards.rename({0:'extract_text'},axis=1,inplace=True) # Rename columns to something meaningful

In [None]:
# (prefix of the captured text, value assigned). Order matters: np.select takes
# the first matching condition, so 'half X' must come before 'X'.
# Prefix matching replaces the fixed-width slices, two of which
# (`str[0:5] == 'half X'`, `str[0:0] == 'X'`) could never be true.
draw_amounts = [
    ('a', 1),
    ('two', 2),
    ('three', 3),
    ('four', 4),
    ('five', 5),
    ('six', 6),
    ('seven', 7),
    ('eight', 8),
    ('nine', 9),
    ('half X', 15),
    ('X', 20),
]

# na=False: rows whose extract is not a string (no match) satisfy no condition.
cond = [
    df_draw_cards['extract_text'].str.startswith(prefix, na=False).astype(bool)
    for prefix, _ in draw_amounts
]
output = [amount for _, amount in draw_amounts]

df['effect_draw_cards'] = np.select(cond, output, default=0)

In [None]:
df['effect_draw_cards'].value_counts()

In [None]:
df[df.text.str.contains('half X')][['name','text']]# Confirm half X likely relates to Hydroid Krasis

In [None]:
df[df.text.str.contains('draw X cards')][['name','text']]

## Discarding cards
* Care about when an effect forces you to discard a card (generally a drawback)
* Care about when an effect forces target player (typically an opponent) to discard cards (generally a positive)
* Loot ability: = 'draw a card, then discard a card'

In [None]:
# Cards whose text mentions discarding a card (raw string: '\s' is a regex escape)
df[df['text'].str.contains(r'[Dd]iscard\s.*?card?', case=False)][['name','text','keywords']]

In [None]:
# Beneficial discards (i.e. apply to opponent / target player - since generally you choose your opponent, unless you're drawing cards)
# Non-capturing group avoids pandas' "match groups" warning in str.contains.
df[df['text'].str.contains(r'[Tt]arget\s(?:opponent|player).*[Dd]iscards\s.*?card?', case=False)][['name','text','keywords']]

In [None]:
# Beneficial discards (i.e. apply to opponent / target player - since generally you choose your opponent, unless you're drawing cards)
# Raw string + non-capturing group: same matches, no escape or match-group warnings.
df_extract = df['text'].str.contains(r'[Tt]arget\s(?:opponent|player).*[Dd]iscards\s.*?card?', case=False)
df['effect_discard_target_player'] = df_extract * 1

In [None]:

# Cards whose text contains "discard ... card" (case-insensitive)
df[df['text'].str.contains(r'Discard\s.*?card?', case=False)][['name','text','keywords']]

In [None]:
# Discard own cards (cost): 1 when the text contains "discard ... card" (case-insensitive)
df_extract = df['text'].str.contains(r'Discard\s.*?card?', case=False)
df['effect_discard_own_cards'] = df_extract * 1

In [None]:
df['effect_discard_own_cards'].sum()

### Loot ability

In [None]:
# Loot ability check
df[(df.text.str.contains('[Dd]raw a card, then discard a card',case=False))][['name','text']]

In [None]:
# Loot ability dummy variable
df_extract = (df.text.str.contains('[Dd]raw a card, then discard a card',case=False))
df['effect_loot'] = df_extract*1

In [None]:
(df.text.str.contains('[Dd]raw a card, then discard a card',case=False)).sum()

# Destroy target effects

In [None]:
df[(df.text.str.contains('[Dd]estroy.*target.*?',case=False))][['name','text','keywords']]

In [None]:
# Destroying cards on the battlefield: capture the text following "destroy ... target"
df_extract = df['text'].str.extract(r'[Dd]estroy\s.*target(.*\.)+?\s*?')
df_extract = df_extract.rename(columns={0: 'extract_text'}).fillna(0)


In [None]:
df_extract.value_counts()

In [None]:
# Destroying artifacts
df_extract = df['text'].str.contains(r'[Dd]estroy\s.*target.*artifact?(?:\.|\s)')
df[df_extract][['name','text','keywords']]

In [None]:
# Destroying creatures - not perfect but good for most circumstances, as it picks up destruction even when conditions are present
df_extract = df['text'].str.contains(r'[Dd]estroy\s.*target.*creature?(?:\.|\s)')
df[df_extract][['name','text','keywords']]

In [None]:
# Destroying lands - 'island' is a land type and 'nonland' / 'landwalk' are common, so all three are excluded
df_extract = df['text'].str.contains(r'[Dd]estroy\s.*target.*?(?<!is)(?<!non)land(?!walk)(?=\.|\s)?')
df[df_extract][['name','text','keywords']]

In [None]:
# Destroying nonland permanents - equivalent to Creature, Artifact, Enchantment, Planeswalker
df_extract = df['text'].str.contains(r'[Dd]estroy\s.*target.*?nonland(?=\.|\s)?')
df[df_extract][['name','text','keywords']]

In [None]:
# Destroying Enchantments
df_extract = df['text'].str.contains(r'[Dd]estroy\s.*target.*?enchantment(?=\.|\s)?')
df[df_extract][['name','text','keywords']]

In [None]:
# Destroying planeswalkers - picks up a couple of cases where there are incidental effects
df_extract = df['text'].str.contains(r'[Dd]estroy\s.*target.*?planeswalker(?=\.|\s)?')
df[df_extract][['name','text','keywords']]

In [None]:
# Destroying permanents - matches 'permanent' only when it is not preceded by 'nonland '
df_extract = df['text'].str.contains(r'[Dd]estroy\s.*target.*(?<!nonland\s)permanent(?=\.|\s)?')
df[df_extract][['name','text','keywords']]

In [None]:
[1,3,4]*True

In [None]:
df[(df.effect_destroy_enchantment==1)][['name','text','keywords']]

In [None]:
# Destroying 'all' creatures
df_extract = df['text'].str.contains(r'[Dd]estroy\s.*all.*creatures(?=\.|\s)?')
df[df_extract][['name','text','keywords']]

## Add exile effects to destroy logic
Why? - they do almost the same thing from an in game perspective as far as most drafts are concerned.
This should be good enough for a first pass of the model

In [None]:
# Destroy OR exile target creature. Non-capturing groups avoid the pandas
# "match groups" warning; the raw string keeps '\s' and '\.' as regex escapes.
df_extract = df['text'].str.contains(r'(?:[Dd]estroy|[Ee]xile)\s.*target.*creature?(?:\.|\s)')
df[df_extract][['name','text','keywords']]

## Damage effects
X damage to target... Creature / Planeswalker / Player

X damage to all...

For now just flag if a card deals damage

In [None]:
# Cards that deal damage to something other than "you", excluding "combat damage to" triggers.
# The exclusion is a lookbehind: a lookahead placed directly before 'damage'
# can never see 'combat ', so the old `(?!combat\s)` filtered nothing.
df_extract = df['text'].str.contains(r'(?:deals)+\s[\dX]*.*(?<!combat\s)damage\sto(?!\syou)')
df[df_extract][['name','text','keywords']]

In [None]:
# Split "deals <amount> damage to [any] target <...> <creature|player|planeswalker...>"
# into its capture groups; rows without a match are filled with 0.
df_extract = df['text'].str.extract(
    r'deals\s(.*)damage\sto(\s|\sany\s)target(.*)(creature.*|player.*|planeswalker.*)'
)
df_extract = df_extract.fillna(0)

## Counterspell Effects
* Often a characteristic of some instants

In [None]:
# Cards whose text mentions countering a spell ("Counter target spell", "counter that spell", ...).
# na=False: cards without rules text count as non-matches, keeping the mask boolean.
df_extract = df.text.str.contains(r'[Cc]ounter.*spell', na=False)
df[df_extract][['name','text','keywords']]

## Enter the battlefield triggers


In [None]:
# Cards that mention "enter(s) the battlefield".
# `s?` replaces the capturing `(s)?` so str.contains does not warn about match groups;
# the raw string keeps \s as a regex escape. na=False keeps the mask boolean.
df_extract = df.text.str.contains(r'[Ee]nters?\sthe\sbattlefield', na=False)
df[df_extract][['name','text','keywords']]

In [None]:
# Sacrificing after an ETB trigger is a regular cost
# `s?` replaces the capturing `(s)?` so str.contains does not warn about match groups;
# the raw string keeps \s as a regex escape. na=False keeps the mask boolean.
df_extract = df.text.str.contains(r'[Ee]nters?\sthe\sbattlefield.*[Ss]acrifice\sit', na=False)
df[df_extract][['name','text','keywords']]

## Has an activated ability
Activated abilities are always of the form {cost}:{effect}

In [None]:
# A card has an activated ability if its text contains a colon anywhere.
# Searching for '.*:.*' is the same test as searching for a literal ':', so do the plain
# substring search (regex=False). na=False treats cards without rules text as non-matches.
df_extract = df.text.str.contains(':', regex=False, na=False)
df[df_extract][['name','text','keywords']]

In [None]:
# Count regex matches of '.*:.*' per card. '.' does not match a newline, so each
# newline-separated line of rules text that contains a colon contributes exactly one match
# (two colons on the same line still count once).
df_extract = df.text.str.count('.*:.*')
# Distribution of that per-card count
df_extract.value_counts()

In [None]:
# Cards with more than four colon-containing lines of rules text
df.loc[df['text'].str.count('.*:.*') > 4, ['name','text','keywords']]

# Efficiency metrics

Some form of efficiency metric is necessary when judging cards against each other because, in almost all circumstances, when two cards are identical apart from their mana cost, the card with the lower mana cost will be better.

This is needed as it is often a key element of what makes a card more powerful than another. In general the same effect on an Instant is stronger than on a Sorcery, so we would expect that Instants on average are ranked higher than sorceries for a given effect.

For example: 
* The card *Ancestral Recall*, an instant, draws 3 cards for {U}
* The card *Divination*, a sorcery, draws 2 cards for {2}{U}

*Ancestral Recall* is restricted or banned in key formats due to its power, while *Divination* is a baseline level of card drawing.


***
To set up a mana efficiency metric we need to set up counts. The obvious ones are:

1. Power
2. Toughness
3. Power + Toughness
4. Count of keywords
5. Count of effects (maybe weight draw a card stronger than most - since it's particularly good)
6. Count of activated abilities (these offer flexibility)
7. Count of words

In [None]:
# Replace missing power with the '{none}' placeholder. The filled column is assigned back
# explicitly: calling fillna(inplace=True) on `df.power` is chained assignment, which
# newer pandas versions warn about and, under copy-on-write, do not write through to df.
df['power'] = df['power'].fillna('{none}')
# True where power contains a non-digit character other than the letters/braces of '{none}'
df_extract = df['power'].str.contains(r'\D(?<![{noe}])')

df[df_extract][['name','text','convertedManaCost','power','toughness']]

# These are cases where card text defines the power and toughness
# These will be treated as if they are power / toughness = max(convertedManaCost / 2,1)

In [None]:
# Build a numeric `power_clean` column:
#   * text-defined power (non-numeric printed value) -> max(convertedManaCost / 2, 1)
#   * missing power ('{none}' placeholder)           -> 0
#   * everything else                                -> the printed number
# The result is a single float column. Mixing printed strings with computed floats in one
# object column makes value_counts() treat e.g. '0' and 0 as different values.

# Assign the fill back explicitly; fillna(inplace=True) on `df.power` is chained assignment.
df['power'] = df['power'].fillna('{none}')

# True where power has a non-digit character other than the letters/braces of '{none}'
power_is_text_defined = df['power'].str.contains(r'\D(?<![{noe}])')

# Stand-in value for text-defined power; clip(lower=1) is the vectorised max(x, 1)
power_estimate = (df['convertedManaCost'] / 2).clip(lower=1)

# Printed power as a number. errors='coerce' is needed because the whole column is parsed,
# including the text-defined rows that np.where replaces below; the placeholder becomes 0.
power_printed = pd.to_numeric(df['power'], errors='coerce').mask(df['power'] == '{none}', 0)

df['power_clean'] = np.where(power_is_text_defined, power_estimate, power_printed)

In [None]:

# Inspect the rows whose power is text-defined. Only columns that already exist are
# selected, so the cell still runs if toughness_clean has not been created yet.
inspect_cols = ['name','convertedManaCost','power','power_clean','toughness','toughness_clean']
df.loc[
    df.power.str.contains(r'\D(?<![{noe}])'),
    [col for col in inspect_cols if col in df.columns],
]

In [None]:
# Frequency of each distinct power_clean value
df['power_clean'].value_counts()

In [None]:

# Inspect the rows whose toughness is text-defined.
# na=False: rows with missing toughness count as non-matches, so the mask is boolean even
# when toughness has not been filled yet. Only columns that already exist are selected, so
# the cell still runs if toughness_clean has not been created yet.
inspect_cols = ['name','convertedManaCost','power','power_clean','toughness','toughness_clean']
df.loc[
    df.toughness.str.contains(r'\D(?<![{noe}])', na=False),
    [col for col in inspect_cols if col in df.columns],
]

In [None]:
# Build a numeric `toughness_clean` column:
#   * text-defined toughness (non-numeric printed value) -> max(convertedManaCost / 2, 1)
#   * missing toughness ('{none}' placeholder)           -> 0
#   * everything else                                    -> the printed number
# The result is a single float column. Mixing printed strings with computed floats in one
# object column makes value_counts() treat e.g. '0' and 0 as different values.

# Assign the fill back explicitly; fillna(inplace=True) on `df.toughness` is chained assignment.
df['toughness'] = df['toughness'].fillna('{none}')

# True where toughness has a non-digit character other than the letters/braces of '{none}'
toughness_is_text_defined = df['toughness'].str.contains(r'\D(?<![{noe}])')

# Stand-in value for text-defined toughness; clip(lower=1) is the vectorised max(x, 1)
toughness_estimate = (df['convertedManaCost'] / 2).clip(lower=1)

# Printed toughness as a number. errors='coerce' is needed because the whole column is parsed,
# including the text-defined rows that np.where replaces below; the placeholder becomes 0.
toughness_printed = pd.to_numeric(df['toughness'], errors='coerce').mask(df['toughness'] == '{none}', 0)

df['toughness_clean'] = np.where(toughness_is_text_defined, toughness_estimate, toughness_printed)



In [None]:
# Frequency of each distinct toughness_clean value
df['toughness_clean'].value_counts()

In [None]:
# Combined body size: cast both cleaned columns to float64 arrays, add them element-wise,
# and store the float64 result.
power_values = np.float64(df['power_clean'])
toughness_values = np.float64(df['toughness_clean'])
df['power_plus_toughness'] = np.float64(power_values + toughness_values)

In [None]:
df['power_plus_toughness'].value_counts()
# Note: cross-checked for reasons why 10,093 > 9,913, but couldn't identify any issues on a first pass.
# Something to investigate down the track.

In [None]:
def sum_columns_starting_with(df, col_name_str):
    """Return the row-wise sum of every column whose name starts with a prefix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to add up. It is not modified.
    col_name_str : str
        Prefix to match against ``str(column_name)``.

    Returns
    -------
    list
        One total per row of ``df``, in row order. A missing value in any
        matched column makes that row's total NaN. If no column matches the
        prefix, an empty list is returned.
    """
    # Boolean mask over the columns; selecting by mask picks each matching column once.
    prefix_mask = [str(col).startswith(col_name_str) for col in df.columns]
    if not any(prefix_mask):
        return []
    # skipna=False keeps plain-addition semantics: NaN in any addend gives a NaN total.
    return df.loc[:, prefix_mask].sum(axis=1, skipna=False).tolist()

# 'keyword_count' and 'effect_count' themselves start with the prefixes being summed.
# Drop any previously computed total before summing so that re-running this cell gives the
# same answer instead of folding the old total into the new one. errors='ignore' makes the
# drop a no-op on the first run, when the column does not exist yet.
df['keyword_count'] = sum_columns_starting_with(df.drop(columns=['keyword_count'], errors='ignore'),'keyword_')
df['effect_count'] = sum_columns_starting_with(df.drop(columns=['effect_count'], errors='ignore'),'effect_')

In [None]:
# Scratch prototype of a running element-wise sum over several column lists.
# NOTE(review): b, c and d all read 'keyword_Affinity' - these look like placeholder copies; confirm.
a = df['keyword_Addendum'].tolist()
b = df['keyword_Affinity'].tolist()
c = df['keyword_Affinity'].tolist()
d = df['keyword_Affinity'].tolist()

col_list = [a,b,c,d]
sum_values = []

for i in col_list:
    if sum_values == []:
        # First pass: seed the running total. This uses `a` directly rather than `i`;
        # the two are the same object here because `a` is col_list[0].
        sum_values = a
    else:
        # Add the current list to the running total position by position.
        # In Python 3 the comprehension's own a, b are scoped to the comprehension and
        # do not rebind the outer lists of the same name.
        sum_values = [a+b for a,b in zip(sum_values, i)]

# Display the final per-row totals
sum_values

In [None]:
def max_columns_starting_with(df, col_name_str):
    """Return the row-wise maximum of every column whose name starts with a prefix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to compare. It is not modified.
    col_name_str : str
        Prefix to match against ``str(column_name)``.

    Returns
    -------
    list
        One maximum per row of ``df``, in row order. Missing values are
        ignored wherever they occur; a row is NaN only when every matched
        column is missing for it. If no column matches the prefix, an empty
        list is returned.
    """
    # Boolean mask over the columns; selecting by mask picks each matching column once.
    prefix_mask = [str(col).startswith(col_name_str) for col in df.columns]
    if not any(prefix_mask):
        return []
    # Pairwise max(a, b) depends on argument order when NaN is involved (a NaN on the
    # left sticks, a NaN on the right is dropped). The vectorised max skips NaN
    # consistently, whichever column it appears in.
    return df.loc[:, prefix_mask].max(axis=1).tolist()

In [None]:
# Size, count and efficiency figures for the first row named 'Black Lotus'
summary_cols = [
    'convertedManaCost',
    'power_clean',
    'toughness_clean',
    'power_plus_toughness',
    'keyword_count',
    'effect_count',
    'efficiency_power',
    'efficiency_toughness',
    'efficiency_p_plus_t',
    'efficiency_keywords',
    'efficiency_effects',
    'efficiency_max',
]
df.loc[df['name'] == 'Black Lotus', summary_cols].iloc[0]

In [None]:
# Size, count and efficiency figures for the first row named 'Shivan Dragon'
summary_cols = [
    'convertedManaCost',
    'power_clean',
    'toughness_clean',
    'power_plus_toughness',
    'keyword_count',
    'effect_count',
    'efficiency_power',
    'efficiency_toughness',
    'efficiency_p_plus_t',
    'efficiency_keywords',
    'efficiency_effects',
    'efficiency_max',
]
df.loc[df['name'] == 'Shivan Dragon', summary_cols].iloc[0]

In [None]:
# Size, count and efficiency figures for the first row named 'Kjeldoran Outpost'
summary_cols = [
    'convertedManaCost',
    'power_clean',
    'toughness_clean',
    'power_plus_toughness',
    'keyword_count',
    'effect_count',
    'efficiency_power',
    'efficiency_toughness',
    'efficiency_p_plus_t',
    'efficiency_keywords',
    'efficiency_effects',
    'efficiency_max',
]
df.loc[df['name'] == 'Kjeldoran Outpost', summary_cols].iloc[0]

In [None]:
# Find ways to exclude land from efficiency metrics, when convertedManaCost is zero.
# Tally the type lines of zero-cost cards whose type does not contain "Land".
zero_cost = df['convertedManaCost'] == 0
is_land = df['type'].str.contains("Land")
df.loc[zero_cost & ~is_land, ['type']].value_counts()