In [1]:
import json
import os

import numpy as np
import pandas as pd
import plotly.express as px
import requests
import streamlit as st
from category_encoders import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

In [2]:
# Setting - Draft sets to import to support scoring

all_draftaholics_sets = (
    'AER MM3 AKH HOU XLN '
    'IMA UST RIX A25 DOM '
    'BBD M19 GRN UMA RNA '
    'WAR MH1 M20 ELD MB1 '
    'THB IKO CUB M21 2XM '
    'AKR ZNR KLR CMR'
).split()


# Helper functions to display data

def combine_columns_as_str(df,col_list,seperator="|"):
    """Combine the values of several columns into one string per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to combine.
    col_list : list
        Column names whose values are joined, in the order given.
    seperator : str, optional
        Text placed between the values of a row (default "|").

    Returns
    -------
    list of str
        One combined string per row of `df`; an empty list when `col_list`
        is empty.
    """
    # tolist() yields plain Python scalars, so str() renders e.g. 1 rather than a numpy repr
    columns = [df[col].tolist() for col in col_list]
    return [seperator.join(map(str, row_values)) for row_values in zip(*columns)]
    
    
    
# Helper functions to set up data

def sum_columns_starting_with(df, col_name_str):
    """Return the row-wise sum of every column whose name starts with col_name_str.

    The result is a list with one total per row; it is an empty list when no
    column name matches.
    """
    matching_cols = [col for col in df.columns.tolist() if str(col).startswith(col_name_str)]
    totals = []
    for col in matching_cols:
        if not totals:
            # First matching column (or nothing accumulated yet): start from its values
            totals = df[col].tolist()
            continue
        totals = [running + value for running, value in zip(totals, df[col])]
    return totals

def max_columns_starting_with(df, col_name_str):
    """Return the row-wise maximum of every column whose name starts with col_name_str.

    The result is a list with one maximum per row; it is an empty list when no
    column name matches.
    """
    matching_cols = [col for col in df.columns.tolist() if str(col).startswith(col_name_str)]
    best = []
    for col in matching_cols:
        if not best:
            # First matching column (or nothing accumulated yet): start from its values
            best = df[col].tolist()
            continue
        best = [max(current, value) for current, value in zip(best, df[col])]
    return best


def import_card_data_features(card_data_path="./resources/cards.csv"):
    """Import card data, set up model features, and return them as a data frame.

    Data sourced from: https://mtgjson.com/api/v5/AllPrintingsCSVFiles.zip

    Steps, in order:
      1. read the CSV and drop online-only, oversized, promo and vanguard rows;
      2. keep one row per (name, side) pair, choosing the smallest `id`;
      3. add mana-cost symbol counts (`manaCost_*`);
      4. one-hot encode keywords (`keyword_*`);
      5. add rules-text effect flags/counts (`effect_*`);
      6. add cleaned power/toughness and per-mana efficiency metrics.

    Parameters
    ----------
    card_data_path : str
        Path of the cards CSV file.

    Returns
    -------
    pandas.DataFrame
        Feature frame indexed by `name_1` (a copy of the `name` column).
    """
    # `object` is used instead of `np.object`; that alias was removed from NumPy.
    card_dtypes = {
        'colors': object,
        'faceConvertedManaCost': object,
        'flavorText': object,
        'frameEffects': object,
        'leadershipSkills': object,
        'name': object,
        'text': object,
    }

    df_base = pd.read_csv(card_data_path,dtype=card_dtypes,low_memory=False)

    # Remove key rows which have data we won't use
    df_base = df_base[(df_base.isOnlineOnly == 0)]
    df_base = df_base[(df_base.isOversized == 0)]
    df_base = df_base[(df_base.isPromo == 0)]
    df_base = df_base[~(df_base.layout == 'vanguard')]

    # Keep fields likely to support data feature build
    df = df_base[[
        'index',
        'id',
        'colorIdentity',
        'colorIndicator',
        'colors',
        'convertedManaCost',
        'faceConvertedManaCost',
        'faceName',
        'flavorText',
        'hand',
        'hasAlternativeDeckLimit',
        'isOnlineOnly',
        'isOversized',
        'isPromo',
        'isReprint',
        'isReserved',
        'isStarter',
        'isTextless',
        'keywords',
        'layout',
        'leadershipSkills',
        'life',
        'loyalty',
        'manaCost',
        'multiverseId',
        'name',
        'number',
        'otherFaceIds',
        'power',
        'printings',
        'rarity',
        'setCode',
        'side',
        'subtypes',
        'supertypes',
        'text',
        'toughness',
        'type',
        'types',
        'uuid',
        'variations',
        'watermark'
    ]].copy()

    ############################################################
    ############################################################

    # Create unique row per card name / allowing for multiple faces (i.e. names may be duplicated)
    # 'side' needs to be filled in or groupby portion of statement doesn't work properly
    df['side'] = df['side'].fillna('normal')
    df['name_row'] = df.sort_values(by='id',ascending=True).groupby(['name','side']).cumcount() + 1
    # .copy() so the column assignments below write to an independent frame, not a filtered view
    df = df[(df['name_row'] == 1)].copy()

    # Flag double layout cards
    double_layouts = ['transform','split','adventure','modal_dfc','flip','aftermath','meld']
    df['double_layout'] = df['layout'].isin(double_layouts) * 1

    ############################################################
    ############################################################

    # Add in mana cost counts
    df['manaCost_NA'] = df.manaCost.isna()*1 # Column to flag NA values for manaCost
    df['manaCost'] = df['manaCost'].fillna('{none}') # Use '{none}' in lower case, since all other manaCost letters in upper case.

    # First numeric symbol (e.g. '{3}') as an integer; 0 when the cost has none.
    generic_symbol = df.manaCost.str.extract(r'\{(\d+)\}',expand=False)
    df['manaCost_Generic_count'] = pd.to_numeric(generic_symbol).fillna(0).astype(int)

    # (output column, mana symbol) pairs; the order fixes the column order of the frame
    mana_symbol_columns = [
        ('manaCost_W_count', '{W}'),
        ('manaCost_U_count', '{U}'),
        ('manaCost_B_count', '{B}'),
        ('manaCost_R_count', '{R}'),
        ('manaCost_G_count', '{G}'),
        ('manaCost_C_count', '{C}'),
        ('manaCost_WP_count', '{W/P}'),
        ('manaCost_UP_count', '{U/P}'),  # previously counted '{B/P}'
        ('manaCost_BP_count', '{B/P}'),  # previously counted '{U/P}'
        ('manaCost_RP_count', '{R/P}'),
        ('manaCost_GP_count', '{G/P}'),
        ('manaCost_H_WU_count', '{W/U}'),
        ('manaCost_H_UB_count', '{U/B}'),
        ('manaCost_H_BR_count', '{B/R}'),
        ('manaCost_H_RG_count', '{R/G}'),
        ('manaCost_H_GW_count', '{G/W}'),
        ('manaCost_H_WB_count', '{W/B}'),
        ('manaCost_H_UR_count', '{U/R}'),
        ('manaCost_H_BG_count', '{B/G}'),
        ('manaCost_H_RW_count', '{R/W}'),
        ('manaCost_H_GU_count', '{G/U}'),
        ('manaCost_H_2W_count', '{2/W}'),
        ('manaCost_H_2U_count', '{2/U}'),
        ('manaCost_H_2B_count', '{2/B}'),
        ('manaCost_H_2R_count', '{2/R}'),
        ('manaCost_H_2G_count', '{2/G}'),
        ('manaCost_X_count', '{X}'),
        ('manaCost_Y_count', '{Y}'),
        ('manaCost_Z_count', '{Z}'),
        ('manaCost_Snow_count', '{S}'),
        ('manaCost_HW_count', '{HW}'),
    ]
    for col_name, symbol in mana_symbol_columns:
        df[col_name] = df.manaCost.str.count(symbol)

    ############################################################
    ############################################################

    # OneHot Encode all keywords in the data, and add some other info related to keywords
    df['keywords_NA'] = df.keywords.isna()*1
    df['keywords'] = df['keywords'].fillna('{none}')
    keyword_lists = df.keywords.str.split(',').tolist()
    df['keywords_count'] = np.array([len(i) for i in keyword_lists]) * np.where(df.keywords_NA,0,1)

    # Whole-token matching: a substring test would flag e.g. 'Flash' on cards that only have 'Flashback'
    keyword_sets = [
        {kw.strip() for kw in kws if kw != '{none}'}
        for kws in keyword_lists
    ]
    unique_keywords = sorted(set().union(*keyword_sets))

    for keyword in unique_keywords:
        col_name = 'keyword_' + keyword.replace(' ','_')
        df[col_name] = [int(keyword in kws) for kws in keyword_sets]

    ############################################################
    ############################################################

    # Other effects ############################################################

    # Flag missing rules text before it is replaced by the '{none}' placeholder
    df['text_NA'] = df.text.isna()*1
    df['text'] = df['text'].fillna('{none}')

    # Drawing cards (generally a benefit)
    # Captures the words between 'draw' and 'card' (e.g. 'a ', 'two '); '' when there is no match
    draw_text = df.text.str.extract(r'[Dd]raw(?!\s[Ss]tep)\s(.*?)card?',expand=False).fillna('')

    cond = [
        draw_text.str[0:1] == 'a',
        draw_text.str[0:3] == 'two',
        draw_text.str[0:5] == 'three',
        draw_text.str[0:4] == 'four',
        draw_text.str[0:4] == 'five',
        draw_text.str[0:3] == 'six',
        draw_text.str[0:5] == 'seven',
        draw_text.str[0:5] == 'eight',
        draw_text.str[0:4] == 'nine',
        draw_text.str[0:6] == 'half X',  # slice length now matches the 6-character literal
        draw_text.str[0:1] == 'X',       # was str[0:0], which is always empty
    ]

    # Value assigned per condition above; 15 and 20 are the placeholder scores for 'half X' and 'X'
    output = [1,2,3,4,5,6,7,8,9,15,20]

    df['effect_draw_cards'] = np.select(cond,output,default=0)

    # Beneficial discards (i.e. apply to opponent / target player - since generally you choose your opponent, unless you're drawing cards)
    df_extract = (df.text.str.contains(r'[Tt]arget\s(?:opponent|player).*[Dd]iscards\s.*?card?',case=False))
    df['effect_discard_target_player'] = df_extract*1

    # Discard own cards (cost)
    df_extract = (df.text.str.contains(r'Discard\s.*?card?',case=False))
    df['effect_discard_own_cards'] = df_extract*1

    # Loot ability dummy variable
    df_extract = (df.text.str.contains(r'[Dd]raw a card, then discard a card',case=False))
    df['effect_loot'] = df_extract*1

    # Destroy effects / exile effects
    # Note - are just treating destroy and exile as identical effects for now, for the purpose of getting a model working
    #        ideally would split these up, and allow for some more nuance
    # Note - need to go back and check interaction of nonland and permanent to make sure it is handle properly
    # Non-capturing groups are used throughout so str.contains does not warn about unused match groups.
    df_extract_nonland = df.text.str.contains(r'(?:[Dd]estroy|[Ee]xile)\s.*target.*?nonland(?=\.|\s)?')*1
    df_extract_permanent = df.text.str.contains(r'(?:[Dd]estroy|[Ee]xile)\s.*target.*(?<!nonland\s)permanent(?=\.|\s)?')*1

    df_extract = df.text.str.contains(r'(?:[Dd]estroy|[Ee]xile)\s.*target.*artifact?(?:\.|\s)')*1
    df['effect_destroy_artifact'] = df_extract + df_extract_nonland + df_extract_permanent

    df_extract = df.text.str.contains(r'(?:[Dd]estroy|[Ee]xile)\s.*target.*creature?(?:\.|\s)')*1
    df['effect_destroy_creature'] = df_extract + df_extract_nonland + df_extract_permanent

    df_extract = df.text.str.contains(r'(?:[Dd]estroy|[Ee]xile)\s.*target.*?(?<!is)(?<!non)land(?!walk)(?=\.|\s)?')*1
    df['effect_destroy_land'] = df_extract + df_extract_permanent

    df_extract = df.text.str.contains(r'(?:[Dd]estroy|[Ee]xile)\s.*target.*?enchantment(?=\.|\s)?')*1
    df['effect_destroy_enchantment'] = df_extract + df_extract_nonland + df_extract_permanent

    df_extract = df.text.str.contains(r'(?:[Dd]estroy|[Ee]xile)\s.*target.*?planeswalker(?=\.|\s)?')*1
    df['effect_destroy_planeswalker'] = df_extract + df_extract_nonland + df_extract_permanent

    # Destroying 'all' creatures
    df_extract = df.text.str.contains(r'(?:[Dd]estroy|[Ee]xile)\s.*all.*creatures(?=\.|\s)?')*1
    df['effect_destroy_all_creatures'] = df_extract

    # Deals damage effects
    # Focus on damage to others, and excludes combat damage to... triggers
    # A lookbehind is required here: a lookahead placed directly before 'damage' can never see 'combat '.
    df_extract = df.text.str.contains(r'(?:deals)+\s[\dX]*.*(?<!combat\s)damage\sto(?!\syou)')
    df['effect_deals_damage'] = df_extract*1

    # Counter spell effects
    df_extract = df.text.str.contains(r'[Cc]ounter.*spell')
    df['effect_counter_target_spell'] = df_extract*1

    # Enters the battlefield effect
    df_extract = df.text.str.contains(r'[Ee]nters?\sthe\sbattlefield')
    df['effect_enter_the_battlefield'] = df_extract*1

    df_extract = df.text.str.contains(r'[Ee]nters?\sthe\sbattlefield.*[Ss]acrifice\sit')
    df['effect_enter_the_battlefield_sacrific_it'] = df_extract*1

    # Activate ability as an effect
    # str.count here yields the number of text lines that contain a ':' (a count, not a 0/1 flag)
    df_extract = df.text.str.count(r'.*:.*')
    df['effect_has_activated_ability'] = df_extract*1

    ############################################################
    ############################################################

    # Set up base lines for efficiency metrics

    # Power + Toughness
    # Non-numeric stats (anything with a non-digit character other than the '{none}' placeholder)
    # are replaced by an estimate of max(convertedManaCost / 2, 1); missing stats become 0.
    variable_stat_estimate = np.array([max(i/2,1) for i in df['convertedManaCost'].tolist()])
    for stat in ('power','toughness'):
        df[stat] = df[stat].fillna('{none}')
        is_variable = df[stat].str.contains(r'\D(?<![{noe}])')
        cleaned = np.where(is_variable,variable_stat_estimate,df[stat])
        df[stat + '_clean'] = np.where(df[stat]=='{none}',0,cleaned)

    # P+T Calculation
    df['power_plus_toughness'] = np.float64(df['power_clean']) + np.float64(df['toughness_clean'])

    # Count keywords and effects

    df['keyword_count'] = sum_columns_starting_with(df,'keyword_')
    df['effect_count'] = sum_columns_starting_with(df,'effect_')

    # Calculate efficiency ratings: metric per point of converted mana cost
    efficiency_sources = [
        ('efficiency_power', 'power_clean'),
        ('efficiency_toughness', 'toughness_clean'),
        ('efficiency_p_plus_t', 'power_plus_toughness'),
        ('efficiency_keywords', 'keyword_count'),
        ('efficiency_effects', 'effect_count'),
    ]
    # Fixed rating given to zero-cost cards whose type line does not contain "Land"
    free_nonland_efficiency = 12

    cmc = df['convertedManaCost']
    has_cost = cmc.gt(0)
    free_nonland = (cmc==0) & ~(df['type'].str.contains("Land",na=False))

    for efficiency_col, source_col in efficiency_sources:
        per_mana = np.where(has_cost,np.float64(df[source_col])/cmc,0)
        df[efficiency_col] = np.where(free_nonland,free_nonland_efficiency,per_mana)

    df['efficiency_max'] = max_columns_starting_with(df,'efficiency_')

    # Set index to speed up name merge later on
    df['name_1'] = df['name']
    df = df.set_index('name_1')

    print("Loaded card features.")

    return df


def get_draft_scores(set_list, output_dir='./draft-scores/', timeout=30):
    """Download draft scores from the draftaholicsanonymous site, one file per set.

    It only needs to be run when you want to update the scores you have on file.
    A set that cannot be retrieved is reported with a printed message and
    skipped; the remaining sets are still processed.

    Parameters
    ----------
    set_list : iterable of str
        Set codes (typically the standard three-letter code used to identify
        Magic: the Gathering sets).
    output_dir : str, optional
        Directory that receives `scores_<code>.txt`; created if missing.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up.
    """
    os.makedirs(output_dir, exist_ok=True)

    for set_code in set_list:
        req_path = "https://apps.draftaholicsanonymous.com/p1p1/" + set_code + "/results?ajax"

        try:
            # Without a timeout a stalled connection would block the whole refresh
            results = requests.get(req_path, timeout=timeout)
        except requests.RequestException as err:
            print(f'Was unable to retrieve data for {set_code}: {err}')
            continue

        if results.status_code != 200:
            print(f'Was unable to retrieve data for {set_code}')
            continue

        try:
            score_rows = json.loads(results.text)['data']
        except (ValueError, KeyError, TypeError) as err:
            # Response was not JSON, or did not contain the expected 'data' entry
            print(f'Was unable to parse data for {set_code}: {err}')
            continue

        output_path = os.path.join(output_dir, 'scores_' + set_code + '.txt')
        with open(output_path,'w') as outfile:
            json.dump(score_rows,outfile)


def import_card_draft_scores(set_list,draft_score_file_path='./draft-scores/'):
    """Load stored draft scores and add elo score transforms.

    Note - only need to run get_draft_scores if we want to refresh data,
    otherwise this works from the underlying `scores_<code>.txt` files.

    Parameters
    ----------
    set_list : iterable of str
        Set codes whose score files are loaded, in order.
    draft_score_file_path : str, optional
        Directory containing the score files.

    Returns
    -------
    (df_scores, df_scores_unique)
        df_scores: one row per scored card per set, with `elo_log`,
        `elo_relative_all` and `elo_relative_set` added and the website
        image / placeholder columns removed.
        df_scores_unique: the four elo columns averaged per
        (name, back_name), indexed by `name_1`, where the name of a
        two-faced card is 'front // back'.

    Raises
    ------
    ValueError
        If `set_list` is empty.
    """

    def load_draft_scores_to_pandas(set_list,draft_score_file_path='./draft-scores/'):
        # This function loads the files generated by get_draft_scores into a pandas data frame.
        frames = []
        running_total = 0
        for set_code in set_list:
            load_path = os.path.join(draft_score_file_path, 'scores_' + set_code + '.txt')
            df_set = pd.read_json(load_path)
            set_rows = df_set['id'].count()
            running_total += set_rows
            if frames:
                print(f"loaded {set_code} : {set_rows} {running_total}")
            else:
                print(f"loaded {set_code} : {set_rows}")
            frames.append(df_set)

        if not frames:
            raise ValueError("set_list must contain at least one set code")

        # Single concat instead of DataFrame.append (removed in pandas 2.0) inside the loop
        return pd.concat(frames,ignore_index=True)

    # get_draft_scores(all_draftaholics_sets)

    df_scores = load_draft_scores_to_pandas(set_list,draft_score_file_path)

    # Some different elo score conversions

    df_scores['elo_log'] = np.log(df_scores['elo']) # Log conversion of elo

    # Score relative to overall list of cards
    # Note - Adding 1 to the numerator and 2 to the denominator prevents any score being precisely 0 or 1
    elo_min_all = df_scores['elo'].min()
    elo_range_all = df_scores['elo'].max() - elo_min_all
    df_scores['elo_relative_all'] = (df_scores['elo'] - elo_min_all + 1) / (elo_range_all + 2)

    # Score relative to all cards in a given set (same +1 / +2 adjustment)
    elo_by_set = df_scores.groupby('set_name')['elo']
    elo_set_min = elo_by_set.transform('min')
    elo_range_set = elo_by_set.transform('max') - elo_set_min
    df_scores['elo_relative_set'] = (df_scores['elo'] - elo_set_min + 1) / (elo_range_set + 2)

    # Drop unwanted columns for analysis
    columns_to_drop = [
        # Items from website that aren't necessary
        'image_small',
        'image',
        'image_large',
        'back_image_small',
        'back_image',
        'exclude_from_p1p1',
        ]

    df_scores = df_scores.drop(columns=columns_to_drop)

    # Create the unique scores as well
    df_scores['back_name'] = df_scores['back_name'].fillna('{none}') # Do this to support combining draft scores later
    df_scores_unique = df_scores.groupby(['name','back_name'])[['elo','elo_log','elo_relative_all','elo_relative_set']].mean().reset_index()
    df_scores_unique = df_scores_unique.rename(columns={'name':'front_name'})
    df_scores_unique['name'] = np.where(
        df_scores_unique.back_name=='{none}',
        df_scores_unique.front_name,
        df_scores_unique.front_name + ' // ' + df_scores_unique.back_name)
    df_scores_unique['name_1'] = df_scores_unique['name']
    df_scores_unique = df_scores_unique.set_index('name_1')

    print ("Loaded draft scores.")

    return df_scores, df_scores_unique

def add_scores_to_features(df_card_features, df_scores_unique):
    """Left-join the unique draft scores onto the card features via `name_1`.

    Every feature row is kept; the scores' own `name` column is discarded so
    the result carries a single `name` column taken from the features.
    """
    merged = df_card_features.merge(df_scores_unique,on='name_1',how='left')
    merged = merged.drop(columns='name_y').rename(columns={'name_x':'name'})
    print("Merged scores and features")
    return merged

# Load up all data for initial work

# Features first, then the stored scores, then the left join of the two
df_card_features = import_card_data_features()
df_card_scores, df_scores_unique = import_card_draft_scores(set_list=all_draftaholics_sets)
df_raw = add_scores_to_features(
    df_card_features=df_card_features,
    df_scores_unique=df_scores_unique,
)





2021-01-06 16:12:49.877 INFO    numexpr.utils: NumExpr defaulting to 8 threads.
  return func(self, *args, **kwargs)


Loaded card features.
loaded AER : 184
loaded MM3 : 249 433
loaded AKH : 249 682
loaded HOU : 184 866
loaded XLN : 259 1125
loaded IMA : 249 1374
loaded UST : 214 1588
loaded RIX : 191 1779
loaded A25 : 249 2028
loaded DOM : 249 2277
loaded BBD : 249 2526
loaded M19 : 293 2819
loaded GRN : 254 3073
loaded UMA : 254 3327
loaded RNA : 254 3581
loaded WAR : 249 3830
loaded MH1 : 254 4084
loaded M20 : 260 4344
loaded ELD : 249 4593
loaded MB1 : 1694 6287
loaded THB : 249 6536
loaded IKO : 259 6795
loaded CUB : 555 7350
loaded M21 : 258 7608
loaded 2XM : 330 7938
loaded AKR : 303 8241
loaded ZNR : 265 8506
loaded KLR : 286 8792
loaded CMR : 361 9153
Loaded draft scores.
Merged scores and features


# Function that combines the contents of several columns into a single string, which lets them be included in search boxes more easily.

In [3]:
def combine_columns_as_str(df,col_list,seperator="|"):
    """Combine the values of several columns into one string per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to combine.
    col_list : list
        Column names whose values are joined, in the order given.
    seperator : str, optional
        Text placed between the values of a row (default "|"). May be empty
        or longer than one character.

    Returns
    -------
    list of str
        One combined string per row of `df`; an empty list when `col_list`
        is empty.
    """
    # tolist() yields plain Python scalars, so str() renders e.g. 1 rather than a numpy repr
    columns = [df[col].tolist() for col in col_list]
    # join() places the separator only between values, so nothing has to be trimmed afterwards
    return [seperator.join(map(str, row_values)) for row_values in zip(*columns)]

In [4]:
# Show the name_1 index next to the name and setCode columns
df_card_features[['name','setCode']].reset_index()

Unnamed: 0,name_1,name,setCode
0,Abundance,Abundance,10E
1,Academy Researchers,Academy Researchers,10E
2,Adarkar Wastes,Adarkar Wastes,10E
3,Afflict,Afflict,10E
4,Aggressive Urge,Aggressive Urge,10E
...,...,...,...
21366,"Zagras, Thief of Heartbeats","Zagras, Thief of Heartbeats",ZNR
21367,"Zareth San, the Trickster","Zareth San, the Trickster",ZNR
21368,Zof Consumption // Zof Bloodbog,Zof Consumption // Zof Bloodbog,ZNR
21369,Zof Consumption // Zof Bloodbog,Zof Consumption // Zof Bloodbog,ZNR


In [12]:
# NOTE(review): this lists every card name without slicing; consider limiting
# the displayed output (e.g. to the first few entries) if only a preview is needed.
df_card_features['name'].tolist()

['Abundance',
 'Academy Researchers',
 'Adarkar Wastes',
 'Afflict',
 'Aggressive Urge',
 'Agonizing Memories',
 'Air Elemental',
 'Ambassador Laquatus',
 'Anaba Bodyguard',
 "Ancestor's Chosen",
 'Angel of Mercy',
 "Angel's Feather",
 'Angelic Blessing',
 'Angelic Chorus',
 'Angelic Wall',
 'Arcane Teachings',
 'Arcanis the Omnipotent',
 'Ascendant Evincar',
 'Assassinate',
 'Aura Graft',
 'Aura of Silence',
 'Avatar of Might',
 'Aven Cloudchaser',
 'Aven Fisher',
 'Aven Windreader',
 'Ballista Squad',
 'Bandage',
 'Battlefield Forge',
 'Beacon of Destruction',
 'Beacon of Immortality',
 'Beacon of Unrest',
 'Benalish Knight',
 'Birds of Paradise',
 'Blanchwood Armor',
 'Blaze',
 'Bloodfire Colossus',
 'Bloodrock Cyclops',
 'Bog Wraith',
 'Bogardan Firefiend',
 'Boomerang',
 'Bottle Gnomes',
 'Brushland',
 'Cancel',
 'Canopy Spider',
 'Caves of Koilos',
 'Cephalid Constable',
 'Chimeric Staff',
 'Cho-Manno, Revolutionary',
 'Chromatic Star',
 'Citanul Flute',
 'Civic Wayfinder',
 'Clo

In [5]:
# Build one 'name;colors;setCode' string per card
output_list = combine_columns_as_str(df_card_features,['name','colors','setCode'],seperator=';')


In [6]:
# Spot-check a single combined entry
output_list[20000]

'Uktabi Kong;G;UNH'

# Function that checks number of cards with multiple elo scores and returns them as a list

In [7]:
# Count how often each card name appears in the scores. The resulting columns are
# labelled explicitly because the default names produced by
# value_counts().reset_index() differ between pandas versions.
df = df_card_scores['name'].value_counts().rename_axis('name').reset_index(name='count')
# Keep only names that were scored more than once
df_target_cards = df[df['count'] > 1]
list_target_cards = combine_columns_as_str(df_target_cards,['name','count'])

In [8]:
# First ten 'name|count' entries, then the number of names with more than one score
print(list_target_cards[0:10])
print(len(list_target_cards))

['Evolving Wilds|8', 'Duress|7', 'Plummet|7', 'Shock|7', 'Negate|7', 'Crash Through|6', 'Battle-Rattle Shaman|6', 'Pacifism|6', 'Opt|6', 'Act of Treason|6']
1894


In [9]:
def cards_with_gt1_elo(df_card_scores):
    """List the cards that have more than one elo score.

    Parameters
    ----------
    df_card_scores : pandas.DataFrame
        Score rows with a `name` column; each row counts as one score.

    Returns
    -------
    list of str
        One 'name|count' entry per card name that occurs more than once,
        ordered by descending count (the order produced by value_counts()).
    """
    name_counts = df_card_scores['name'].value_counts()
    repeated = name_counts[name_counts > 1]
    # Iterating the counts directly avoids depending on the column names that
    # value_counts().reset_index() produces, which changed in pandas 2.0.
    return [f"{name}|{count}" for name, count in repeated.items()]

In [10]:
# Run the function version on the full score table
test_list = cards_with_gt1_elo(df_card_scores)

In [11]:
# First ten entries and the total number of entries returned by cards_with_gt1_elo
print(test_list[0:10])
print(len(test_list))

['Evolving Wilds|8', 'Duress|7', 'Plummet|7', 'Shock|7', 'Negate|7', 'Crash Through|6', 'Battle-Rattle Shaman|6', 'Pacifism|6', 'Opt|6', 'Act of Treason|6']
1894


In [231]:
def column_string_match(df,match_string):
    """Return the column names of df that contain match_string, ignoring case.

    Names are returned in the frame's column order.
    """
    match_list = []
    for column in df.columns.tolist():
        if match_string.upper() in column.upper():
            match_list.append(column)
    return match_list

In [238]:
# Collect the score columns whose name contains 'elo'
test_list = column_string_match(df_card_scores,'elo')

In [239]:
# Display the matched column names
test_list

['elo', 'elo_log', 'elo_relative_all', 'elo_relative_set']

In [17]:
# Scratch check of str.upper(), the method column_string_match relies on for case-insensitive matching
s = 'abc'
s.upper()

'ABC'

# Generate scores for training, validation and test

In [44]:

# Same set codes, in the same order, as the list defined in the first cell
all_draftaholics_sets = (
    'AER MM3 AKH HOU XLN '
    'IMA UST RIX A25 DOM '
    'BBD M19 GRN UMA RNA '
    'WAR MH1 M20 ELD MB1 '
    'THB IKO CUB M21 2XM '
    'AKR ZNR KLR CMR'
).split()

# Full set name (as it appears in the score data's set_name column) -> set code
dict_setname_code = dict([
    ('Mystery Booster', 'MB1'),
    ('MTG Arena Draft Cube', 'CUB'),
    ('Commander Legends', 'CMR'),
    ('Double Masters', '2XM'),
    ('Amonkhet Remastered', 'AKR'),
    ('Core Set 2019', 'M19'),
    ('Kaladesh Remastered', 'KLR'),
    ('Zendikar Rising', 'ZNR'),
    ('Core Set 2020', 'M20'),
    ('Ikoria: Lair of Behemoths', 'IKO'),
    ('Ixalan', 'XLN'),
    ('Core Set 2021', 'M21'),
    ('Ravnica Allegiance', 'RNA'),
    ('Ultimate Masters', 'UMA'),
    ('Guilds of Ravnica', 'GRN'),
    ('Modern Horizons', 'MH1'),
    ('Masters 25', 'A25'),
    ('Amonkhet', 'AKH'),
    ('Battlebond', 'BBD'),
    ('Throne of Eldraine', 'ELD'),
    ('Theros Beyond Death', 'THB'),
    ('Dominaria', 'DOM'),
    ('Iconic Masters', 'IMA'),
    ('War of the Spark', 'WAR'),
    ('Modern Masters 2017 Edition', 'MM3'),
    ('Unstable', 'UST'),
    ('Rivals of Ixalan', 'RIX'),
    ('Hour of Devastation', 'HOU'),
    ('Aether Revolt', 'AER'),
])

# Set code -> full set name. The names are spelled exactly as the keys of
# dict_setname_code so the two lookups round-trip.
dict_code_setname={
    'MB1': 'Mystery Booster',
    'CUB': 'MTG Arena Draft Cube',
    'CMR': 'Commander Legends',
    '2XM': 'Double Masters',
    'AKR': 'Amonkhet Remastered',
    'M19': 'Core Set 2019',
    'KLR': 'Kaladesh Remastered',
    'ZNR': 'Zendikar Rising',
    'M20': 'Core Set 2020',
    'IKO': 'Ikoria: Lair of Behemoths',
    'XLN': 'Ixalan',
    'M21': 'Core Set 2021',
    'RNA': 'Ravnica Allegiance',
    'UMA': 'Ultimate Masters',
    'GRN': 'Guilds of Ravnica',
    'MH1': 'Modern Horizons',
    'A25': 'Masters 25',
    'AKH': 'Amonkhet',
    'BBD': 'Battlebond',
    'ELD': 'Throne of Eldraine',
    'THB': 'Theros Beyond Death',
    'DOM': 'Dominaria',
    'IMA': 'Iconic Masters',
    'WAR': 'War of the Spark',
    'MM3': 'Modern Masters 2017 Edition',
    'UST': 'Unstable',
    'RIX': 'Rivals of Ixalan',
    'HOU': 'Hour of Devastation',
    'AER': 'Aether Revolt',
    }

In [240]:

def model_generate_training_scores(df_card_scores,validation_perc=0,testing_set_list=['ZNR'],random_state=None,setname_code_map=None):
    """Split card scores into training, (optional) validation and test sets.

    Rows whose set (looked up from `set_name`) is in `testing_set_list` form
    the test set. Of the remaining rows, a `validation_perc` fraction is
    sampled as the validation set and the rest is the training set.
    `elo_relative_all` is recalculated separately inside each returned set
    and placed as its last column. The input frame is not modified.

    Parameters
    ----------
    df_card_scores : pandas.DataFrame
        Scores with at least `id`, `set_name`, `elo` and `elo_relative_all`.
    validation_perc : float, optional
        Fraction (0 to 1) of the non-test rows to sample for validation.
    testing_set_list : list of str, optional
        Set codes held out as test data. The default list is never modified.
    random_state : optional
        Passed to `DataFrame.sample` for a reproducible validation split.
    setname_code_map : dict, optional
        Mapping of set name to set code. Defaults to the module-level
        `dict_setname_code`.

    Returns
    -------
    (training, testing) when `validation_perc` is 0, otherwise
    (training, validation, testing).

    Raises
    ------
    ValueError
        If `validation_perc` is outside the range 0 to 1.
    KeyError
        If a `set_name` value is missing from the name-to-code mapping.
    """
    if not 0 <= validation_perc <= 1:
        raise ValueError("validation_perc must be between 0 and 1")

    if setname_code_map is None:
        setname_code_map = dict_setname_code

    def with_relative_all(scores):
        # Rescale elo within this subset; +1 / +2 keeps the result strictly between 0 and 1
        scores = scores.copy()
        elo_min = scores['elo'].min()
        elo_range = scores['elo'].max() - elo_min
        scores['elo_relative_all'] = (scores['elo'] - elo_min + 1) / (elo_range + 2)
        return scores

    # Drop all references to elo_relative_all - they need to be recalculated for model training
    df = df_card_scores.drop('elo_relative_all',axis=1)

    # Separate out testing scores
    test_flag = df['set_name'].map(lambda set_name: setname_code_map[set_name] in testing_set_list).astype(bool)
    testing_scores = with_relative_all(df[test_flag])

    # Exclude test data
    remaining = df[~test_flag]

    if validation_perc == 0:
        return with_relative_all(remaining), testing_scores

    # Split scores: sampled rows become validation, every other id stays in training
    val_score_raw = remaining.sample(frac=validation_perc,random_state=random_state)
    train_score_raw = remaining[~remaining['id'].isin(val_score_raw['id'])]

    return with_relative_all(train_score_raw), with_relative_all(val_score_raw), testing_scores

# Function to generate unique scores within training / test / validation data
def create_unique_scores(df_input):
    """Average the elo columns per unique card.

    Rows are grouped by (name, back_name) and the columns `elo`, `elo_log`,
    `elo_relative_all` and `elo_relative_set` are averaged. The input frame
    is left untouched.

    Returns
    -------
    pandas.DataFrame
        Indexed by `name_1`, with columns `front_name`, `back_name`, the four
        averaged elo columns and `name`, where `name` is 'front // back' for
        cards that have a back name and the front name otherwise.
    """
    # Fill on a new frame: a missing back_name would otherwise be dropped by groupby,
    # and filling in place would silently modify the caller's data.
    scores = df_input.assign(back_name=df_input['back_name'].fillna('{none}')) # Do this to support combining draft scores later
    df_scores_unique = scores.groupby(['name','back_name'])[['elo','elo_log','elo_relative_all','elo_relative_set']].mean().reset_index()
    df_scores_unique = df_scores_unique.rename(columns={'name':'front_name'})
    df_scores_unique['name'] = np.where(
        df_scores_unique.back_name=='{none}',
        df_scores_unique.front_name,
        df_scores_unique.front_name + ' // ' + df_scores_unique.back_name)
    df_scores_unique['name_1'] = df_scores_unique['name']
    return df_scores_unique.set_index('name_1')

# Function to link training / test / validation data to scores
def link_scores_to_features(features, scores):
    """Inner-join scores onto features via `name_1`.

    Only rows present in both frames are kept; the scores' own `name` column
    is discarded so the result carries a single `name` column taken from
    the features.
    """
    linked = features.merge(scores,on='name_1',how='inner')
    return linked.drop(columns='name_y').rename(columns={'name_x':'name'})

def model_cleanup_columns(scores):
    """Return a copy of `scores` without the columns that are never used in the model build.

    Raises KeyError if any of the listed columns is absent from the frame.
    """
    unused_columns = [
        'index','id','flavorText','hand','hasAlternativeDeckLimit','isOnlineOnly','isOversized',
        'isPromo','isReprint','isReserved','isStarter','isTextless','keywords','layout',
        'leadershipSkills','life','manaCost','multiverseId','number','otherFaceIds','printings',
        'setCode','side','uuid','variations','watermark','name_row',
        ]
    return scores.copy().drop(columns=unused_columns)

def model_gen_x_y(scores,y_target,cols_to_drop=[],cols_to_keep=[]):
    """Split a scored feature frame into model inputs x and target y.

    Parameters
    ----------
    scores : pandas.DataFrame
        Features and scores in one frame; it is copied, never modified.
    y_target : str
        Column returned as y.
    cols_to_drop : list, optional
        Extra columns removed from x. The default list is only read.
    cols_to_keep : list, optional
        When non-empty, x is restricted to exactly these columns. This is
        applied after all drops. The default list is only read.

    Returns
    -------
    tuple
        (x, y): the remaining feature columns and the target column.
    """
    frame = scores.copy()
    target = frame[y_target]

    # Alternate elo definitions are always removed so none of them can leak
    # into x. column_string_match is defined elsewhere in this file --
    # presumably it returns the names of the columns matching 'elo'; confirm.
    removals = list(column_string_match(frame,'elo'))

    # Caller-specified exclusions are removed after the elo columns
    if cols_to_drop:
        removals.extend(cols_to_drop)

    for col in removals:
        frame = frame.drop(columns=col)

    # An explicit keep-list overrides whatever is left
    if cols_to_keep:
        frame = frame[cols_to_keep].copy()

    return frame.copy(), target

In [94]:
# Flag the rows whose set code is held out for testing, then preview the rest
testing_set_list = ['ZNR', 'AER', 'HOU']
a = df_card_scores.set_name.apply(lambda set_name: dict_setname_code[set_name] in testing_set_list)
df_card_scores[~a].head()

Unnamed: 0,id,set_id,set_name,name,back_name,color,elo,rank,rarity,elo_log,elo_relative_all,elo_relative_set
184,1117,5,Modern Masters 2017 Edition,Liliana of the Veil,{none},black,2149,1,mythic,7.672758,0.847644,0.998963
185,1143,5,Modern Masters 2017 Edition,Olivia Voldaren,{none},gold,2136,2,mythic,7.66669,0.839251,0.985477
186,1194,5,Modern Masters 2017 Edition,Snapcaster Mage,{none},blue,2129,3,mythic,7.663408,0.834732,0.978216
187,1026,5,Modern Masters 2017 Edition,Bonfire of the Damned,{none},red,2096,4,mythic,7.647786,0.813428,0.943983
188,1049,5,Modern Masters 2017 Edition,Damnation,{none},black,2064,5,rare,7.632401,0.79277,0.910788


In [116]:
df_card_scores[['id']].nunique()

id    9153
dtype: int64

In [246]:
# Split the scores into training / validation / testing frames (a, b, c in that order),
# then spot-check a card that is scored in several sets
a,b,c = generate_training_scores(df_card_scores,validation_perc=0.1,random_state=854)
a[a.name=='Duress']

Unnamed: 0,id,set_id,set_name,name,back_name,color,elo,rank,rarity,elo_log,elo_relative_set,elo_relative_all
1096,1776,8,Ixalan,Duress,{none},black,1308,231,common,7.176255,0.257403,0.304713
1364,1518,7,Iconic Masters,Duress,{none},black,1258,240,common,7.137278,0.09481,0.272434
2761,3235,14,Core Set 2019,Duress,{none},black,1418,236,common,7.257003,0.271095,0.375726
4321,4326,18,Core Set 2020,Duress,{none},black,1313,238,common,7.18007,0.175579,0.307941
5883,5329,20,Mystery Booster,Duress,{none},black,1546,1291,common,7.343426,0.420824,0.45836
7197,7193,23,MTG Arena Draft Cube,Duress,{none},black,1482,403,common,7.301148,0.389463,0.417043
7595,7678,24,Core Set 2021,Duress,{none},black,1183,246,common,7.075809,0.140075,0.224015


In [196]:
a.nunique()

id                  7999
set_id                28
set_name              28
name                6053
back_name             17
color                  7
elo                 1073
rank                1574
rarity                 4
elo_log             1073
elo_relative_set    5630
elo_relative_all    1073
dtype: int64

In [247]:
# Collapse each split to one averaged row per card (rebinds a, b, c),
# then confirm the multi-set card is now a single row
a,b,c = create_unique_scores(a),create_unique_scores(b),create_unique_scores(c)
a[a.name=='Duress']

Unnamed: 0_level_0,front_name,back_name,elo,elo_log,elo_relative_all,elo_relative_set,name
name_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Duress,Duress,{none},1358.285714,7.210141,0.337176,0.249893,Duress


In [198]:
# Distinct values per column of the collapsed training scores
a.nunique()

front_name          6053
back_name             17
elo                 1539
elo_log             2527
elo_relative_all    1727
elo_relative_set    5130
name                6053
dtype: int64

In [142]:
df_card_features.head()

Unnamed: 0_level_0,index,id,colorIdentity,colorIndicator,colors,convertedManaCost,faceConvertedManaCost,faceName,flavorText,hand,...,toughness_clean,power_plus_toughness,keyword_count,effect_count,efficiency_power,efficiency_toughness,efficiency_p_plus_t,efficiency_keywords,efficiency_effects,efficiency_max
name_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abundance,0,1,G,,G,4.0,,,,,...,0,0.0,0,1,0.0,0.0,0.0,0.0,0.25,0.25
Academy Researchers,1,2,U,,U,3.0,,,They brandish their latest theories as warrior...,,...,2,4.0,0,1,0.666667,0.666667,1.333333,0.0,0.333333,1.333333
Adarkar Wastes,2,3,"U,W",,,0.0,,,,,...,0,0.0,0,2,0.0,0.0,0.0,0.0,0.0,0.0
Afflict,3,4,B,,B,3.0,,,"One rarely notices a heartbeat, save when it i...",,...,0,0.0,0,1,0.0,0.0,0.0,0.0,0.333333,0.333333
Aggressive Urge,4,5,G,,G,2.0,,,"The power of the wild, concentrated in a singl...",,...,0,0.0,0,1,0.0,0.0,0.0,0.0,0.5,0.5


In [170]:
test[test.name=='Duress']

Unnamed: 0_level_0,front_name,back_name,elo,elo_log,elo_relative_all,elo_relative_set,name
name_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Duress,Duress,{none},1358.285714,7.210141,0.337176,0.249893,Duress


In [248]:
# Inner-join every score split onto the card feature table (rebinds a, b, c)
a,b,c = (link_scores_to_features(df_card_features,split) for split in (a,b,c))

In [221]:
print(a.shape[0])
print(b.shape[0])
print(c.shape[0]) # Ok that we have extra rows given how we're handling double faced cards (i.e. treating them separately for model purposes with a common score)

6128
865
301


In [225]:
a.head()

Unnamed: 0_level_0,index,id,colorIdentity,colorIndicator,colors,convertedManaCost,faceConvertedManaCost,faceName,flavorText,hand,...,efficiency_p_plus_t,efficiency_keywords,efficiency_effects,efficiency_max,front_name,back_name,elo,elo_log,elo_relative_all,elo_relative_set
name_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aggressive Urge,4,5,G,,G,2.0,,,"The power of the wild, concentrated in a singl...",,...,0.0,0.0,0.5,0.5,Aggressive Urge,{none},1499.5,7.31225,0.428341,0.397309
Air Elemental,6,7,U,,U,5.0,,,"""The East Wind, an interloper in the dominions...",,...,1.6,0.2,0.0,1.6,Air Elemental,{none},1787.0,7.487163,0.613944,0.602364
Angel of Mercy,13,14,W,,W,5.0,,,Every tear shed is a drop of immortality.,,...,1.2,0.2,0.2,1.2,Angel of Mercy,{none},1522.0,7.32461,0.442866,0.434381
Angelic Chorus,18,19,W,,W,5.0,,,The harmony of the glorious is a dirge to the ...,,...,0.0,0.0,0.2,0.2,Angelic Chorus,{none},1590.0,7.371489,0.486766,0.469432
Assassinate,26,27,B,,B,3.0,,,"""This is how wars are won—not with armies of s...",,...,0.0,0.0,0.333333,0.333333,Assassinate,{none},1649.0,7.407924,0.524855,0.533843


In [249]:
# Drop the columns model_cleanup_columns lists as never used in the model build (rebinds a, b, c)
a,b,c = model_cleanup_columns(a),model_cleanup_columns(b),model_cleanup_columns(c)

In [250]:
alist = []
if alist:
    print("list with content")
else:
    print("empty_list")

empty_list


In [251]:
# Build the (features, target) pair for each split, with the averaged elo column as the target
ax,ay = model_gen_x_y(a,'elo')
bx,by = model_gen_x_y(b,'elo')
cx,cy = model_gen_x_y(c,'elo')

In [256]:
print(ax.shape)
print(ay.shape)

(6128, 306)
(6128,)


In [260]:
ay

name_1
Aggressive Urge          1499.5
Air Elemental            1787.0
Angel of Mercy           1522.0
Angelic Chorus           1590.0
Assassinate              1649.0
                          ...  
Wakening Sun's Avatar    1773.0
Waker of the Wilds       1936.0
Wanted Scoundrels        1673.0
Watertrap Weaver         1743.0
Wily Goblin              1311.0
Name: elo, Length: 6128, dtype: float64

# Find a way to reverse and get ordinal encoder settings

In [6]:
dfw = df_raw[['rarity','elo']].copy()

In [7]:
dfw.elo.fillna(0)

name_1
Abundance                             0.0
Academy Researchers                   0.0
Adarkar Wastes                        0.0
Afflict                               0.0
Aggressive Urge                    1499.5
                                    ...  
Zagras, Thief of Heartbeats        2227.0
Zareth San, the Trickster          2280.0
Zof Consumption // Zof Bloodbog    1577.0
Zof Consumption // Zof Bloodbog    1577.0
Zulaport Duelist                   1240.0
Name: elo, Length: 21371, dtype: float64

In [9]:
oenc = OrdinalEncoder()

In [10]:
oenc.fit(dfw)

OrdinalEncoder(cols=['rarity'],
               mapping=[{'col': 'rarity', 'data_type': dtype('O'),
                         'mapping': rare        1
uncommon    2
common      3
mythic      4
NaN        -2
dtype: int64}])

In [16]:
oenc.mapping[0]

{'col': 'rarity',
 'mapping': rare        1
 uncommon    2
 common      3
 mythic      4
 NaN        -2
 dtype: int64,
 'data_type': dtype('O')}

In [25]:
# Pull the fitted label -> integer Series for the rarity column out of the encoder's mapping list
rarity_mapping = [i for i in oenc.mapping if i['col']=='rarity'][0]['mapping']

In [27]:
rarity_mapping.index.tolist()

['rare', 'uncommon', 'common', 'mythic', nan]

In [28]:
rarity_mapping.tolist()

[1, 2, 3, 4, -2]

In [61]:
# Invert the encoder mapping so an encoded integer looks up its rarity label
key_list = rarity_mapping.index.tolist()
value_list = rarity_mapping.tolist()
rarity_dict = dict(zip(value_list, key_list))


In [62]:
rarity_dict

{1: 'rare', 2: 'uncommon', 3: 'common', 4: 'mythic', -2: nan}

In [68]:
a = list(rarity_dict.keys())
print(a)

[1, 2, 3, 4, -2]


In [71]:
a.sort()
print(a)

[-2, 1, 2, 3, 4]


In [72]:
a


[-2, 1, 2, 3, 4]