In [1]:
# All imports
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import nltk
from nltk.corpus import stopwords
import re
nltk.download('stopwords')
import numpy as np
import matplotlib.pyplot as plt
import urllib.request
import joblib
import requests
from PIL import Image

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\monic\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load previously collected data
df = pd.read_csv("DataCleansed.csv",delimiter="|", na_filter = False) #to not treat N/A as Nan
df_url = pd.read_csv("ImageURLs.csv",delimiter="|")

In [4]:
df.head()

Unnamed: 0,Color,Subtype,Text,Type,Loyalty,Power,Toughness,Keyword,Name,ManaValue
0,['White'],['Plains'],({T}: Add {W}.),['Land'],,,,['N/A'],Plains,0.0
1,['Blue'],['Island'],({T}: Add {U}.),['Land'],,,,['N/A'],Island,0.0
2,['Black'],['Swamp'],({T}: Add {B}.),['Land'],,,,['N/A'],Swamp,0.0
3,['Red'],['Mountain'],({T}: Add {R}.),['Land'],,,,['N/A'],Mountain,0.0
4,['Green'],['Forest'],({T}: Add {G}.),['Land'],,,,['N/A'],Forest,0.0


In [5]:
df_url.head()

Unnamed: 0,Name,scryfallId,URL
0,Plains,7ee52536-8cfa-482b-874e-094c0a081894,https://c1.scryfall.com/file/scryfall-cards/la...
1,Island,161aaceb-d0bb-48c5-8bd4-bff321a94b2e,https://c1.scryfall.com/file/scryfall-cards/la...
2,Swamp,4df49e68-cadf-4196-a3f4-ae38579edaeb,https://c1.scryfall.com/file/scryfall-cards/la...
3,Mountain,890037ae-c366-4769-b7f7-7185a1bebca1,https://c1.scryfall.com/file/scryfall-cards/la...
4,Forest,86dae285-a59c-426c-b6cd-3683abea75a3,https://c1.scryfall.com/file/scryfall-cards/la...


In [6]:
# We have less picture urls that we have cards
print(len(df),len(df_url))

23924 23919


In [3]:
# Let's merge both dfs
df = pd.merge(df,df_url,on=["Name"])

In [8]:
df.head()

Unnamed: 0,Color,Subtype,Text,Type,Loyalty,Power,Toughness,Keyword,Name,ManaValue,scryfallId,URL
0,['White'],['Plains'],({T}: Add {W}.),['Land'],,,,['N/A'],Plains,0.0,7ee52536-8cfa-482b-874e-094c0a081894,https://c1.scryfall.com/file/scryfall-cards/la...
1,['Blue'],['Island'],({T}: Add {U}.),['Land'],,,,['N/A'],Island,0.0,161aaceb-d0bb-48c5-8bd4-bff321a94b2e,https://c1.scryfall.com/file/scryfall-cards/la...
2,['Black'],['Swamp'],({T}: Add {B}.),['Land'],,,,['N/A'],Swamp,0.0,4df49e68-cadf-4196-a3f4-ae38579edaeb,https://c1.scryfall.com/file/scryfall-cards/la...
3,['Red'],['Mountain'],({T}: Add {R}.),['Land'],,,,['N/A'],Mountain,0.0,890037ae-c366-4769-b7f7-7185a1bebca1,https://c1.scryfall.com/file/scryfall-cards/la...
4,['Green'],['Forest'],({T}: Add {G}.),['Land'],,,,['N/A'],Forest,0.0,86dae285-a59c-426c-b6cd-3683abea75a3,https://c1.scryfall.com/file/scryfall-cards/la...


In [9]:
# We got rid of the cards for which we did not have picture urls
len(df)

23919

In [4]:
# Transform the columns ['Color','Subtype','Type','Keyword'] from a string that looks like a list to an actual list ('['White']'>>[White])

def col_to_list(col):
    '''
    Transforms all cells in a dataframe column from a string format that looks like a list to an actual list (ex: '['White']'>>[White]).

            Parameters:
                    col (str): Name of the dataframe column.

    '''
    df[col] = df[col].apply(lambda x: x.strip("']['").split("', '"))

for col in ['Color','Subtype','Type','Keyword']:
    col_to_list(col)

In [11]:
df.head()

Unnamed: 0,Color,Subtype,Text,Type,Loyalty,Power,Toughness,Keyword,Name,ManaValue,scryfallId,URL
0,[White],[Plains],({T}: Add {W}.),[Land],,,,[N/A],Plains,0.0,7ee52536-8cfa-482b-874e-094c0a081894,https://c1.scryfall.com/file/scryfall-cards/la...
1,[Blue],[Island],({T}: Add {U}.),[Land],,,,[N/A],Island,0.0,161aaceb-d0bb-48c5-8bd4-bff321a94b2e,https://c1.scryfall.com/file/scryfall-cards/la...
2,[Black],[Swamp],({T}: Add {B}.),[Land],,,,[N/A],Swamp,0.0,4df49e68-cadf-4196-a3f4-ae38579edaeb,https://c1.scryfall.com/file/scryfall-cards/la...
3,[Red],[Mountain],({T}: Add {R}.),[Land],,,,[N/A],Mountain,0.0,890037ae-c366-4769-b7f7-7185a1bebca1,https://c1.scryfall.com/file/scryfall-cards/la...
4,[Green],[Forest],({T}: Add {G}.),[Land],,,,[N/A],Forest,0.0,86dae285-a59c-426c-b6cd-3683abea75a3,https://c1.scryfall.com/file/scryfall-cards/la...


In [5]:
# Create a column with all categorial variables that aren't the column 'Text'

# Add all categorial columns that are type = list
df['Categorical']=df['Subtype']+df['Type']+df['Keyword']

# Convert ['Categorical'] column from list type to string
df['Categorical']=[' '.join(map(str, l)) for l in df['Categorical']]

# Drop ['Subtype'],['Type'],['Keyword'] columns (since they are already in ['Categorical'])
df.drop(['Subtype','Type','Keyword'],axis=1,inplace=True)

# Add the rest of the categorial columns to ['Categorical'] (treat ManaValue as str/categorical because it's a discrete number)
df['Categorical']= df['Categorical']+' '+df['Power']+' '+df['Toughness']+' '+df['Loyalty']+' '+df['ManaValue'].astype(int).astype(str) #float>int to remove .0's

# Drop the columns whose data we just added to ['Categorical']
df.drop(['Power','Toughness','Loyalty','ManaValue'],axis=1,inplace=True)

In [13]:
df.head()

Unnamed: 0,Color,Text,Name,scryfallId,URL,Categorical
0,[White],({T}: Add {W}.),Plains,7ee52536-8cfa-482b-874e-094c0a081894,https://c1.scryfall.com/file/scryfall-cards/la...,Plains Land N/A N/A N/A N/A 0
1,[Blue],({T}: Add {U}.),Island,161aaceb-d0bb-48c5-8bd4-bff321a94b2e,https://c1.scryfall.com/file/scryfall-cards/la...,Island Land N/A N/A N/A N/A 0
2,[Black],({T}: Add {B}.),Swamp,4df49e68-cadf-4196-a3f4-ae38579edaeb,https://c1.scryfall.com/file/scryfall-cards/la...,Swamp Land N/A N/A N/A N/A 0
3,[Red],({T}: Add {R}.),Mountain,890037ae-c366-4769-b7f7-7185a1bebca1,https://c1.scryfall.com/file/scryfall-cards/la...,Mountain Land N/A N/A N/A N/A 0
4,[Green],({T}: Add {G}.),Forest,86dae285-a59c-426c-b6cd-3683abea75a3,https://c1.scryfall.com/file/scryfall-cards/la...,Forest Land N/A N/A N/A N/A 0


In [6]:
# Now let's look at our sample size for each color combination

df['color_sum'] = df['Color'].str.len()
df['color_sum'].value_counts()

1    20197
2     3064
3      573
5       73
4       12
Name: color_sum, dtype: int64

In [7]:
# The sample size is very disproportionate: there are a lot more cards with a single color, that with 2+ colors. 
# Therefore, the best practice is to drop all multi-colored cards
df = df[df['color_sum']==1] 

# We don't need the color_sum column anymore
df.drop('color_sum',axis=1,inplace=True)

In [16]:
# We have 20197 unique cards w/ a single color
len(df)

20197

In [17]:
# Since now we only have 1 color per card, it doesn't make sense for column ['Color'] to be of list type anymore
df['Color'].explode().unique()

array(['White', 'Blue', 'Black', 'Red', 'Green', 'Colorless'],
      dtype=object)

In [8]:
# Let's convert it back to string:
df['Color'] = ['|'.join(map(str, l)) for l in df['Color']]

In [19]:
df['Color'].unique()

array(['White', 'Blue', 'Black', 'Red', 'Green', 'Colorless'],
      dtype=object)

In [9]:
# We won't be needing scryfallId
df.drop(['scryfallId'],axis=1,inplace=True)

In [21]:
df.head()

Unnamed: 0,Color,Text,Name,URL,Categorical
0,White,({T}: Add {W}.),Plains,https://c1.scryfall.com/file/scryfall-cards/la...,Plains Land N/A N/A N/A N/A 0
1,Blue,({T}: Add {U}.),Island,https://c1.scryfall.com/file/scryfall-cards/la...,Island Land N/A N/A N/A N/A 0
2,Black,({T}: Add {B}.),Swamp,https://c1.scryfall.com/file/scryfall-cards/la...,Swamp Land N/A N/A N/A N/A 0
3,Red,({T}: Add {R}.),Mountain,https://c1.scryfall.com/file/scryfall-cards/la...,Mountain Land N/A N/A N/A N/A 0
4,Green,({T}: Add {G}.),Forest,https://c1.scryfall.com/file/scryfall-cards/la...,Forest Land N/A N/A N/A N/A 0


In [10]:
def removeChars(txt):
    '''
    Removes unwanted characters from a string.

            Parameters:
                    txt (srt): String from which we want to remove characters.

            Returns:
                    cleantxt (str): Clean string (after character removal).
    '''
    cleantxt = re.sub(r'\{.*\}', "", txt) # Remove all symbols ({X}) from the card text
    cleantxt = re.sub('[{}?|!|\'|"|#().\n:,;]', "",cleantxt) # Remove unwanted special characters
    return cleantxt

In [11]:
def removeStopWords(txt):
    '''
    Removes unwanted stop words from a string.

            Parameters:
                    txt (srt): String from which we want to remove stop words.

            Returns:
                    cleantxt (str): Clean string (after stop word removal).
    '''
    stop_words = set(stopwords.words('english'))
    stop_words.update(['N/A','may','also','across','among','beside','however','yet','within','None'])
    re_stop_words = re.compile(r"\b(" + "|".join(stop_words) + ")\\W", re.I)
    cleantxt = re_stop_words.sub("", txt)
    return cleantxt

In [12]:
#Clean txt 
def txt_clean(df,old_col,new_col):
    df[new_col] = df[old_col].str.lower()
    df[new_col] = df[new_col].apply(removeStopWords)
    df[new_col] = df[new_col].apply(removeChars)

In [13]:
#Clean columns: Text, Name, Categorical
txt_clean(df,'Name','name')
txt_clean(df,'Text','text')
txt_clean(df,'Categorical','categorical')

In [14]:
df['Text'][23917]

'{T}: Add {U}.\n{T}: Target creature loses banding and all "bands with other" abilities until end of turn. Activate only during any upkeep step.'

In [15]:
df['text'][23917]

' target creature loses banding bands  abilities end turn activate upkeep step'

In [16]:
# Drop the columns we dont need anymore
df.drop(['Categorical','Text'],axis=1,inplace=True)

In [29]:
df.head()

Unnamed: 0,Color,Name,URL,name,text,categorical
0,White,Plains,https://c1.scryfall.com/file/scryfall-cards/la...,plains,,plains land 0
1,Blue,Island,https://c1.scryfall.com/file/scryfall-cards/la...,island,,island land 0
2,Black,Swamp,https://c1.scryfall.com/file/scryfall-cards/la...,swamp,,swamp land 0
3,Red,Mountain,https://c1.scryfall.com/file/scryfall-cards/la...,mountain,,mountain land 0
4,Green,Forest,https://c1.scryfall.com/file/scryfall-cards/la...,forest,,forest land 0


In [31]:
# Save ml-ready text data 
df.to_csv("DataReady.csv","|",index=False)

Preparing the image data

In [38]:
def get_pixel_data(df,col_url,col_pk):
    '''
    From the col_url column in df, open every URL as an image, crop and resize it, then convert it to a numpy array and append this data to a dictionary.
    In the end, save the dictionary as a .pkl file.

            Parameters:
                    df (pandas dataframe): Dataframe with card name and image url data.
                    col_url (srt): Name of the url column in dataframe df.
                    col_pk (srt): Name of the primary key column of dataframe df.

    '''
    dict_all = dict()
    dict_all[col_pk] = []
    dict_all['ImageDataCrop'] = []

    for idx, url in enumerate(df['URL']):
        dict_card = dict()
      
        img = Image.open(requests.get(url, stream=True).raw)
        img_crop = img.crop((41, 90, 628, 535))
        img_crop = img_crop.resize((150, 150))
        pix = np.array(img)
        pix_crop = np.array(img_crop)

        dict_all[col_pk].append(df[col_pk][idx])
        dict_all['ImageDataCrop'].append(pix_crop)
    
    joblib.dump(dict_all, 'ImageDataCrop.pkl')

# Call the function: this will take aprox 130min to run and creates a 1.27GB file
get_pixel_data(df,'URL','Name')