In [343]:
import nltk
import requests
import time
from glob import glob
import re
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score

%matplotlib inline

from bs4 import BeautifulSoup
from nltk.corpus import stopwords

In [7]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/luizreis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Downloading Data

In [274]:
csv = pd.read_csv("./conjunto-treinamento-classificador.csv")

## Feature Extraction

In [275]:
def encode_url_to_file(url): 
    return re.sub("\/", "-", url)
def get_html_text(url): 
    page = requests.get(url)
    with open("./train-set/" + encode_url_to_file(url) + ".html", 'w' , encoding='utf-8') as outfile:
        outfile.write(page.text)
    soup = BeautifulSoup(page.content)
    return soup.get_text()

def get_text_from_link(link): 
    html = open(link, "r")
    soup = BeautifulSoup(html)
    return soup.get_text()

In [276]:
en_stopwords = set(stopwords.words("english"))

def extract_features(text):     
    data = re.sub("[^$0-9a-zA-Z]", " ", text).lower()
    words = data.split() 
    return [w for w in words]#if not w in en_stopwords]

In [278]:
bag = csv["urls"].apply(get_html_text)

In [None]:
links = sorted(glob('./train-set/*.html'), key=os.path.getmtime)
bag = [get_text_from_link(url) for url in links]

In [279]:
bag = [' '.join(extract_features(b)) for b in bag]

## Preprocessing

## Pipeline

In [355]:
def train(clf, data, target):
    pipe_clf = Pipeline([
        ('vect', CountVectorizer(analyzer = "word",
                                 tokenizer = None,    
                                 preprocessor = None,
                                 stop_words = None)),
        ('tfidf', TfidfTransformer()),
        ('clf', clf),
    ])
    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'vect__max_features': [500, 1000, 5000],
        'tfidf__use_idf': (True, False),
        #'clf__alpha': (1e-2, 1e-3),
    }
    
    scoring = {'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score)}
    gs_clf = GridSearchCV(pipe_clf, parameters, cv=10, iid=False, n_jobs=-1)
    fit = gs_clf.fit(data, target)
    return fit
    

In [356]:
clfs = [SVC(), MultinomialNB(), SGDClassifier(), LogisticRegression(), tree.DecisionTreeClassifier(),
        MLPClassifier(max_iter = 100)]

In [360]:
import time
def metrics(clf):
    inicio = time.time()
    clf = train(clf, bag, csv["target"].tolist())
    fim = time.time()
    total_time = fim - inicio
    
    return {'total_time': total_time, 'score': clf.best_score_}

In [361]:
for clf in clfs:
    print(metrics(clf))



{'total_time': 18.841808080673218, 'score': 0.75}
{'total_time': 17.8500018119812, 'score': 0.7428571428571428}
{'total_time': 17.605117797851562, 'score': 0.7714285714285715}




{'total_time': 18.45519208908081, 'score': 0.7785714285714286}
{'total_time': 22.698601007461548, 'score': 0.7714285714285715}
{'total_time': 198.4458749294281, 'score': 0.8142857142857143}




## Test with Baseline

In [287]:
best_clf = train(SGDClassifier(), bag, csv["target"].tolist())

{'mean_fit_time': array([0.30427692, 0.71721034, 0.32515771, 0.85170457, 0.37788184,
       0.9419024 , 0.30712836, 0.82965112, 0.36275465, 0.77472875,
       0.32078393, 1.00573723]), 'std_fit_time': array([0.01812821, 0.0374786 , 0.02662867, 0.12338611, 0.06040145,
       0.13529152, 0.01985099, 0.09328169, 0.08114682, 0.03887426,
       0.03941748, 0.30012678]), 'mean_score_time': array([0.06023498, 0.12078373, 0.06855712, 0.14945488, 0.08552215,
       0.14219847, 0.07266779, 0.13606987, 0.06990435, 0.13034248,
       0.06929793, 0.158882  ]), 'std_score_time': array([0.02917191, 0.06339383, 0.03133872, 0.0671659 , 0.05027318,
       0.07148588, 0.04350101, 0.06668991, 0.03742425, 0.06927677,
       0.03275784, 0.07929271]), 'param_tfidf__use_idf': masked_array(data=[True, True, True, True, True, True, False, False,
                   False, False, False, False],
             mask=[False, False, False, False, False, False, False, False,
                   False, False, False, False

In [310]:
links = glob('./Paginas/PaginasBaseline/*')
baseline_data = [get_text_from_link(link) for link in links[:10]]

In [312]:
baseline_data[0][0]

'./Paginas/PaginasBaseline/www.wizardscupboard.com441.html'

In [289]:
data = [' '.join(extract_features(data)) for data in baseline_data]

In [291]:
result = best_clf.predict(data)

In [301]:
relevants = [r for r in result if r == 1]
df = []
df.append(dict(zip(['Total de Paginas', 'Relevantes', 'Ratio'], [len(result), len(relevants), len(relevants) / len(result)])))
pd.DataFrame(df)

Unnamed: 0,Ratio,Relevantes,Total de Paginas
0,0.341582,2733,8001


In [335]:
def extract_from_any_link(link, soup):
    try:
        if 'capefeargames' in link:
            return extract_capefeargames(soup)
        if 'cardkingdom' in link:
            return extract_cardkingdom (soup)
        if 'mtgotraders' in link:
            return extract_mtgotraders(soup)
        if 'starcitygames' in link:
            return extract_starcitygames(soup)
        if 'wizardscupboard' in link:
            return extract_wizardscupboard(soup)
        if 'abugames' in link:
            return extractor_abugames(soup)
        if 'scryfall' in link:
            return extract_scryfall(soup)
        if 'mtgmintcards' in link:
            return extract_mtgmintcards(soup)
    except:
        return None
    return None

In [336]:
for v in list(zip(links, result)):
    if v[1] == 0:
        continue
    html = open(v[0], "r")
    soup = BeautifulSoup(html)
    print(extract_from_any_link(v[0], soup))

None
{'name': 'Bake into a Pie *Foil*', 'desc': '', 'price': '$0.02', 'infos': {'Set:\xa0': 'Throne of Eldraine', 'Mana Cost:': '2BlackBlack', 'Card Type:': 'Instant', 'Description:': 'Destroy target creature. Create a Food token. (It\'s an artifact with "2, Tap, Sacrifice this artifact: You gain 3 life.")'}}
None
None
None
None
None
None
{'name': "Player's Choice Sleeves 60", 'price': '$5.00', 'infos': {}}
None
None
None
None
None
{'name': 'Modern Horizons: Wrenn and Six', 'desc': '[+1]: Return up to one target land card from your graveyard to your hand.\n[-1]: Wrenn and Six deals 1 damage to any target.\n[-7]: You get an emblem with "Instant and sorcery cards in your graveyard have retrace."\n|3| \n Pro Tip!\n\nWrenn and Six aren\'t just making an impact on Modern - they see play in Legacy, too. Try Wrenn and Six in your Life from the Loam deck, or use it to recur your creature-lands in Jund.', 'price': '$99.99', 'infos': {'Edition': 'Modern Horizons', 'Type': 'Legendary Planeswalker

None
None
None
None
{'name': 'Aether Revolt Booster Pack', 'price': '$4.25', 'infos': {}}
None
None
{'name': 'Amplifire *Foil*', 'desc': '', 'price': '$0.34', 'infos': {'Set:\xa0': 'Ravnica Allegiance', 'Mana Cost:': '2RedRed', 'Card Type:': 'Creature - Elemental', 'Description:': "At the beginning of your upkeep, reveal cards from the top of your library until you reveal a creature card. Until your next turn, Amplifire's base power becomes twice that card's power and its base toughness becomes twice that card's toughness. Put the revealed cards on the bottom of your library in a random order.", 'Qty:': '5'}}
{'Titulo': "Healer's Headdress\n            {2}", 'Set': 'Fifth Dawn (5DN)', 'Categoria': 'English', 'Raridade': ['Common'], 'Descrição': 'Equipped creature gets +0/+2 and has “{T}: Prevent the next 1 damage that would be dealt to any target this turn.”\n{W}{W}: Attach Healer’s Headdress to target creature you control.\nEquip {1} ({1}: Attach to target creature you control. Equip 

KeyboardInterrupt: 

In [362]:
links = glob('./Paginas/PaginasHeuristica/*')
heuristica_data = [get_text_from_link(link) for link in links ]

In [363]:
data = [' '.join(extract_features(data)) for data in heuristica_data]

In [364]:
result_h = best_clf.predict(data)

In [365]:
relevants = [r for r in result_h if r == 1]
df = []
df.append(dict(zip(['Total de Paginas', 'Relevantes', 'Ratio'], [len(result_h), len(relevants), len(relevants) / len(result_h)])))
pd.DataFrame(df)

Unnamed: 0,Ratio,Relevantes,Total de Paginas
0,0.697757,4885,7001


# Content Extraction

## Capefeargames

In [323]:
urls = ["https://www.capefeargames.com/catalog/magic_singles-throne_of_eldraine/oko_thief_of_crowns/606112",
"https://www.capefeargames.com/buylist/magic_singles-masters_editions-modern_masters_2017/scalding_tarn/439123",
"https://www.capefeargames.com/buylist/magic_singles-core_set_2020/chandra_awakened_inferno/601947",
"https://www.capefeargames.com/catalog/magic_singles-throne_of_eldraine/the_circle_of_loyalty/606155",
"https://www.capefeargames.com/catalog/magic_singles-ravnica_allegiance-ravnica_allegiance/bedevil__foil/593030",
"https://www.capefeargames.com/catalog/magic_singles-core_set_2019/arcades_the_strategist/554263",
"https://www.capefeargames.com/catalog/magic_singles-core_set_2019/goblin_instigator__foil/552313",
"https://www.capefeargames.com/catalog/magic_singles-dominaria/jhoira_weatherlight_captain__foil/520393",
"https://www.capefeargames.com/catalog/magic_singles-dominaria/knight_of_grace__foil/520593",
"https://www.capefeargames.com/catalog/magic_singles-dominaria/karn_scion_of_urza/520483"]

urls = [encode_url_to_file(url) + ".html" for url in urls]

In [324]:
def extract_capefeargames(soup):
    root_div = soup.find("div", class_="product-info")
    title = root_div.find('h1').text
    price_span = root_div.find('span', class_='price')
    price = re.sub('[\s+]', '', price_span.text) if price_span != None else ''
    infos_box = root_div.find_all('div', class_='info-box')
    description = infos_box[1].find('p').text
    extra_infos = infos_box[2].find('p')
    infos = {}
    key = ""
    value = ""
    for child in extra_infos.children:
        if child.name == None:
            continue
        if key == "": 
            key = child.text
        else:
            value = value + child.text + " "
        if child.name == "br":
            infos[key] = value.strip()
            key = ""
            value = ""
    return {"name": title, "desc": description, "price": price, "infos": infos}

In [325]:
for url in urls: 
    html = open("./train-set/" + url, "r")
    soup = BeautifulSoup(html)
    print(extract_capefeargames(soup))

{'name': 'Oko, Thief of Crowns', 'desc': '\nNo description for this product.\n', 'price': '', 'infos': {'Color': 'Multi-Color', 'Card Text': '[+2]: Create a Food Token.[+1]: Target artifact or creature loses all abilities and becomes a green Elk creature with base power and toughness 3/3.[-5]: Exchange control of target artifact or creature you control and target creature an opponent controls with power 3 or less.', 'Rarity': 'M', 'Cost': '1GU', 'Pow/Tgh': '4', 'Card Type': 'Legendary Planeswalker - Oko', 'Artist': 'Yongjae Choi', 'Name': 'Oko Thief of Crowns', 'Finish': 'Regular', 'Card Number': '197/269', 'Set Name': 'Throne of Eldraine', 'SetTag': '', 'CM15': 'ELD'}}
{'name': 'Scalding Tarn', 'desc': '\nNo description for this product.\n', 'price': '$50.00/$62.50', 'infos': {'Color': 'Land', 'Card Text': 'T Pay 1 life Sacrifice Scalding Tarn: Search your library for an Island or Mountain card and put it onto the battlefield. Then shuffle your library.', 'Rarity': 'R', 'Cost': '', 'P

## Mtgotraders

In [88]:
urls = ["https://www.mtgotraders.com/store/M20_Act_of_Treason.html",
"https://www.mtgotraders.com/store/M20_Ajani_Strength_of_the_Pride.html",
"https://www.mtgotraders.com/store/M20_Angelic_Gift.html",
"https://www.mtgotraders.com/store/M20_Barony_Vampire.html",
"https://www.mtgotraders.com/store/M20_Barkhide_Troll.html",
"https://www.mtgotraders.com/store/GRN_Assassins_Trophy.html",
"https://www.mtgotraders.com/store/GRN_Assure_Assemble.html",
"https://www.mtgotraders.com/store/GRN_Bartizan_Bats.html",
"https://www.mtgotraders.com/store/XLN_Angraths_Marauders.html",
"https://www.mtgotraders.com/store/XLN_Axis_of_Mortality.html"]

urls = [encode_url_to_file(url) + ".html" for url in urls]

In [103]:
def get_text_from_contents(contents):
    text = ''
    for content in contents:
        if content.name == 'img':
            text = text + content.get('alt')
        elif content.name == None:
            text = text + content
        else:
            text = text + get_text_from_contents(content.contents)
    return text
def extract_mtgotraders(soup):
    root_div = soup.find('div', class_='product-info')
    name = root_div.find('h1').text
    price = root_div.find('span', class_='price').text
    dl = root_div.find('div', class_='product-addtocart').find('dl')
    keys = [dt.text for dt in dl.find_all('dt')]
    values = [get_text_from_contents(dd.contents) for dd in dl.find_all('dd')]
    infos = dict(zip(keys, values))
    return {'name': name, 'desc': '', 'price': price, 'infos': infos}

{'name': 'Act of Treason', 'desc': '', 'price': '$0.01', 'infos': {'Set:\xa0': 'Core Set 2020', 'Mana Cost:': '2Red', 'Card Type:': 'Sorcery', 'Description:': 'Gain control of target creature until end of turn. Untap that creature. It gains haste until end of turn. (It can attack and Tap this turn.)', 'Qty:': '30'}}
{'name': 'Ajani, Strength of the Pride', 'desc': '', 'price': '$9.80', 'infos': {'Set:\xa0': 'Core Set 2020', 'Mana Cost:': '2WhiteWhite', 'Card Type:': 'Legendary Planeswalker - Ajani', 'Description:': '+1: You gain life equal to the number of creatures you control plus the number of planeswalkers you control.-2: Create a 2/2 white Cat Soldier creature token named Ajani\'s Pridemate with "Whenever you gain life, put a +1/+1 counter on Ajani\'s Pridemate."0: If you have at least 15 life more than your starting life total, exile Ajani, Strength of the Pride and each artifact and creature your opponents control.', 'Qty:': '7'}}
{'name': 'Angelic Gift', 'desc': '', 'price': '$

In [None]:
for url in urls:    
    html = open("./train-set/" + url, "r")
    soup = BeautifulSoup(html)
    print(extract_mtgotraders(soup))

## Wizardscupboard

In [105]:
urls = ["https://www.wizardscupboard.com/grisly-spectacle-p-41020.html",
"https://www.wizardscupboard.com/jace-architect-thought-p-37824.html",
"https://www.wizardscupboard.com/deathhood-cobra-p-41032.html",
"https://www.wizardscupboard.com/cephalid-broker-p-49589.html",
"https://www.wizardscupboard.com/charmbreaker-devils-p-49665.html",
"https://www.wizardscupboard.com/benevolent-ancestor-p-49557.html",
"https://www.wizardscupboard.com/adderstaff-boggart-p-22490.html",
"https://www.wizardscupboard.com/ajani-goldmane-p-22492.html",
"https://www.wizardscupboard.com/deadeye-quartermaster-p-50094.html",
"https://www.wizardscupboard.com/colossal-dreadmaw-p-50227.html"]

urls = [encode_url_to_file(url) + ".html" for url in urls]

In [153]:
def extract_wizardscupboard(soup):
    pageHeading = soup.find_all('td', class_='pageHeading')
    name = pageHeading[0].text.strip()
    price = pageHeading[1].text.strip()
    main = soup.find('td', class_='main')
    infos = {}
    key = ''
    for i, child in enumerate(main.children):
        if key != '':
            infos[key] = child
            key = ''
            continue
        if child.name != 'b':
            continue
        if child.text == 'color:':
            key = 'color'
        if child.text == 'rarity:':
            key = 'rarity'
        if child.text == 'card type:':
            key = 'card type'
        if child.text == 'artist(s):':
            key = 'artists'
    return {'name': name, 'price': price, 'infos': infos}

In [156]:
for url in urls:    
    html = open("./train-set/" + url, "r")
    soup = BeautifulSoup(html)
    print(extract_wizardscupboard(soup))

{'name': 'Grisly Spectacle - Jace vs. Vraska', 'price': '$0.15', 'infos': {'color': ' Black', 'rarity': ' Common', 'card type': ' Instant', 'artists': ' Zoltan Boros'}}
{'name': 'Jace, Architect of Thought (JvV) - Duel Decks Magic the Gathering Single', 'price': '$1.25', 'infos': {'color': ' Blue', 'rarity': ' Mythic Rare', 'card type': ' Planeswalker', 'artists': ' '}}
{'name': 'Death-Hood Cobra - Jace vs. Vraska', 'price': '$0.15', 'infos': {'color': ' Green', 'rarity': ' Common', 'card type': ' Creature', 'artists': ' Jason Felix'}}
{'name': 'Cephalid Broker - Iconic Masters', 'price': '$0.20', 'infos': {'color': ' Blue', 'rarity': ' Uncommon', 'card type': ' Creature', 'artists': ' '}}
{'name': 'Charmbreaker Devils - Iconic Masters', 'price': '$0.25', 'infos': {'color': ' Red', 'rarity': ' Rare', 'card type': ' Creature', 'artists': ' '}}
{'name': 'Benevolent Ancestor - Iconic Masters', 'price': '$0.10', 'infos': {'color': ' White', 'rarity': ' Common', 'card type': ' Creature', 'a

## Cardkingdom

In [160]:
urls = ["https://www.cardkingdom.com/mtg/core-set-2020/agonizing-syphon",
"https://www.cardkingdom.com/mtg/core-set-2020/air-elemental",
"https://www.cardkingdom.com/mtg/modern-horizons/alpine-guide",
"https://www.cardkingdom.com/mtg/modern-horizons/altar-of-dementia",
"https://www.cardkingdom.com/mtg/modern-horizons/angel-token",
"https://www.cardkingdom.com/mtg/ultimate-masters/aethersnipe",
"https://www.cardkingdom.com/mtg/ultimate-masters/all-is-dust",
"https://www.cardkingdom.com/mtg/commander-2015/blasted-landscape",
"https://www.cardkingdom.com/mtg/global-series-jiang-yanggu-mu-yanling/island",
"https://www.cardkingdom.com/mtg/commander-2019/akoum-refuge"]

urls = [encode_url_to_file(url) + ".html" for url in urls]

In [194]:
def extract_cardkingdom(soup): 
    main = soup.find('div', class_='mainWrapper')
    name = soup.find('h1').text
    price = main.find('span', class_='stylePrice').text.strip()
    trs = main.find('div', class_='cardDetailInfo').find('table').find_all('tr')
    infos = {}
    for tr in trs[:len(trs)-1]:
        tds = tr.find_all('td')
        if tds[0].text == 'Cast:':
            infos[tds[0].text[:len(tds[0].text)-1]] = ''.join([c.get('src')[len(c.get('src'))-5] for c in tds[1].contents])
        else:
            infos[tds[0].text[:len(tds[0].text)-1]] = tds[1].text.strip(' \n\t\r')
    desc = trs[len(trs)-1].find('td').text.strip(' \n\t\r')
    return {'name': name, 'desc': desc, 'price': price, 'infos': infos}

In [195]:
for url in urls:    
    html = open("./train-set/" + url, "r")
    soup = BeautifulSoup(html)
    print(extract_cardkingdom(soup))

{'name': 'Core Set 2020: Agonizing Syphon', 'desc': 'Agonizing Syphon deals 3 damage to any target and you gain 3 life.', 'price': '$0.25', 'infos': {'Edition': 'Core Set 2020', 'Type': 'Sorcery', 'Cast': '3b', 'Rarity': 'C'}}
{'name': 'Core Set 2020: Air Elemental', 'desc': 'Flying', 'price': '$0.25', 'infos': {'Edition': 'Core Set 2020', 'Type': 'Creature - Elemental', 'Cast': '3uu', 'Rarity': 'U', 'Pow/Tuf': '4/4'}}
{'name': 'Modern Horizons: Alpine Guide', 'desc': 'When Alpine Guide enters the battlefield, you may search your library for a Mountain card, put that card onto the battlefield tapped, then shuffle your library.\nAlpine Guide attacks each combat if able.\nWhen Alpine Guide leaves the battlefield, sacrifice a Mountain.', 'price': '$0.25', 'infos': {'Edition': 'Modern Horizons', 'Type': 'Snow Creature - Human Scout', 'Cast': '2r', 'Rarity': 'U', 'Pow/Tuf': '3/3'}}
{'name': 'Modern Horizons: Altar of Dementia', 'desc': "Sacrifice a creature: Target player puts a number of c

## Starcitygames

In [198]:
urls = ["http://www.starcitygames.com/catalog/magic_the_gathering/product/1385121",
"http://www.starcitygames.com/catalog/magic_the_gathering/product/1423921",
"http://www.starcitygames.com/catalog/magic_the_gathering/product/7655/43686",
"http://www.starcitygames.com/catalog/magic_the_gathering/product/1264395",
"http://www.starcitygames.com/catalog/magic_the_gathering/product/57426/165797",
"http://www.starcitygames.com/catalog/magic_the_gathering/product/57204",
"http://www.starcitygames.com/catalog/magic_the_gathering/product/57462",
"http://www.starcitygames.com/catalog/magic_the_gathering/product/57556",
"http://www.starcitygames.com/catalog/magic_the_gathering/product/6015/27865",
"http://www.starcitygames.com/catalog/magic_the_gathering/product/6086"]

urls = [encode_url_to_file(url) + ".html" for url in urls]

In [271]:
def get_text_from_value(value): 
    if value.name == None:
        return value.strip()
    if value.name == 'i' and value.get('class') != None and len(value.get('class')) == 3:
        return value.get('class')[1][len(value.get('class')[1])-1]
    else:
        return ''.join([get_text_from_value(c) for c in value.contents])
    
def extract_starcitygames(soup): 
    table = soup.find('table')
    name = table.find('h2').text
    div = table.find('div')
    price = div.find('span', class_='articletext').find('div').text
    parent = div.find('div', class_='card_desc_details').find('strong').parent
    infos = {}
    key = ''
    value = ''
    for i, child in enumerate(parent.children):
        if child.name == 'br' and key != '':
            infos[key] = value.strip()
            key = ''
            value = ''
            continue
        if key != '':
            value = value + ' ' + get_text_from_value(child)
        if child.name == 'strong':
            key = child.text[:len(child.text)-1]
    return {'name': name, 'price': price, 'infos': infos}

In [272]:
for url in urls:    
    html = open("./train-set/" + url, "r")
    soup = BeautifulSoup(html)
    print(extract_starcitygames(soup))

{'name': 'Forest', 'price': 'Price: $0.25', 'infos': {'Card Type': 'Basic Land', 'Subtype': 'Forest', 'Card Text': 'G', 'Oracle Text': 'G', 'Artist': 'Dimitar', 'Rarity': 'Basic Land'}}
{'name': 'Mountain', 'price': 'Price: $0.25', 'infos': {'Card Type': 'Basic Land', 'Subtype': 'Mountain', 'Card Text': 'R', 'Oracle Text': 'R', 'Artist': 'Titus Lunter', 'Rarity': 'Basic Land'}}
{'name': 'Blasted Landscape', 'price': 'Price: $0.99', 'infos': {'Card Type': 'Land', 'Card Text': 'ocT: Add one colorless mana to your mana pool.', 'Oracle Text': 'Tap: Add C.', 'Artist': 'Ciruelo', 'Rarity': 'Uncommon'}}
{'name': 'Brazen Scourge', 'price': 'Price: $0.25', 'infos': {'Card Type': 'Creature', 'Creature Type': 'Gremlin', 'Power/Toughness': '3/3', 'Casting Cost': '1 r r', 'Card Text': 'Haste', 'Oracle Text': 'Haste', 'Flavor Text': 'Driven by an insatiable hunger, gremlins never cease feeding. They can destroy a lifetime of work in just a few moments.', 'Artist': 'Kev Walker', 'Rarity': 'Uncommon'}

## Mtgmintcards

In [None]:
urls = []

urls = [encode_url_to_file(url) + ".html" for url in urls]

In [None]:
def extract_mtgmintcards(url):
    titulo = soup.find('h1',class_='pd-card-name').text
    sets = soup.find('span',{'itemprop':'model'}).text
    categoria = soup.find('span',{'itemprop':"category"}).text
    preço = '$ ' + soup.find('span',{'itemprop':'price'}).text
    divInfo = soup.find('div',class_="col-sm-6",style="margin-top: 4px;")
    raridade = divInfo.find('span',{'itemprop':'model'}).text
    descrição = divInfo.find('span',{'itemprop':'description'}).text
    marca = divInfo.find('span',{'itemprop':"brand"}).text
    
    return infoMtgmintCard.append(dict(zip(['Titulo','Set','Categoria','Preço','Raridade','Marca','Descrição'],[titulo,sets,categoria,preço,raridade,marca,descrição])))


In [None]:
for url in urls:
    Extract_MtgMintCards(url)

In [None]:
data = pd.DataFrame(infoMtgmintCard,columns=['Titulo','Set','Categoria','Preço','Raridade','Marca','Descrição'])
data

## Scryfall

In [327]:
urls = ["https://scryfall.com/card/gk2/60/auger-spree",
"https://scryfall.com/card/gk2/7/archon-of-the-triumvirate",
"https://scryfall.com/card/c19/227/ash-barrens",
"https://scryfall.com/card/mh1/195/cloudshredder-sliver",
"https://scryfall.com/card/c19/227/ash-barrens",
"https://scryfall.com/card/c19/37/anje-falkenrath",
"https://scryfall.com/card/c19/226/akoum-refuge",
"https://scryfall.com/card/eld/112/barge-in",
"https://scryfall.com/card/eld/38/animating-faerie-bring-to-life",
"https://scryfall.com/card/eld/324/alela-artful-provocateur"]

urls = [encode_url_to_file(url) + ".html" for url in urls]

In [330]:
def extract_scryfall(soup):
    divInfo = soup.find('div',class_="card-text")
    titulo = divInfo.find('h1',class_='card-text-title').text.strip(" \n\t\r")
    descrição = divInfo.find('div',class_='card-text-oracle').text.strip(" \n\t\r")
    sets = soup.find('span',class_='prints-current-set-name').text.strip(" \n\t\r")
    raridadeCategoria = soup.find('span',class_='prints-current-set-details').text.strip(" \n\t\r")
    raridadeCategoria = re.findall("[a-zA-Z0-9]+", raridadeCategoria)
    raridade = raridadeCategoria[1:-1]
    categoria = raridadeCategoria[-1]
    
    return dict(zip(['Titulo','Set','Categoria','Raridade','Descrição'],[titulo,sets,categoria,raridade,descrição]))

In [331]:
for url in urls:    
    html = open("./train-set/" + url, "r")
    soup = BeautifulSoup(html)
    print(extract_scryfall(soup))

{'Titulo': 'Auger Spree\n            {1}{B}{R}', 'Set': 'RNA Guild Kit (GK2)', 'Categoria': 'English', 'Raridade': ['Common'], 'Descrição': 'Target creature gets +4/-4 until end of turn.'}
{'Titulo': 'Archon of the Triumvirate\n            {5}{W}{U}', 'Set': 'RNA Guild Kit (GK2)', 'Categoria': 'English', 'Raridade': ['Rare'], 'Descrição': 'Flying\nWhenever Archon of the Triumvirate attacks, detain up to two target nonland permanents your opponents control. (Until your next turn, those permanents can’t attack or block and their activated abilities can’t be activated.)'}
{'Titulo': 'Ash Barrens', 'Set': 'Commander 2019 (C19)', 'Categoria': 'English', 'Raridade': ['Common'], 'Descrição': '{T}: Add {C}.\nBasic landcycling {1} ({1}, Discard this card: Search your library for a basic land card, reveal it, put it into your hand, then shuffle your library.)'}
{'Titulo': 'Cloudshredder Sliver\n            {R}{W}', 'Set': 'Modern Horizons (MH1)', 'Categoria': 'English', 'Raridade': ['Rare'], 'De

## Abugames

In [332]:
urls = ["https://abugames.com/magic-the-gathering/singles/product-detail/Ainok-Survivalist/Commander-2019/English",
"https://abugames.com/magic-the-gathering/singles/product-detail/Vivien's-Crocodile/Core-Set-2020-%2F-M20/English",
"https://abugames.com/magic-the-gathering/singles/product-detail/Vivien,-Arkbow-Ranger/Core-Set-2020-%2F-M20/English",
"https://abugames.com/magic-the-gathering/singles/product-detail/Zephyr-Charge/Core-Set-2020-%2F-M20/English",
"https://abugames.com/magic-the-gathering/singles/product-detail/Bladebrand/Core-Set-2020-%2F-M20/English",
"https://abugames.com/magic-the-gathering/singles/product-detail/Bishop-of-Wings-%252D-%2528Promo-Pack%2529/Core-Set-2020-%2F-M20/English",
"https://abugames.com/magic-the-gathering/singles/product-detail/Agent-of-Treachery/Core-Set-2020-%2F-M20/English",
"https://abugames.com/magic-the-gathering/singles/product-detail/Ash-Barrens/Commander-2019/English",
"https://abugames.com/magic-the-gathering/singles/product-detail/Anje's-Ravager/Commander-2019/English",
"https://abugames.com/magic-the-gathering/singles/product-detail/Akoum-Refuge/Commander-2019/English"]

urls = [encode_url_to_file(url) + ".html" for url in urls]

In [333]:
def extractor_abugames(soup):    
    divInfo = soup.find('div',class_='row first')
    titulo = divInfo.find('h1').text
    sets = divInfo.find('div',class_="col-md-2 col-sm-2 col-xs-4").text
    descrição = soup.find('div',{'id':"originalTextundefined"}).text

    info = soup.find('div',class_="tableHere ng-star-inserted")
    info.text.split()

    raridade = info.text.split()[-1]
    raridade = raridade[0:]

    tipo = info.text.split()[1:4]

    return infoAbu.append(dict(zip(['Titulo','Set','Raridade','Descrição','Tipo'],[titulo,sets,raridade,descrição,tipo])))

In [334]:
for url in urls:    
    html = open("./train-set/" + url, "r")
    soup = BeautifulSoup(html)
    print(extractor_abugames(soup))

AttributeError: 'NoneType' object has no attribute 'find'