In [6]:
import nltk
import requests
from glob import glob
import re
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

%matplotlib inline

from bs4 import BeautifulSoup
from nltk.corpus import stopwords

In [7]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/luizreis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Downloading Data

In [8]:
csv = pd.read_csv("./conjunto-treinamento-classificador.csv")

## Feature Extraction

In [9]:
def encode_url_to_file(url): 
    return re.sub("\/", "-", url)
def get_html_text(url): 
    page = requests.get(url)
    with open("./train-set/" + encode_url_to_file(url) + ".html", 'w' , encoding='utf-8') as outfile:
        outfile.write(page.text)
    soup = BeautifulSoup(page.content)
    return soup.get_text()

def get_text_from_link(link): 
    html = open(link, "r")
    soup = BeautifulSoup(html)
    return soup.get_text()

In [10]:
en_stopwords = set(stopwords.words("english"))

def extract_features(text):     
    data = re.sub("[^a-zA-Z]", " ", text).lower()
    words = data.split() 
    return [w for w in words if not w in en_stopwords]

In [11]:
bag = csv["urls"].apply(get_html_text)

In [114]:
links = sorted(glob('./train-set/*.html'), key=os.path.getmtime)
print(len(links))
bag = [get_text_from_link(url) for url in links]

139


KeyboardInterrupt: 

In [109]:
bag = [' '.join(extract_features(b)) for b in bag]

## Preprocessing

## Pipeline

In [110]:
def train(clf, data, target):
    pipe_clf = Pipeline([
        ('vect', CountVectorizer(analyzer = "word",
                                 tokenizer = None,    
                                 preprocessor = None,
                                 stop_words = None)),
        ('tfidf', TfidfTransformer()),
        ('clf', clf),
    ])
    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'vect__max_features': [500, 1000, 5000],
        'tfidf__use_idf': (True, False),
        #'clf__alpha': (1e-2, 1e-3),
    }
    gs_clf = GridSearchCV(pipe_clf, parameters, cv=10, iid=False, n_jobs=-1)
    return gs_clf.fit(data, target)
    

In [111]:
clfs = [SVC(), MultinomialNB(), SGDClassifier(), LogisticRegression(), tree.DecisionTreeClassifier(), MLPClassifier()]

In [113]:
print(len(csv["target"].tolist()))

140


In [91]:

for clf in clfs:
    clf = train(clf, bag, csv["target"].tolist())
    print(clf.best_score_)

ValueError: Found input variables with inconsistent numbers of samples: [139, 160]

## Test with Baseline

In [46]:
links = glob('./Paginas/PaginasBaseline/*')
baseline_data = [get_text_from_link(link) for link in links ]

AttributeError: 'list' object has no attribute 'apply'

In [48]:
data = [' '.join(extract_features(data)) for data in baseline_data]

'mtg booster boxes buy magic gathering booster boxes wizards cupboard cdata var cot loc window location protocol https https secure comodo net trustlogo javascript cot js http www trustlogo com trustlogo javascript cot js document writeln scr ipt language javascript src cot loc type text javascript scr ipt home magic gathering booster boxes account cart contents checkout quick find use keywords find product try theadvanced search categories singles foils setsbooster boxesbooster packsother sealed productspecialty itemsaccessoriesother ccg salternate art cards information blogdragon sculpturescompany detailsshipping returnstwc ramblingsnews archivegradingf q contact us tools sell uslink us articles articlescard combos decks general magic articles author please selectgabriel showersj turnerjustin vizarothe wizard cupboar promotion something special way comes var ans customer id ad f b ca c credit card services buy magic gathering booster boxes wizards cupboard largest selection mtg boost

# Content Extraction

## Capefeargames

In [67]:
urls = ["https://www.capefeargames.com/catalog/magic_singles-throne_of_eldraine/oko_thief_of_crowns/606112",
"https://www.capefeargames.com/buylist/magic_singles-masters_editions-modern_masters_2017/scalding_tarn/439123",
"https://www.capefeargames.com/buylist/magic_singles-core_set_2020/chandra_awakened_inferno/601947",
"https://www.capefeargames.com/catalog/magic_singles-throne_of_eldraine/the_circle_of_loyalty/606155",
"https://www.capefeargames.com/catalog/magic_singles-ravnica_allegiance-ravnica_allegiance/bedevil__foil/593030",
"https://www.capefeargames.com/catalog/magic_singles-core_set_2019/arcades_the_strategist/554263",
"https://www.capefeargames.com/catalog/magic_singles-core_set_2019/goblin_instigator__foil/552313",
"https://www.capefeargames.com/catalog/magic_singles-dominaria/jhoira_weatherlight_captain__foil/520393",
"https://www.capefeargames.com/catalog/magic_singles-dominaria/knight_of_grace__foil/520593",
"https://www.capefeargames.com/catalog/magic_singles-dominaria/karn_scion_of_urza/520483"]

urls = [encode_url_to_file(url) + ".html" for url in urls]

In [86]:
def extract_capefeargames(soup):
    root_div = soup.find("div", class_="product-info")
    title = root_div.find('h1').text
    price_span = root_div.find('span', class_='price')
    price = re.sub('[\s+]', '', price_span.text) if price_span != None else ''
    infos_box = root_div.find_all('div', class_='info-box')
    description = infos_box[1].find('p').text
    extra_infos = infos_box[2].find('p')
    infos = {}
    key = ""
    value = ""
    for child in extra_infos.children:
        if child.name == None:
            continue
        if key == "": 
            key = child.text
        else:
            value = value + child.text + " "
        if child.name == "br":
            infos[key] = value.strip()
            key = ""
            value = ""
    return {"name": title, "desc": description, "price": price, "infos": infos}

In [87]:
for url in urls: 
    html = open("./train-set/" + url, "r")
    soup = BeautifulSoup(html)
    print(extract_capefeargames(soup)["price"])


$50.00/$62.50
$8.00/$10.00

$4.95
$3.95

$7.55
$0.56



## Mtgotraders

In [88]:
urls = ["https://www.mtgotraders.com/store/M20_Act_of_Treason.html",
"https://www.mtgotraders.com/store/M20_Ajani_Strength_of_the_Pride.html",
"https://www.mtgotraders.com/store/M20_Angelic_Gift.html",
"https://www.mtgotraders.com/store/M20_Barony_Vampire.html",
"https://www.mtgotraders.com/store/M20_Barkhide_Troll.html",
"https://www.mtgotraders.com/store/GRN_Assassins_Trophy.html",
"https://www.mtgotraders.com/store/GRN_Assure_Assemble.html",
"https://www.mtgotraders.com/store/GRN_Bartizan_Bats.html",
"https://www.mtgotraders.com/store/XLN_Angraths_Marauders.html",
"https://www.mtgotraders.com/store/XLN_Axis_of_Mortality.html"]

urls = [encode_url_to_file(url) + ".html" for url in urls]

In [103]:
def get_text_from_contents(contents):
    text = ''
    for content in contents:
        if content.name == 'img':
            text = text + content.get('alt')
        elif content.name == None:
            text = text + content
        else:
            text = text + get_text_from_contents(content.contents)
    return text
def extract_mtgotraders(soup):
    root_div = soup.find('div', class_='product-info')
    name = root_div.find('h1').text
    price = root_div.find('span', class_='price').text
    dl = root_div.find('div', class_='product-addtocart').find('dl')
    keys = [dt.text for dt in dl.find_all('dt')]
    values = [get_text_from_contents(dd.contents) for dd in dl.find_all('dd')]
    infos = dict(zip(keys, values))
    return {'name': name, 'desc': '', 'price': price, 'infos': infos}

{'name': 'Act of Treason', 'desc': '', 'price': '$0.01', 'infos': {'Set:\xa0': 'Core Set 2020', 'Mana Cost:': '2Red', 'Card Type:': 'Sorcery', 'Description:': 'Gain control of target creature until end of turn. Untap that creature. It gains haste until end of turn. (It can attack and Tap this turn.)', 'Qty:': '30'}}
{'name': 'Ajani, Strength of the Pride', 'desc': '', 'price': '$9.80', 'infos': {'Set:\xa0': 'Core Set 2020', 'Mana Cost:': '2WhiteWhite', 'Card Type:': 'Legendary Planeswalker - Ajani', 'Description:': '+1: You gain life equal to the number of creatures you control plus the number of planeswalkers you control.-2: Create a 2/2 white Cat Soldier creature token named Ajani\'s Pridemate with "Whenever you gain life, put a +1/+1 counter on Ajani\'s Pridemate."0: If you have at least 15 life more than your starting life total, exile Ajani, Strength of the Pride and each artifact and creature your opponents control.', 'Qty:': '7'}}
{'name': 'Angelic Gift', 'desc': '', 'price': '$

In [None]:
for url in urls:    
    html = open("./train-set/" + url, "r")
    soup = BeautifulSoup(html)
    print(extract_mtgotraders(soup))

## Wizardscupboard

In [105]:
urls = ["https://www.wizardscupboard.com/grisly-spectacle-p-41020.html",
"https://www.wizardscupboard.com/jace-architect-thought-p-37824.html",
"https://www.wizardscupboard.com/deathhood-cobra-p-41032.html",
"https://www.wizardscupboard.com/cephalid-broker-p-49589.html",
"https://www.wizardscupboard.com/charmbreaker-devils-p-49665.html",
"https://www.wizardscupboard.com/benevolent-ancestor-p-49557.html",
"https://www.wizardscupboard.com/adderstaff-boggart-p-22490.html",
"https://www.wizardscupboard.com/ajani-goldmane-p-22492.html",
"https://www.wizardscupboard.com/deadeye-quartermaster-p-50094.html",
"https://www.wizardscupboard.com/colossal-dreadmaw-p-50227.html"]

urls = [encode_url_to_file(url) + ".html" for url in urls]

In [153]:
def extract_wizardscupboard(soup):
    pageHeading = soup.find_all('td', class_='pageHeading')
    name = pageHeading[0].text.strip()
    price = pageHeading[1].text.strip()
    main = soup.find('td', class_='main')
    infos = {}
    key = ''
    for i, child in enumerate(main.children):
        if key != '':
            infos[key] = child
            key = ''
            continue
        if child.name != 'b':
            continue
        if child.text == 'color:':
            key = 'color'
        if child.text == 'rarity:':
            key = 'rarity'
        if child.text == 'card type:':
            key = 'card type'
        if child.text == 'artist(s):':
            key = 'artists'
    return {'name': name, 'price': price, 'infos': infos}

In [156]:
for url in urls:    
    html = open("./train-set/" + url, "r")
    soup = BeautifulSoup(html)
    print(extract_wizardscupboard(soup))

{'name': 'Grisly Spectacle - Jace vs. Vraska', 'price': '$0.15', 'infos': {'color': ' Black', 'rarity': ' Common', 'card type': ' Instant', 'artists': ' Zoltan Boros'}}
{'name': 'Jace, Architect of Thought (JvV) - Duel Decks Magic the Gathering Single', 'price': '$1.25', 'infos': {'color': ' Blue', 'rarity': ' Mythic Rare', 'card type': ' Planeswalker', 'artists': ' '}}
{'name': 'Death-Hood Cobra - Jace vs. Vraska', 'price': '$0.15', 'infos': {'color': ' Green', 'rarity': ' Common', 'card type': ' Creature', 'artists': ' Jason Felix'}}
{'name': 'Cephalid Broker - Iconic Masters', 'price': '$0.20', 'infos': {'color': ' Blue', 'rarity': ' Uncommon', 'card type': ' Creature', 'artists': ' '}}
{'name': 'Charmbreaker Devils - Iconic Masters', 'price': '$0.25', 'infos': {'color': ' Red', 'rarity': ' Rare', 'card type': ' Creature', 'artists': ' '}}
{'name': 'Benevolent Ancestor - Iconic Masters', 'price': '$0.10', 'infos': {'color': ' White', 'rarity': ' Common', 'card type': ' Creature', 'a

## Cardkingdom

In [160]:
urls = ["https://www.cardkingdom.com/mtg/core-set-2020/agonizing-syphon",
"https://www.cardkingdom.com/mtg/core-set-2020/air-elemental",
"https://www.cardkingdom.com/mtg/modern-horizons/alpine-guide",
"https://www.cardkingdom.com/mtg/modern-horizons/altar-of-dementia",
"https://www.cardkingdom.com/mtg/modern-horizons/angel-token",
"https://www.cardkingdom.com/mtg/ultimate-masters/aethersnipe",
"https://www.cardkingdom.com/mtg/ultimate-masters/all-is-dust",
"https://www.cardkingdom.com/mtg/commander-2015/blasted-landscape",
"https://www.cardkingdom.com/mtg/global-series-jiang-yanggu-mu-yanling/island",
"https://www.cardkingdom.com/mtg/commander-2019/akoum-refuge"]

urls = [encode_url_to_file(url) + ".html" for url in urls]

In [194]:
def extract_cardkingdom(soup): 
    main = soup.find('div', class_='mainWrapper')
    name = soup.find('h1').text
    price = main.find('span', class_='stylePrice').text.strip()
    trs = main.find('div', class_='cardDetailInfo').find('table').find_all('tr')
    infos = {}
    for tr in trs[:len(trs)-1]:
        tds = tr.find_all('td')
        if tds[0].text == 'Cast:':
            infos[tds[0].text[:len(tds[0].text)-1]] = ''.join([c.get('src')[len(c.get('src'))-5] for c in tds[1].contents])
        else:
            infos[tds[0].text[:len(tds[0].text)-1]] = tds[1].text.strip(' \n\t\r')
    desc = trs[len(trs)-1].find('td').text.strip(' \n\t\r')
    return {'name': name, 'desc': desc, 'price': price, 'infos': infos}

In [195]:
for url in urls:    
    html = open("./train-set/" + url, "r")
    soup = BeautifulSoup(html)
    print(extract_cardkingdom(soup))

{'name': 'Core Set 2020: Agonizing Syphon', 'desc': 'Agonizing Syphon deals 3 damage to any target and you gain 3 life.', 'price': '$0.25', 'infos': {'Edition': 'Core Set 2020', 'Type': 'Sorcery', 'Cast': '3b', 'Rarity': 'C'}}
{'name': 'Core Set 2020: Air Elemental', 'desc': 'Flying', 'price': '$0.25', 'infos': {'Edition': 'Core Set 2020', 'Type': 'Creature - Elemental', 'Cast': '3uu', 'Rarity': 'U', 'Pow/Tuf': '4/4'}}
{'name': 'Modern Horizons: Alpine Guide', 'desc': 'When Alpine Guide enters the battlefield, you may search your library for a Mountain card, put that card onto the battlefield tapped, then shuffle your library.\nAlpine Guide attacks each combat if able.\nWhen Alpine Guide leaves the battlefield, sacrifice a Mountain.', 'price': '$0.25', 'infos': {'Edition': 'Modern Horizons', 'Type': 'Snow Creature - Human Scout', 'Cast': '2r', 'Rarity': 'U', 'Pow/Tuf': '3/3'}}
{'name': 'Modern Horizons: Altar of Dementia', 'desc': "Sacrifice a creature: Target player puts a number of c

## Starcitygames

In [198]:
urls = ["http://www.starcitygames.com/catalog/magic_the_gathering/product/1385121",
"http://www.starcitygames.com/catalog/magic_the_gathering/product/1423921",
"http://www.starcitygames.com/catalog/magic_the_gathering/product/7655/43686",
"http://www.starcitygames.com/catalog/magic_the_gathering/product/1264395",
"http://www.starcitygames.com/catalog/magic_the_gathering/product/57426/165797",
"http://www.starcitygames.com/catalog/magic_the_gathering/product/57204",
"http://www.starcitygames.com/catalog/magic_the_gathering/product/57462",
"http://www.starcitygames.com/catalog/magic_the_gathering/product/57556",
"http://www.starcitygames.com/catalog/magic_the_gathering/product/6015/27865",
"http://www.starcitygames.com/catalog/magic_the_gathering/product/6086"]

urls = [encode_url_to_file(url) + ".html" for url in urls]

In [255]:
def get_text_from_value(value): 
    if value.name == None:
        return value.strip()
    if value.name == 'i' and len(value.get('class')) == 3:
        print(value)
        return value.get('class')[1][len(value.get('class')[1])-1]
    else:
        return ''.join([get_text_from_value(c) for c in value.contents])
    
def extract_starcitygames(soup): 
    table = soup.find('table')
    name = table.find('h2').text
    div = table.find('div')
    price = div.find('span', class_='articletext').find('div').text
    parent = div.find('div', class_='card_desc_details').find('strong').parent
    infos = {}
    key = ''
    value = ''
    for i, child in enumerate(parent.children):
        if child.name == 'br' and key != '':
            infos[key] = value.strip()
            key = ''
            value = ''
            continue
        if key != '':
            value = value + ' ' + get_text_from_value(child)
        if child.name == 'strong':
            key = child.text[:len(child.text)-1]
    return {'name': name, 'price': price, 'infos': infos}

In [256]:
for url in urls:    
    html = open("./train-set/" + url, "r")
    soup = BeautifulSoup(html)
    print(extract_starcitygames(soup))

{'name': 'Forest', 'price': 'Price: $0.25', 'infos': {'Card Type': 'Basic Land', 'Subtype': 'Forest', 'Card Text': 'G', 'Oracle Text': 'G', 'Artist': 'Dimitar', 'Rarity': 'Basic Land'}}
{'name': 'Mountain', 'price': 'Price: $0.25', 'infos': {'Card Type': 'Basic Land', 'Subtype': 'Mountain', 'Card Text': 'R', 'Oracle Text': 'R', 'Artist': 'Titus Lunter', 'Rarity': 'Basic Land'}}
{'name': 'Blasted Landscape', 'price': 'Price: $0.99', 'infos': {'Card Type': 'Land', 'Card Text': 'ocT: Add one colorless mana to your mana pool.', 'Oracle Text': 'Tap: Add C.', 'Artist': 'Ciruelo', 'Rarity': 'Uncommon'}}
<i class="ms ms-1 ms-cost"></i>
<i class="ms ms-r ms-cost"></i>
<i class="ms ms-r ms-cost"></i>
{'name': 'Brazen Scourge', 'price': 'Price: $0.25', 'infos': {'Card Type': 'Creature', 'Creature Type': 'Gremlin', 'Power/Toughness': '3/3', 'Casting Cost': '1 r r', 'Card Text': 'Haste', 'Oracle Text': 'Haste', 'Flavor Text': 'Driven by an insatiable hunger, gremlins never cease feeding. They can d

TypeError: object of type 'NoneType' has no len()