In [2]:
# import packages

import time
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup
from splinter import Browser

### MTGO Decklists Scraping

#### Find all Links to Decklists from Main MTGO Page

In [3]:
# Month of MTGO results to scrape; change these two values to pull a different month
DECKLIST_YEAR = 2024
DECKLIST_MONTH = 1

# Set up Splinter (opens a Chrome window; it is closed in the next cell once the HTML is captured)
browser = Browser('chrome')

# URL listing the MTGO event results for the chosen month, e.g. .../decklists/2024/01
url = f"https://www.mtgo.com/decklists/{DECKLIST_YEAR}/{DECKLIST_MONTH:02d}"
browser.visit(url)

# Parse the rendered page so the event links can be extracted without the browser
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [4]:
# Close the first browser: the page HTML is already captured in `soup`,
# so the Chrome window is no longer needed
browser.quit()

In [5]:
# Get all decklist items from main page
decklist_items = soup.find_all('li', class_='decklists-item')

# Build one record per event. 'event id' is the item's position on the page, assigned
# before filtering, so it still identifies the same event once non-league rows are dropped.
decklist_links = []
for event_id, item in enumerate(decklist_items):
    # Titles are read as "<format> <event type> ..."; the padding keeps a one-word
    # title from raising IndexError (its event type becomes None and is filtered out below)
    title_words = item.find('h3').text.split(' ') + [None]
    link = item.find('a', class_='decklists-link')['href']
    date = item.find('time')['datetime']
    decklist_links.append({
        'event id': event_id,
        'format': title_words[0],
        'event type': title_words[1],
        'link': 'https://www.mtgo.com' + link,
        'date': date,
    })

# Show the results of the scrape as a dataframe; naming the columns keeps the
# filter below working even when the page yielded no items
decklist_links_df = pd.DataFrame(
    decklist_links,
    columns=['event id', 'format', 'event type', 'link', 'date'],
)

# Filter to just league results
league_links_df = decklist_links_df[decklist_links_df['event type'] == 'League'].reset_index(drop=True)

# Reformat the date to a readable format while preserving the datetime dtype
league_links_df['date'] = pd.to_datetime(league_links_df['date']).dt.date.astype('datetime64[ns]')

# Save data to CSV, creating the output folder first so a fresh checkout does not fail
league_csv = Path("data/league_information.csv")
league_csv.parent.mkdir(parents=True, exist_ok=True)
league_links_df.to_csv(league_csv, index=False)

league_links_df


Unnamed: 0,event id,format,event type,link,date
0,3,Legacy,League,https://www.mtgo.com/decklist/legacy-league-20...,2024-01-27
1,4,Modern,League,https://www.mtgo.com/decklist/modern-league-20...,2024-01-27
2,5,Pauper,League,https://www.mtgo.com/decklist/pauper-league-20...,2024-01-27
3,6,Pioneer,League,https://www.mtgo.com/decklist/pioneer-league-2...,2024-01-27
4,7,Standard,League,https://www.mtgo.com/decklist/standard-league-...,2024-01-27
...,...,...,...,...,...
157,318,Modern,League,https://www.mtgo.com/decklist/modern-league-20...,2024-01-01
158,319,Pauper,League,https://www.mtgo.com/decklist/pauper-league-20...,2024-01-01
159,320,Pioneer,League,https://www.mtgo.com/decklist/pioneer-league-2...,2024-01-01
160,321,Standard,League,https://www.mtgo.com/decklist/standard-league-...,2024-01-01


In [6]:
# Confirm the dtypes, in particular that 'date' is datetime64[ns] after the conversion above
league_links_df.dtypes

event id               int64
format                object
event type            object
link                  object
date          datetime64[ns]
dtype: object

#### For each Link, visit and scrape the decklists

In [7]:
# WARNING ------------------------
# With MAX_EVENTS = None this cell visits every league page and takes 30+ minutes.
# Set MAX_EVENTS to a small number (e.g. 10) to limit the calls and test a smaller volume.
MAX_EVENTS = None

# Seconds to wait after loading a page, as the pages are slow to load
PAGE_LOAD_SECONDS = 10

# Extra wait-and-re-read attempts for a page that shows no decklists yet
EMPTY_PAGE_RETRIES = 2


def is_maindeck_list(tag):
    """Return True for a <ul> card list that is not the sideboard.

    This is a tag-level filter on purpose: a function passed as ``class_=`` is tested
    against each CSS class on its own, so it cannot reject a tag for carrying the
    extra 'decklist-sideboard' class.
    """
    classes = tag.get('class', [])
    return (
        tag.name == 'ul'
        and 'decklist-category-list' in classes
        and 'decklist-sideboard' not in classes
    )


def is_sideboard_list(tag):
    """Return True for the <ul> that carries the 'decklist-sideboard' class."""
    return tag.name == 'ul' and 'decklist-sideboard' in tag.get('class', [])


def parse_cards(containers):
    """Return ['<copies>', '<card name>'] pairs for every card link inside the given tags."""
    return [
        card.text.split(' ', 1)
        for container in containers
        for card in container.find_all('a', class_='decklist-card-link')
    ]


def parse_decklist(item, event_id):
    """Turn one decklist <section> into a record with player, maindeck and sideboard."""
    player = item.find('p', class_='decklist-player').text.split(' ')[0]

    # Either container may be absent (e.g. a deck without a sideboard); treat that as no cards
    columns = item.find('div', class_='decklist-category-columns')
    main = parse_cards(columns.find_all(is_maindeck_list)) if columns is not None else []

    sideboard_list = item.find(is_sideboard_list)
    if sideboard_list is not None:
        side = parse_cards(sideboard_list.find_all('li', class_='decklist-category-card'))
    else:
        side = []

    return {'event id': event_id, 'player': player, 'maindeck': main, 'sideboard': side}


def read_decklist_sections(browser):
    """Wait for the current page to render, then return its decklist <section> tags.

    If no decklists are found after the first wait, the page is re-read after further
    waits instead of being recorded as an empty event straight away.
    """
    sections = []
    for _ in range(1 + EMPTY_PAGE_RETRIES):
        time.sleep(PAGE_LOAD_SECONDS)
        page = BeautifulSoup(browser.html, 'html.parser')
        sections = page.find_all('section', class_='decklist')
        if sections:
            break
    return sections


decklists = []

# Links and event ids come from the league links dataframe previously scraped
events = league_links_df[['link', 'event id']]
if MAX_EVENTS is not None:
    events = events.head(MAX_EVENTS)

browser = Browser('chrome')
try:
    for url, event_id in zip(events['link'], events['event id']):
        browser.visit(url)
        decklist_items = read_decklist_sections(browser)

        # Process the HTML content and extract data
        event_decks = [parse_decklist(item, event_id) for item in decklist_items]
        decklists.extend(event_decks)
        deck_count = len(event_decks)

        # Status Update
        print(f"{deck_count} decks added for event {event_id}")
finally:
    # Close the browser even if a page fails part-way through the run
    browser.quit()
    
    

    

20 decks added for event 3
25 decks added for event 4
10 decks added for event 5
10 decks added for event 6
5 decks added for event 7
1 decks added for event 8
16 decks added for event 15
25 decks added for event 16
19 decks added for event 17
15 decks added for event 18
17 decks added for event 19
7 decks added for event 20
25 decks added for event 27
25 decks added for event 28
17 decks added for event 29
11 decks added for event 30
11 decks added for event 31
4 decks added for event 32
20 decks added for event 41
25 decks added for event 42
19 decks added for event 43
11 decks added for event 44
13 decks added for event 45
4 decks added for event 46
0 decks added for event 53
25 decks added for event 54
0 decks added for event 55
0 decks added for event 56
14 decks added for event 57
5 decks added for event 58
0 decks added for event 63
0 decks added for event 64
19 decks added for event 65
13 decks added for event 66
7 decks added for event 67
5 decks added for event 68
25 decks ad

In [8]:
# Set the results of the scrape to a dataframe; naming the columns keeps the frame
# usable by the later cells even when the scrape collected nothing
decklists_df = pd.DataFrame(decklists, columns=['event id', 'player', 'maindeck', 'sideboard'])

# Save the data as a CSV (the nested card lists are written as their string representation)
decklists_df.to_csv("test_decklists.csv", index=False)

# View the dataframe
decklists_df

Unnamed: 0,event id,player,maindeck,sideboard
0,3,istillhaveeczema,"[[4, Murktide Regent], [4, Orcish Bowmasters],...","[[2, Carpet of Flowers], [2, Dismember], [1, F..."
1,3,lostatsea,"[[4, Lotus Petal], [3, Archon of Cruelty], [2,...","[[4, Dauthi Voidwalker], [1, Faerie Macabre], ..."
2,3,handsomePPZ,"[[1, Leovold, Emissary of Trest], [4, Murktide...","[[2, Carpet of Flowers], [1, Collector Ouphe],..."
3,3,Ark4n,"[[4, Architects of Will], [4, Bloodbraid Marau...","[[1, Collector Ouphe], [2, Endurance], [2, Fae..."
4,3,Blue_Man7,"[[3, Brazen Borrower], [4, Elvish Spirit Guide...","[[2, Brotherhood's End], [3, Endurance], [4, F..."
...,...,...,...,...
2241,322,Jujkata,"[[1, Black Lotus], [1, Mox Jet], [1, Mox Sapph...","[[2, Cut Down], [1, Force of Negation], [4, Le..."
2242,322,Batz13,"[[1, Black Lotus], [1, Bolas's Citadel], [1, L...","[[1, Dismember], [2, Hurkyl's Recall], [4, Ley..."
2243,322,atlante,"[[1, Black Lotus], [1, Lotus Petal], [4, Mishr...","[[3, Containment Priest], [1, Fragmentize], [1..."
2244,322,PRINCEOFPERASIA,"[[1, Black Lotus], [1, Mana Crypt], [1, Mox Em...","[[1, Flusterstorm], [1, Force of Vigor], [4, L..."


#### For each event, find the total number of each card used

In [9]:
# Maindeck

# Flatten every maindeck into one record per card entry; each entry is a
# ['<copies>', '<card name>'] pair, so the count is converted to int here
maindeck_cards = [
    {'event id': event_id, 'copies': int(copies), 'card name': card_name}
    for event_id, maindeck in zip(decklists_df['event id'], decklists_df['maindeck'])
    for copies, card_name in maindeck
]

# Set up a dataframe with this information; naming the columns keeps the
# cleanup below working when there are no cards at all
maindeck_cards_df = pd.DataFrame(maindeck_cards, columns=['event id', 'copies', 'card name'])

# Some card name cleanup to interact better with the mtg sdk used later.
# The separator pattern also absorbs surrounding spaces and repeated slashes, so a
# name that already reads "A // B" is left as-is instead of gaining extra slashes.
maindeck_cards_df['card name'] = (
    maindeck_cards_df['card name']
    .str.replace(r'\s*/+\s*', ' // ', regex=True)
    .str.replace('"Name Sticker"', '_____', regex=False)
)

# Clean the data: Group by 'event id' and 'card name', and sum the 'copies'
maindeck_cards_clean_df = maindeck_cards_df.groupby(['event id', 'card name'], as_index=False)['copies'].sum()

# Save data to CSV, creating the output folder first so a fresh checkout does not fail
maindeck_csv = Path("data/maindeck_cards.csv")
maindeck_csv.parent.mkdir(parents=True, exist_ok=True)
maindeck_cards_clean_df.to_csv(maindeck_csv, index=False)
maindeck_cards_clean_df

Unnamed: 0,event id,card name,copies
0,3,Aether Spellbomb,2
1,3,Aether Vial,4
2,3,Aluren,4
3,3,Ancient Tomb,20
4,3,Animate Dead,14
...,...,...,...
22470,322,Vampiric Tutor,2
22471,322,Wasteland,8
22472,322,Watery Grave,1
22473,322,White Plume Adventurer,4


In [10]:
# Sideboard

# Flatten every sideboard into one record per card entry; each entry is a
# ['<copies>', '<card name>'] pair, so the count is converted to int here
sideboard_cards = [
    {'event id': event_id, 'copies': int(copies), 'card name': card_name}
    for event_id, sideboard in zip(decklists_df['event id'], decklists_df['sideboard'])
    for copies, card_name in sideboard
]

# Set up a dataframe with this information; naming the columns keeps the
# cleanup below working when there are no cards at all
sideboard_cards_df = pd.DataFrame(sideboard_cards, columns=['event id', 'copies', 'card name'])

# Some card name cleanup to interact better with the mtg sdk used later.
# The separator pattern also absorbs surrounding spaces and repeated slashes, so a
# name that already reads "A // B" is left as-is instead of gaining extra slashes.
sideboard_cards_df['card name'] = (
    sideboard_cards_df['card name']
    .str.replace(r'\s*/+\s*', ' // ', regex=True)
    .str.replace('"Name Sticker"', '_____', regex=False)
)

# Clean the data: Group by 'event id' and 'card name', and sum the 'copies'
sideboard_cards_clean_df = sideboard_cards_df.groupby(['event id', 'card name'], as_index=False)['copies'].sum()

# Save data to CSV, creating the output folder first so a fresh checkout does not fail
sideboard_csv = Path("data/sideboard_cards.csv")
sideboard_csv.parent.mkdir(parents=True, exist_ok=True)
sideboard_cards_clean_df.to_csv(sideboard_csv, index=False)
sideboard_cards_clean_df

Unnamed: 0,event id,card name,copies
0,3,Acererak the Archlich,1
1,3,Assassin's Trophy,2
2,3,Back to Basics,3
3,3,Blue Elemental Blast,2
4,3,"Boseiju, Who Endures",1
...,...,...,...
8936,322,Swords to Plowshares,6
8937,322,The Mightstone and Weakstone,1
8938,322,The Tabernacle at Pendrell Vale,4
8939,322,Toxic Deluge,1


See `MTG API Datapull.ipynb` for the next steps, where I use the MTG SDK API to pull specific metadata for the cards.