In [2]:
# import packages

import time
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup
from splinter import Browser

### MTGO Decklists Scraping

#### Find all Links to Decklists from Main MTGO Page

In [3]:
# Set up Splinter
browser = Browser('chrome')

# Month of MTGO results to scrape (currently January 2024).
# Change these two values to target a different month.
SCRAPE_YEAR = 2024
SCRAPE_MONTH = 1
url = f"https://www.mtgo.com/decklists/{SCRAPE_YEAR}/{SCRAPE_MONTH:02d}"

# Visit the index page and capture its HTML. If anything goes wrong, close
# the browser before re-raising so a failed run does not leave a stray
# Chrome window behind.
try:
    browser.visit(url)
    html = browser.html
except Exception:
    browser.quit()
    raise

# Parse the website
soup = BeautifulSoup(html, 'html.parser')

In [4]:
# Close the first browser session: the index page's HTML has already been
# captured in `html` and parsed into `soup`, so the live browser is no longer needed.
browser.quit()

In [6]:
# Get all decklist items from main page
decklist_items = soup.find_all('li', class_='decklists-item')

# Collect one record per event. The event id is the item's position on the
# page, so ids stay stable even when a malformed item is skipped.
decklist_links = []
for event_id, item in enumerate(decklist_items):
    heading = item.find('h3')
    anchor = item.find('a', class_='decklists-link')
    timestamp = item.find('time')

    # The code relies on the first two words of the title being
    # "<format> <event type>".
    event = heading.text.split(' ') if heading is not None else []
    link = anchor.get('href') if anchor is not None else None
    date = timestamp.get('datetime') if timestamp is not None else None

    # Skip (and report) items that lack any of the fields we need instead of
    # crashing the whole cell on a single malformed entry.
    if len(event) < 2 or link is None or date is None:
        print(f"Skipping decklist item {event_id}: missing title, link or date")
        continue

    decklist_links.append({
        'event id': event_id,
        'format': event[0],
        'event type': event[1],
        'link': 'https://www.mtgo.com' + link,
        'date': date,
    })

# Show the results of the scrape as a dataframe.
# Columns are given explicitly so the frame has the expected schema even when
# nothing was scraped (otherwise the filter below raises KeyError).
decklist_links_df = pd.DataFrame(
    decklist_links,
    columns=['event id', 'format', 'event type', 'link', 'date'],
)

# Filter to just league results
league_links_df = decklist_links_df[decklist_links_df['event type'] == 'League'].reset_index(drop=True)

# Reformat the date to a readable format while preserving the datetime dtype
# (drops the time-of-day component, keeps datetime64[ns]).
league_links_df['date'] = pd.to_datetime(league_links_df['date']).dt.date.astype('datetime64[ns]')

# Save data to CSV, creating the output folder first if it does not exist yet
Path("data").mkdir(parents=True, exist_ok=True)
league_links_df.to_csv("data/league_information.csv", index=False)

league_links_df


Unnamed: 0,event id,format,event type,link,date
0,1,Legacy,League,https://www.mtgo.com/decklist/legacy-league-20...,2024-01-24
1,2,Modern,League,https://www.mtgo.com/decklist/modern-league-20...,2024-01-24
2,3,Pauper,League,https://www.mtgo.com/decklist/pauper-league-20...,2024-01-24
3,4,Pioneer,League,https://www.mtgo.com/decklist/pioneer-league-2...,2024-01-24
4,5,Standard,League,https://www.mtgo.com/decklist/standard-league-...,2024-01-24
...,...,...,...,...,...
139,278,Modern,League,https://www.mtgo.com/decklist/modern-league-20...,2024-01-01
140,279,Pauper,League,https://www.mtgo.com/decklist/pauper-league-20...,2024-01-01
141,280,Pioneer,League,https://www.mtgo.com/decklist/pioneer-league-2...,2024-01-01
142,281,Standard,League,https://www.mtgo.com/decklist/standard-league-...,2024-01-01


In [38]:
# Inspect the column dtypes of the league table; 'date' should be datetime64[ns]
# after the conversion applied when the table was built.
league_links_df.dtypes

event id               int64
format                object
event type            object
link                  object
date          datetime64[ns]
dtype: object

#### For each Link, visit and scrape the decklists

In [7]:
# WARNING ------------------------
# Executing this cell will take nearly an hour to fully collect the data if MAX_EVENTS is set to None.
# Currently getting the first MAX_EVENTS (10) league events worth of 5-0 decklists. This runs in a few minutes.

MAX_EVENTS = 10          # stop after this many events; set to None to scrape every event
PAGE_LOAD_SECONDS = 10   # pause after each visit because the decklist pages are slow to load


def _is_maindeck_list(tag):
    """Return True for a <ul> card list that is not the sideboard list.

    Implemented as a tag-level filter (rather than a `class_=` function)
    because a `class_` function is tested against each CSS class on its own,
    so it cannot exclude a tag based on a *second* class it carries.
    """
    classes = tag.get('class', [])
    return (
        tag.name == 'ul'
        and 'decklist-category-list' in classes
        and 'decklist-sideboard' not in classes
    )


def _is_sideboard_list(tag):
    """Return True for the <ul> that carries the 'decklist-sideboard' class."""
    return tag.name == 'ul' and 'decklist-sideboard' in tag.get('class', [])


def _extract_cards(containers):
    """Return a [copies, card name] pair for every card link inside `containers`.

    Each link's text is split once on the first space, so the first element
    is the text before that space and the second is the remainder.
    """
    return [
        card.text.split(' ', 1)
        for container in containers
        for card in container.find_all('a', class_='decklist-card-link')
    ]


decklists = []
browser = Browser('chrome')

# try/finally guarantees the browser is closed even if a page fails to load
# or parse part-way through the run.
try:
    for events_scraped, (index, row) in enumerate(league_links_df.iterrows(), start=1):

        # Grab the link and event_id from the league links dataframe previously scraped
        url = row['link']
        event_id = row['event id']

        # Visit the URL, then wait PAGE_LOAD_SECONDS as the pages are slow to load
        browser.visit(url)
        time.sleep(PAGE_LOAD_SECONDS)

        # Parse the page. Distinct names are used so the index-page `soup` and
        # `decklist_items` from the earlier cells are not overwritten.
        event_soup = BeautifulSoup(browser.html, 'html.parser')

        # Process the HTML content and extract data
        deck_sections = event_soup.find_all('section', class_='decklist')
        deck_count = 0
        for item in deck_sections:
            player = item.find('p', class_='decklist-player').text.split(' ')[0]

            # Maindeck: every non-sideboard card list inside the columns container
            columns = item.find('div', class_='decklist-category-columns')
            maindeck_lists = columns.find_all(_is_maindeck_list) if columns is not None else []
            main = _extract_cards(maindeck_lists)

            # Sideboard: a deck may have none, in which case record an empty list
            sideboard_list = item.find(_is_sideboard_list)
            sideboard_rows = (
                sideboard_list.find_all('li', class_='decklist-category-card')
                if sideboard_list is not None
                else []
            )
            side = _extract_cards(sideboard_rows)

            decklists.append({'event id': event_id, 'player': player, 'maindeck': main, 'sideboard': side})
            deck_count += 1

        # Status Update
        print(f"{deck_count} decks added for event {event_id}")

        # This is for testing purposes - stops after fetching MAX_EVENTS events
        if MAX_EVENTS is not None and events_scraped == MAX_EVENTS:
            print(f"Terminating the loop at count {MAX_EVENTS}")
            break
finally:
    # Close the browser after all data is collected (or on failure)
    browser.quit()
    
    

    

4 decks added for event 1
16 decks added for event 2
5 decks added for event 3
2 decks added for event 4
1 decks added for event 5
1 decks added for event 6
17 decks added for event 13
25 decks added for event 14
22 decks added for event 15
8 decks added for event 16
Terminating the loop at count 10


In [8]:
# Collect the scraped deck records into a single dataframe
decklists_df = pd.DataFrame(data=decklists)

# Persist a copy to disk as CSV, leaving out the row index
decklists_df.to_csv(path_or_buf="test_decklists.csv", index=False)

# Show the dataframe as the cell's output
decklists_df

Unnamed: 0,event id,player,maindeck,sideboard
0,1,avin,"[[3, Mishra's Bauble], [4, Delver of Secrets],...","[[1, Counterbalance], [2, Force of Negation], ..."
1,1,Ark4n,"[[2, The One Ring], [3, Birds of Paradise], [1...","[[2, Back to Basics], [1, Blue Elemental Blast..."
2,1,pirol94,"[[3, Brazen Borrower], [4, Elvish Spirit Guide...","[[2, Brotherhood's End], [3, Endurance], [4, F..."
3,1,learntolove6,"[[4, Delver of Secrets], [4, Grief], [2, Murkt...","[[2, Blue Elemental Blast], [1, Brazen Borrowe..."
4,2,distanthamster,"[[4, Amulet of Vigor], [1, Expedition Map], [4...","[[1, Bojuka Bog], [1, Boseiju, Who Endures], [..."
...,...,...,...,...
96,16,maxxattack,"[[2, Dreadhorde Arcanist], [4, Favored Hoplite...","[[2, Flowstone Infusion], [2, Get Lost], [1, J..."
97,16,tchuco,"[[1, Reckoner Bankbuster], [3, Smuggler's Copt...","[[2, Aclazotz, Deepest Betrayal], [1, Damping ..."
98,16,TheOutlawinLa,"[[4, Elvish Mystic], [1, Glissa Sunslayer], [1...","[[2, Bitter Triumph], [3, Cankerbloom], [2, Fa..."
99,16,hugofreitas1,"[[4, Arclight Phoenix], [4, Ledger Shredder], ...","[[1, Abrade], [2, Brazen Borrower], [2, Brothe..."


#### For each event, Find the total number of each card used

In [13]:
# Maindeck

# Flatten every maindeck into one record per (event, card entry).
# Each entry in a 'maindeck' list is a [copies, card name] pair.
maindeck_cards = [
    {'event id': event_id, 'copies': int(card[0]), 'card name': card[1]}
    for event_id, maindeck in zip(decklists_df['event id'], decklists_df['maindeck'])
    for card in maindeck
]

# Set up a dataframe with this information.
# Columns are given explicitly so the cleanup below also works when no cards were scraped.
maindeck_cards_df = pd.DataFrame(maindeck_cards, columns=['event id', 'copies', 'card name'])

# Some card name cleanup to interact better with the mtg sdk used later.
# Any slash separator (with or without surrounding spaces, single or doubled)
# is normalised to ' // ', so names that already use ' // ' are not doubled up.
maindeck_cards_df['card name'] = maindeck_cards_df['card name'].str.replace(r'\s*/+\s*', ' // ', regex=True)
maindeck_cards_df['card name'] = maindeck_cards_df['card name'].str.replace('"Name Sticker"', '_____', regex=False)

# Clean the data: Group by 'event id' and 'card name', and sum the 'copies'
maindeck_cards_clean_df = maindeck_cards_df.groupby(['event id', 'card name'], as_index=False)['copies'].sum()

# Save data to CSV, creating the output folder first if it does not exist yet
Path("data").mkdir(parents=True, exist_ok=True)
maindeck_cards_clean_df.to_csv("data/maindeck_cards.csv", index=False)
maindeck_cards_clean_df

Unnamed: 0,event id,card name,copies
0,1,Birds of Paradise,3
1,1,Brainstorm,12
2,1,Brazen Borrower,3
3,1,Collector Ouphe,1
4,1,Crashing Footfalls,3
...,...,...,...
1110,16,Wildgrowth Walker,4
1111,16,Woe Strider,4
1112,16,Zagoth Triome,3
1113,16,Ziatora's Proving Ground,1


In [14]:
# Sideboard

# Flatten every sideboard into one record per (event, card entry).
# Each entry in a 'sideboard' list is a [copies, card name] pair.
sideboard_cards = [
    {'event id': event_id, 'copies': int(card[0]), 'card name': card[1]}
    for event_id, sideboard in zip(decklists_df['event id'], decklists_df['sideboard'])
    for card in sideboard
]

# Set up a dataframe with this information.
# Columns are given explicitly so the cleanup below also works when no cards were scraped.
sideboard_cards_df = pd.DataFrame(sideboard_cards, columns=['event id', 'copies', 'card name'])

# Some card name cleanup to interact better with the mtg sdk used later.
# Any slash separator (with or without surrounding spaces, single or doubled)
# is normalised to ' // ', so names that already use ' // ' are not doubled up.
sideboard_cards_df['card name'] = sideboard_cards_df['card name'].str.replace(r'\s*/+\s*', ' // ', regex=True)
sideboard_cards_df['card name'] = sideboard_cards_df['card name'].str.replace('"Name Sticker"', '_____', regex=False)

# Clean the data: Group by 'event id' and 'card name', and sum the 'copies'
sideboard_cards_clean_df = sideboard_cards_df.groupby(['event id', 'card name'], as_index=False)['copies'].sum()

# Save data to CSV, creating the output folder first if it does not exist yet
Path("data").mkdir(parents=True, exist_ok=True)
sideboard_cards_clean_df.to_csv("data/sideboard_cards.csv", index=False)
sideboard_cards_clean_df

Unnamed: 0,event id,card name,copies
0,1,Back to Basics,2
1,1,Blue Elemental Blast,3
2,1,Brazen Borrower,1
3,1,Brotherhood's End,2
4,1,Chill,2
...,...,...,...
416,16,Thoughtseize,2
417,16,Unlicensed Hearse,2
418,16,Unmoored Ego,3
419,16,Voice of Resurgence,2


Next, I should be able to export all of these tables and set up a Postgres database, using the event id as the key that connects them: it is the primary key of the league information table and a foreign key in the decklist and card tables.