In [17]:
'''
Scrape Data from tables on many different pages from the Twilight Imperium Wiki.
Assemble into two data frames for Space and Ground forces. 
'''
import requests
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO
import os

# Additional dependencies:
# pip install lxml

In [18]:
def sort_abilities(abilities):
    """Split one unit's ability strings into three groups.

    Parameters
    ----------
    abilities : iterable of str
        Ability texts scraped for a single unit.

    Returns
    -------
    tuple of (bool, str, str)
        * True when any entry contains 'Sustain Damage' (those entries are
          not included in either of the returned strings),
        * the entries naming a standard ability, joined with ' & ',
        * every remaining entry, joined with ' & '.
        Either string is '' when its group is empty.
    """
    standard_keywords = ('Anti-Fighter Barrage', 'Space Cannon', 'Bombardment', 'Planetary Shield')
    sustain_found = False
    standard, other = [], []

    for text in abilities:
        # Sustain Damage takes precedence over every other classification.
        if 'Sustain Damage' in text:
            sustain_found = True
            continue
        bucket = standard if any(keyword in text for keyword in standard_keywords) else other
        bucket.append(text)

    return sustain_found, ' & '.join(standard), ' & '.join(other)

In [19]:
# FLAGSHIPS
# Scrape every flagship table from the wiki's Flagship page into flagships_df,
# one row per table.

URL = 'https://twilight-imperium.fandom.com/wiki/Flagship'

# Fetch the webpage & find tables.
# The timeout stops the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() surfaces an HTTP error instead of silently parsing an
# error page into an empty frame.
page = requests.get(URL, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')
tables = soup.find_all('table', class_="article-table")

all_flagships = []
for table in tables:
    # The faction name comes from the nearest preceding <h3> sibling.
    # NOTE(review): this raises AttributeError if a table has no such heading.
    faction_name = table.find_previous_sibling('h3').get_text(strip=True).strip('[]')
    rows = [
        row for row in table.find_all('tr')
        if any(cell.get_text(strip=True) for cell in row.find_all(['td', 'th']))
    ]   # Exclude empty table rows for Memooria II
    unit_name = rows[0].get_text(strip=True).rstrip(' -')    # rstrip cleans Memooria II

    # One ability per non-empty line; the 'Req.' label line (Memooria II) is dropped.
    stripped_lines = [line.strip() for line in rows[1].get_text().split('\n')]
    abilities = [line for line in stripped_lines if line and line != 'Req.']
    # The two ability strings are '' when the unit has none of that kind.
    has_sustain_damage, standard_abilities, unit_abilities = sort_abilities(abilities)

    # Third row: cost, combat, move and capacity, split on blank lines.
    row_data = rows[2].get_text().strip('\n').split('\n\n')
    cost = row_data[0]
    combat = row_data[1]
    combat_value = combat[0]   # first character of the combat text
    try:
        # e.g. '7 (x2)' -> '2'
        shots = combat.strip('^').split(' ')[1].strip('()').strip('x')
    except IndexError:
        # No '(xN)' part after the combat value: a single shot.
        shots = 1

    move = row_data[2]
    capacity = row_data[3]

    flagship_stats = {'Unit_Name': unit_name, 'Faction_Name': faction_name, 'Unit_Abilities': unit_abilities, 'Standard_Abilities': standard_abilities,
                       'Has_Sustain_Damage': has_sustain_damage, 'Cost': cost, 'Combat': combat, 'Combat_Value': combat_value, 'Shots': shots, 'Move': move, 
                       'Capacity': capacity, 'Unit_Type': 'Flagship'}
    all_flagships.append(flagship_stats)

flagships_df = pd.DataFrame(all_flagships)
flagships_df

Unnamed: 0,Unit_Name,Faction_Name,Unit_Abilities,Standard_Abilities,Has_Sustain_Damage,Cost,Combat,Combat_Value,Shots,Move,Capacity,Unit_Type
0,Duha Menaimon,The Arborec,"After you activate this system, you may produc...",,True,8,7 (x2),7,2,1,5,Flagship
1,Arc Secundus,The Barony of Letnev,Other players' units in this system lose PLANE...,Bombardment 5 (x3),True,8,5 (x2),5,2,1,3,Flagship
2,Son of Ragh,The Clan of Saar,,Anti-Fighter Barrage 6 (x4),True,8,5 (x2),5,2,1,3,Flagship
3,The Inferno,The Embers of Muaat,ACTION: Spend 1 token from your strategy pool ...,,True,8,5 (x2),5,2,1,3,Flagship
4,Wrath of Kenara,The Emirates of Hacan,After you roll a die during space combat in th...,,True,8,7 (x2),7,2,1,3,Flagship
5,Genesis,The Federation of Sol,"At the end of the status phase, place 1 infant...",,True,8,5 (x2),5,2,1,12,Flagship
6,Hil Colish,The Ghosts of Creuss,This ship's system contains a delta wormhole &...,,True,8,5,5,1,1,3,Flagship
7,[0.0.1],The L1Z1X Mindnet,"During a space combat, hits produced by this s...",,True,8,5 (x2),5,2,1,5,Flagship
8,Fourth Moon,The Mentak Coalition,Other players' ships in this system cannot use...,,True,8,7 (x2),7,2,1,3,Flagship
9,Matriarch,The Naalu Collective,"During an invasion in this system, you may com...",,True,8,9 (x2),9,2,1,6,Flagship


In [20]:
# OTHER SHIPS
# Scrape the unit tables from each non-flagship ship page into ships_df,
# one row per table.

ships_list = ['Carrier', 'Cruiser', 'Destroyer', 'Dreadnought', 'Fighter', 'War_Sun']
ships = []

for ship in ships_list:
    URL = f'https://twilight-imperium.fandom.com/wiki/{ship}'

    # Fetch the webpage.
    # The timeout stops the cell from hanging on a stalled connection, and
    # raise_for_status() surfaces HTTP errors instead of parsing an error page.
    page = requests.get(URL, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    tables = soup.find_all('table', class_="article-table")

    for table in tables:
        # Tables without a preceding <h3> sibling are treated as common units.
        previous_heading = table.find_previous_sibling('h3')
        if previous_heading:
            faction_name = previous_heading.get_text(strip=True).strip('[]')
        else:
            faction_name = 'Common Unit'

        unit_name = table.find('th').get_text(strip=True)
        rows = table.find_all('tr')

        # Remove the literal labels 'Upg.' (either end), 'Req.' (start only)
        # and 'Cost' (either end) from the ability text. str.strip('Upg.')
        # would strip a *set of characters* instead, which can also eat real
        # leading/trailing characters of the ability text (e.g. a final '.').
        ability_text = rows[1].get_text().strip('\n')
        for label, also_at_end in (('Upg.', True), ('Req.', False), ('Cost', True)):
            if ability_text.startswith(label):
                ability_text = ability_text[len(label):]
            if also_at_end and ability_text.endswith(label):
                ability_text = ability_text[:-len(label)]
        abilities = ability_text.strip().split('\n')
        has_sustain_damage, standard_abilities, unit_abilities = sort_abilities(abilities)

        # Re-parse the same table with pandas; the loop variable is left untouched.
        stats_table = pd.read_html(StringIO(str(table)))[0]

        # The stats are read from the second-to-last row, ignoring empty cells.
        row_data = stats_table.iloc[-2].dropna()
        cost = row_data.iloc[0]
        combat = row_data.iloc[1]
        combat_value = combat[0]   # first character of the combat text
        try:
            # e.g. '7 (x2)' -> '2'
            shots = combat.strip('^').split(' ')[1].strip('()').strip('x')
        except IndexError:
            # No '(xN)' part after the combat value: a single shot.
            shots = 1

        move = row_data.iloc[2]
        try:
            capacity = row_data.iloc[3]
        except IndexError:
            # No fourth value in the stats row: record a capacity of 0.
            capacity = 0

        ship_stats = {'Unit_Name': unit_name, 'Faction_Name': faction_name, 'Unit_Abilities': unit_abilities, 'Standard_Abilities': standard_abilities,
                      'Has_Sustain_Damage': has_sustain_damage, 'Cost': cost, 'Combat': combat, 'Combat_Value': combat_value, 'Shots': shots, 'Move': move, 
                      'Capacity': capacity, 'Unit_Type': ship}
        ships.append(ship_stats)

ships_df = pd.DataFrame(ships)

# Drop Unbuildable Base War_Sun
base_war_sun_index = ships_df[(ships_df['Unit_Name'] == 'War Sun') & (ships_df['Combat'] == '-^')].index
ships_df = ships_df.drop(index = base_war_sun_index).reset_index(drop = True)

ships_df

Unnamed: 0,Unit_Name,Faction_Name,Unit_Abilities,Standard_Abilities,Has_Sustain_Damage,Cost,Combat,Combat_Value,Shots,Move,Capacity,Unit_Type
0,Carrier,Common Unit,,,False,3,9,9,1,1^,4^,Carrier
1,Carrier II,Common Unit,,,False,3,9,9,1,2,6,Carrier
2,Advanced Carrier,The Federation of Sol,,,False,3,9,9,1,1^,6^,Carrier
3,Advanced Carrier II,The Federation of Sol,,,True,3,9,9,1,2,8,Carrier
4,Cruiser,Common Unit,,,False,2,7^,7,1,2^,-^,Cruiser
5,Cruiser II,Common Unit,,,False,2,6,6,1,3,1,Cruiser
6,Saturn Engine I,The Titans of Ul,,,False,2,7^,7,1,2^,1^,Cruiser
7,Saturn Engine II,The Titans of Ul,,,True,2,6,6,1,3,2,Cruiser
8,Destroyer,Common Unit,,Anti-Fighter Barrage 9 (x2),False,1,9^,9,1,2,0,Destroyer
9,Destroyer II,Common Unit,,Anti-Fighter Barrage 6 (x3),False,1,8,8,1,2,0,Destroyer


In [21]:
# GROUND FORCES
# Scrape the unit tables from the Infantry and Mechs pages into
# ground_forces_df, one row per table.

ground_forces_list = ['Infantry', 'Mechs']
ground_forces = []

for ground_force in ground_forces_list:
    URL = f'https://twilight-imperium.fandom.com/wiki/{ground_force}'

    # Fetch the webpage.
    # The timeout stops the cell from hanging on a stalled connection, and
    # raise_for_status() surfaces HTTP errors instead of parsing an error page.
    page = requests.get(URL, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    tables = soup.find_all('table', class_="article-table")

    for table in tables:
        # Tables without a preceding <h3> sibling are treated as common units.
        previous_heading = table.find_previous_sibling('h3')
        if previous_heading:
            faction_name = previous_heading.get_text(strip=True).strip('[]')
        else:
            faction_name = 'Common Unit'

        name = table.find('th').get_text(strip=True)
        rows = table.find_all('tr')

        # Remove the literal labels 'Upg.' (either end), 'Req.' (start only)
        # and 'Cost' (either end) from the ability text. str.strip('Upg.')
        # would strip a *set of characters* instead, which can also eat real
        # leading/trailing characters of the ability text (e.g. a final '.').
        ability_text = rows[1].get_text().strip('\n')
        for label, also_at_end in (('Upg.', True), ('Req.', False), ('Cost', True)):
            if ability_text.startswith(label):
                ability_text = ability_text[len(label):]
            if also_at_end and ability_text.endswith(label):
                ability_text = ability_text[:-len(label)]
        abilities = ability_text.strip().split('\n')
        has_sustain_damage, standard_abilities, unit_abilities = sort_abilities(abilities)

        # Re-parse the same table with pandas; the loop variable is left untouched.
        stats_table = pd.read_html(StringIO(str(table)))[0]

        # The sustain-damage flag from sort_abilities is replaced here by a
        # check on the parsed table's first row, second column.
        ability = stats_table.iloc[0, 1]
        try:
            has_sustain_damage = 'Sustain Damage' in ability
        except TypeError:
            # The cell is not a string (e.g. an empty cell parsed as NaN).
            has_sustain_damage = False

        # The stats are read from the second-to-last row, ignoring empty cells.
        row_data = stats_table.iloc[-2].dropna()
        cost = row_data.iloc[0]
        combat = row_data.iloc[1]
        combat_value = combat[0]   # first character of the combat text
        try:
            # e.g. '7 (x2)' -> '2'
            shots = combat.strip('^').split(' ')[1].strip('()').strip('x')
        except IndexError:
            # No '(xN)' part after the combat value: a single shot.
            shots = 1

        # Move and Capacity are left empty so the columns match the ship frames.
        ground_force_stats = {'Unit_Name': name, 'Faction_Name': faction_name, 'Unit_Abilities': unit_abilities, 'Standard_Abilities': standard_abilities, 
                              'Has_Sustain_Damage': has_sustain_damage, 'Cost': cost, 'Combat': combat, 'Combat_Value': combat_value, 'Shots': shots, 'Move': None,
                              'Capacity': None, 'Unit_Type': ground_force}
        ground_forces.append(ground_force_stats)

ground_forces_df = pd.DataFrame(ground_forces)
ground_forces_df

Unnamed: 0,Unit_Name,Faction_Name,Unit_Abilities,Standard_Abilities,Has_Sustain_Damage,Cost,Combat,Combat_Value,Shots,Move,Capacity,Unit_Type
0,Infantry,Common Unit,,,False,1 (x2),8^,8,1,,,Infantry
1,Infantry II,Common Unit,"After this unit is destroyed, roll 1 die. If t...",,False,1 (x2),7,7,1,,,Infantry
2,Spec Ops I,The Federation of Sol,,,False,1 (x2),7^,7,1,,,Infantry
3,Spec Ops II,The Federation of Sol,"After this unit is destroyed, roll 1 die. If t...",,False,1 (x2),6,6,1,,,Infantry
4,Letani Warrior I,The Arborec,Production 1,,False,1 (x2),8^,8,1,,,Infantry
5,Letani Warrior II,The Arborec,"After this unit is destroyed, roll 1 die. If t...",,False,1 (x2),7,7,1,,,Infantry
6,Crimson Legionnaire I,The Mahact Gene Sorcerers,"After this unit is destroyed, gain 1 commodity...",,False,1 (x2),8^,8,1,,,Infantry
7,Crimson Legionnaire II,The Mahact Gene Sorcerers,"After this unit is destroyed, gain 1 commodity...",,False,1 (x2),7,7,1,,,Infantry
8,Letani Behemoth,The Arborec,DEPLOY: When you use MITOSIS faction ability y...,Planetary Shield,True,2,6,6,1,,,Mechs
9,Aerie Sentinel,The Argent Flight,This unit does not count against capacity if i...,,True,2,6,6,1,,,Mechs


In [22]:
# Combine flagships and the other ships into one space-unit frame.
# No `on` argument is given, so the outer merge joins on the columns the two
# frames have in common.
space_units_df = pd.merge(flagships_df, ships_df, how='outer')
# Ground forces need no combining; expose them under the matching name.
ground_units_df = ground_forces_df

In [23]:
# Write both frames to CSV, creating the output folder first if needed.
output_dir = '../data/scraped'
# exist_ok=True replaces the separate exists() check, so there is no gap
# between testing for the folder and creating it.
os.makedirs(output_dir, exist_ok=True)

space_units_df.to_csv(f'{output_dir}/space_units_df.csv', index=False)
ground_units_df.to_csv(f'{output_dir}/ground_units_df.csv', index=False)