In [259]:
'''
Scrape unit data from tables on several pages of the Twilight Imperium Wiki.
Assemble it into two data frames: one for space units and one for ground forces.
'''
import requests
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO
import os

In [260]:
def sort_abilities(abilities):
    '''
    Split a unit's ability strings into three groups.

    Parameters
    ----------
    abilities : iterable of str
        Ability text fragments, one ability per entry. Entries that are empty
        or whitespace-only are ignored; surrounding whitespace is trimmed.

    Returns
    -------
    tuple of (bool, str, str)
        has_sustain_damage : True if any entry contains 'Sustain Damage'.
        standard_abilities : entries containing 'Anti-Fighter Barrage',
            'Space Cannon', 'Bombardment' or 'Planetary Shield', joined with ' & '.
        unit_abilities : every remaining entry, joined with ' & '.
        Either string is '' when its group is empty.
    '''
    standard_keywords = ('Anti-Fighter Barrage', 'Space Cannon', 'Bombardment', 'Planetary Shield')

    has_sustain_damage = False
    standard_abilities = []
    unit_abilities = []

    for ability in abilities:
        ability = ability.strip()
        if not ability:
            # Blank fragments (e.g. from consecutive newlines in the split text)
            # would otherwise be joined in as stray ' & ' separators.
            continue

        if 'Sustain Damage' in ability:
            # Sustain Damage is reported only as a flag, not kept in either list.
            has_sustain_damage = True
        elif any(keyword in ability for keyword in standard_keywords):
            standard_abilities.append(ability)
        else:
            unit_abilities.append(ability)

    return has_sustain_damage, ' & '.join(standard_abilities), ' & '.join(unit_abilities)

In [261]:
# FLAGSHIPS
# Builds flagships_df: one row per 'article-table' found on the Flagship wiki page.

URL = 'https://twilight-imperium.fandom.com/wiki/Flagship'

# Fetch the webpage & find tables.
# The timeout keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() stops on an HTTP error instead of silently parsing an error page.
page = requests.get(URL, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')
tables = soup.find_all('table', class_="article-table")

all_flagships = []
for table in tables:
    # The faction name is taken from the nearest preceding <h3> sibling. If there is
    # none, fall back to 'Common Unit' (same fallback the other unit cells use)
    # rather than failing on None.
    previous_heading = table.find_previous_sibling('h3')
    if previous_heading:
        faction_name = previous_heading.get_text(strip=True).strip('[]')
    else:
        faction_name = 'Common Unit'

    rows = table.find_all('tr')
    unit_name = rows[0].get_text(strip=True).rstrip(' -')    # rstrip cleans Memoria II

    # Second table row: one ability per line.
    abilities = rows[1].get_text().strip('\n').split('\n')
    # standard_abilities and unit_abilities are '' when the unit has none
    has_sustain_damage, standard_abilities, unit_abilities = sort_abilities(abilities)

    # Third table row: the stat values, split on blank lines in the extracted text.
    row_data = rows[2].get_text().strip('\n').split('\n\n')
    cost = row_data[0]
    combat = row_data[1]
    # NOTE(review): only the first character is kept, so a multi-digit combat value
    # would be truncated -- confirm none occur on the page.
    combat_value = combat[0]
    try:
        # For a combat string shaped like '7 (x2)' this yields '2'.
        shots = combat.strip('^').split(' ')[1].strip('()').strip('x')
    except IndexError:
        # No second space-separated token: default to a single shot.
        # (This default is an int, whereas parsed values are strings.)
        shots = 1

    move = row_data[2]
    capacity = row_data[3]

    flagship_stats = {'Unit_Name': unit_name, 'Faction_Name': faction_name, 'Unit_Abilities': unit_abilities, 'Standard_Abilities': standard_abilities,
                       'Has_Sustain_Damage': has_sustain_damage, 'Cost': cost, 'Combat': combat, 'Combat_Value': combat_value, 'Shots': shots, 'Move': move,
                       'Capacity': capacity, 'Unit_Type': 'Flagship'}
    all_flagships.append(flagship_stats)

flagships_df = pd.DataFrame(all_flagships)
# flagships_df

In [262]:
# OTHER SHIPS
# Builds ships_df: one row per 'article-table' found on each ship type's wiki page.

ships_list = ['Carrier', 'Cruiser', 'Destroyer', 'Dreadnought', 'Fighter', 'War_Sun']
ships = []

for ship in ships_list:
    URL = f'https://twilight-imperium.fandom.com/wiki/{ship}'

    # Fetch the webpage.
    # The timeout keeps the cell from hanging indefinitely on a stalled connection, and
    # raise_for_status() stops on an HTTP error instead of silently parsing an error page.
    page = requests.get(URL, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    tables = soup.find_all('table', class_="article-table")

    for table in tables:
        # Tables without a preceding <h3> sibling are treated as the common (non-faction) unit.
        previous_heading = table.find_previous_sibling('h3')
        if previous_heading:
            faction_name = previous_heading.get_text(strip=True).strip('[]')
        else:
            faction_name = 'Common Unit'

        unit_name = table.find('th').get_text(strip=True)
        rows = table.find_all('tr')

        # Remove the 'Upg.' / 'Req.' / 'Cost' labels from the edges of the second row's text.
        # They are removed as whole tokens: str.strip('Cost') would instead strip any of the
        # characters C, o, s, t and could eat letters of the ability text itself.
        # 'Req.' is only removed from the start; the other two from both ends.
        abilities_text = rows[1].get_text().strip('\n')
        for label, also_at_end in (('Upg.', True), ('Req.', False), ('Cost', True)):
            if abilities_text.startswith(label):
                abilities_text = abilities_text[len(label):]
            if also_at_end and abilities_text.endswith(label):
                abilities_text = abilities_text[:-len(label)]
        abilities = abilities_text.strip().split('\n')
        has_sustain_damage, standard_abilities, unit_abilities = sort_abilities(abilities)

        # Re-parse the same table with pandas; the stat values are read from its
        # second-to-last row, ignoring empty cells.
        stats_table = pd.read_html(StringIO(str(table)))[0]
        row_data = stats_table.iloc[-2].dropna()

        cost = row_data.iloc[0]
        combat = row_data.iloc[1]
        # NOTE(review): only the first character is kept, so a multi-digit combat value
        # would be truncated -- confirm none occur on these pages.
        combat_value = combat[0]
        try:
            # For a combat string shaped like '7 (x2)' this yields '2'.
            shots = combat.strip('^').split(' ')[1].strip('()').strip('x')
        except IndexError:
            # No second space-separated token: default to a single shot.
            shots = 1

        move = row_data.iloc[2]
        try:
            capacity = row_data.iloc[3]
        except IndexError:
            # The row has no fourth non-empty cell.
            capacity = 0

        ship_stats = {'Unit_Name': unit_name, 'Faction_Name': faction_name, 'Unit_Abilities': unit_abilities, 'Standard_Abilities': standard_abilities,
                      'Has_Sustain_Damage': has_sustain_damage, 'Cost': cost, 'Combat': combat, 'Combat_Value': combat_value, 'Shots': shots, 'Move': move,
                      'Capacity': capacity, 'Unit_Type': ship}
        ships.append(ship_stats)

ships_df = pd.DataFrame(ships)
# NOTE(review): the row is dropped by its positional index (22), which depends on the
# order and number of tables currently on the wiki pages -- re-check if the pages change.
ships_df = ships_df.drop(index=22).reset_index(drop=True)   # Drop Unbuildable War_Sun
#ships_df

In [263]:
# GROUND FORCES
# Builds ground_forces_df: one row per 'article-table' found on the Infantry and Mechs wiki pages.

ground_forces_list = ['Infantry', 'Mechs']
ground_forces = []

for ground_force in ground_forces_list:
    URL = f'https://twilight-imperium.fandom.com/wiki/{ground_force}'

    # Fetch the webpage.
    # The timeout keeps the cell from hanging indefinitely on a stalled connection, and
    # raise_for_status() stops on an HTTP error instead of silently parsing an error page.
    page = requests.get(URL, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    tables = soup.find_all('table', class_="article-table")

    for table in tables:
        # Tables without a preceding <h3> sibling are treated as the common (non-faction) unit.
        previous_heading = table.find_previous_sibling('h3')
        if previous_heading:
            faction_name = previous_heading.get_text(strip=True).strip('[]')
        else:
            faction_name = 'Common Unit'

        name = table.find('th').get_text(strip=True)
        rows = table.find_all('tr')

        # Remove the 'Upg.' / 'Req.' / 'Cost' labels from the edges of the second row's text.
        # They are removed as whole tokens: str.strip('Cost') would instead strip any of the
        # characters C, o, s, t and could eat letters of the ability text itself.
        # 'Req.' is only removed from the start; the other two from both ends.
        abilities_text = rows[1].get_text().strip('\n')
        for label, also_at_end in (('Upg.', True), ('Req.', False), ('Cost', True)):
            if abilities_text.startswith(label):
                abilities_text = abilities_text[len(label):]
            if also_at_end and abilities_text.endswith(label):
                abilities_text = abilities_text[:-len(label)]
        abilities = abilities_text.strip().split('\n')

        # The sustain flag returned here is not used: for ground forces it is derived
        # from the pandas-parsed table below.
        _ignored_sustain, standard_abilities, unit_abilities = sort_abilities(abilities)

        # Re-parse the same table with pandas.
        stats_table = pd.read_html(StringIO(str(table)))[0]

        # Sustain Damage is looked up in the first row, second column of the parsed table.
        # A non-string cell (e.g. NaN for an empty cell) counts as "no Sustain Damage".
        ability = stats_table.iloc[0, 1]
        has_sustain_damage = isinstance(ability, str) and 'Sustain Damage' in ability

        # Stat values are read from the second-to-last row, ignoring empty cells.
        row_data = stats_table.iloc[-2].dropna()
        cost = row_data.iloc[0]
        combat = row_data.iloc[1]
        # NOTE(review): only the first character is kept, so a multi-digit combat value
        # would be truncated -- confirm none occur on these pages.
        combat_value = combat[0]
        try:
            # For a combat string shaped like '7 (x2)' this yields '2'.
            shots = combat.strip('^').split(' ')[1].strip('()').strip('x')
        except IndexError:
            # No second space-separated token: default to a single shot.
            shots = 1

        ground_force_stats = {'Unit_Name': name, 'Faction_Name': faction_name, 'Unit_Abilities': unit_abilities, 'Standard_Abilities': standard_abilities,
                              'Has_Sustain_Damage': has_sustain_damage, 'Cost': cost, 'Combat': combat, 'Combat_Value': combat_value, 'Shots': shots, 'Unit_Type': ground_force}
        ground_forces.append(ground_force_stats)

ground_forces_df = pd.DataFrame(ground_forces)
# ground_forces_df

In [264]:
# Ground units need no combining: this is just a second name for ground_forces_df.
ground_units_df = ground_forces_df

# Space units: flagships combined with all other ships via an outer merge.
# No join keys are passed, so pandas picks them itself.
space_units_df = flagships_df.merge(right=ships_df, how='outer')

In [265]:
# Write both frames as CSV files (without the index column) into ../Data.
output_dir = '../Data'

# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

space_units_df.to_csv(f'{output_dir}/space_units_df.csv', index=False)
ground_units_df.to_csv(f'{output_dir}/ground_units_df.csv', index=False)