In [235]:
'''
Scrape Data from tables on many different pages from the Twilight Imperium Wiki.
Assemble into two data frames for Space and Ground forces. 
'''
import os
import re
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [236]:
def sort_abilities(abilities):
    '''
    Split a unit's raw ability strings into three groups.

    Parameters
    ----------
    abilities : iterable of str
        Ability text fragments, e.g. the lines of a scraped table cell.

    Returns
    -------
    tuple (has_sustain_damage, standard_abilities, unit_abilities)
        has_sustain_damage : bool
            True if any entry contains 'Sustain Damage'. Such entries are
            not included in either of the strings below.
        standard_abilities : str
            Entries containing 'Anti-Fighter Barrage', 'Space Cannon' or
            'Bombardment', joined with ' & '.
        unit_abilities : str
            Every other non-blank entry, joined with ' & '.
        Both strings are '' when their group is empty.
    '''
    standard_keywords = ('Anti-Fighter Barrage', 'Space Cannon', 'Bombardment')
    has_sustain_damage = False
    standard_abilities = []
    unit_abilities = []

    for ability in abilities:
        # Splitting cell text on newlines can leave empty or whitespace-only
        # fragments; skipping them avoids dangling ' & ' separators.
        if not ability.strip():
            continue
        if 'Sustain Damage' in ability:
            has_sustain_damage = True
        elif any(keyword in ability for keyword in standard_keywords):
            standard_abilities.append(ability)
        else:
            unit_abilities.append(ability)

    return has_sustain_damage, ' & '.join(standard_abilities), ' & '.join(unit_abilities)

In [237]:
# FLAGSHIPS
# Scrape every unit table on the wiki's Flagship page into flagships_df.

URL = 'https://twilight-imperium.fandom.com/wiki/Flagship'

# Fetch the webpage & find tables
page = requests.get(URL, timeout=30)    # bound the wait so a stalled connection cannot hang the notebook
page.raise_for_status()                 # stop on an HTTP error instead of parsing the error page
soup = BeautifulSoup(page.content, 'html.parser')
tables = soup.find_all('table', class_="article-table")

all_flagships = []
for table in tables:
    # Faction name is read from the closest preceding <h3> sibling of the table
    faction_name = table.find_previous_sibling('h3').get_text(strip=True).strip('[]')
    rows = table.find_all('tr')
    unit_name = rows[0].get_text(strip=True).rstrip(' -')    # rstrip cleans Memoria II
    abilities = rows[1].get_text().strip('\n').split('\n')
    # standard and unit abilities come back as '' when there are none
    has_sustain_damage, standard_abilities, unit_abilities = sort_abilities(abilities)

    # Third row holds the stats, separated by blank lines: cost, combat, move, capacity
    row_data = rows[2].get_text().strip('\n').split('\n\n')
    cost = row_data[0]
    combat = row_data[1]
    combat_value = combat[0]    # first character only, so this assumes a single-digit combat value
    try:
        # Shots = the text after the first space, with parentheses and 'x' removed
        shots = combat.strip('^').split(' ')[1].strip('()').strip('x')
    except IndexError:
        shots = 1               # nothing after the combat value

    move = row_data[2]
    capacity = row_data[3]

    flagship_stats = {'Unit_Name': unit_name, 'Faction_Name': faction_name, 'Unit_Abilities': unit_abilities, 'Standard_Abilities': standard_abilities,
                       'Has_Sustain_Damage': has_sustain_damage, 'Cost': cost, 'Combat': combat, 'Combat_Value': combat_value, 'Shots': shots, 'Move': move, 
                       'Capacity': capacity, 'Unit_Type': 'Flagship'}
    all_flagships.append(flagship_stats)

flagships_df = pd.DataFrame(all_flagships)

In [238]:
# OTHER SHIPS
# Scrape each non-flagship ship page into one record per unit table.
ships_list = ['Carrier', 'Cruiser', 'Destroyer', 'Dreadnought', 'Fighter', 'War_Sun']
ships = []

# Label text removed from the ends of the abilities row: (label, at_start, at_end)
ABILITY_ROW_LABELS = (('Upg.', True, True), ('Req.', True, False), ('Cost', True, True))

for ship in ships_list:
    URL = f'https://twilight-imperium.fandom.com/wiki/{ship}'

    # Fetch the webpage
    page = requests.get(URL, timeout=30)    # bound the wait so a stalled connection cannot hang the notebook
    page.raise_for_status()                 # stop on an HTTP error instead of parsing the error page
    soup = BeautifulSoup(page.content, 'html.parser')
    tables = soup.find_all('table', class_="article-table")

    for table in tables:
        # Tables without a preceding <h3> sibling are treated as common units
        previous_heading = table.find_previous_sibling('h3')
        if previous_heading:
            faction_name = previous_heading.get_text(strip=True).strip('[]')
        else:
            faction_name = 'Common Unit'

        unit_name = table.find('th').get_text(strip=True)
        rows = table.find_all('tr')

        # Remove each label as a whole prefix/suffix. str.strip('Upg.') would
        # instead remove any of the characters 'U', 'p', 'g', '.' and so clip
        # ability text that merely starts or ends with one of them.
        ability_text = rows[1].get_text().strip('\n')
        for label, at_start, at_end in ABILITY_ROW_LABELS:
            if at_start and ability_text.startswith(label):
                ability_text = ability_text[len(label):]
            if at_end and ability_text.endswith(label):
                ability_text = ability_text[:-len(label)]
        abilities = ability_text.strip().split('\n')
        has_sustain_damage, standard_abilities, unit_abilities = sort_abilities(abilities)

        # Parse the same table with pandas; the stats are read from its second-to-last row
        stats_table = pd.read_html(StringIO(str(table)))[0]

        row_data = stats_table.iloc[-2].dropna()
        cost = row_data.iloc[0]
        combat = row_data.iloc[1]
        combat_value = combat[0]    # first character only, so this assumes a single-digit combat value
        try:
            # Shots = the text after the first space, with parentheses and 'x' removed
            shots = combat.strip('^').split(' ')[1].strip('()').strip('x')
        except IndexError:
            shots = 1               # nothing after the combat value

        move = row_data.iloc[2]
        try:
            capacity = row_data.iloc[3]
        except IndexError:
            capacity = 0            # stats row has no fourth value

        ship_stats = {'Unit_Name': unit_name, 'Faction_Name': faction_name, 'Unit_Abilities': unit_abilities, 'Standard_Abilities': standard_abilities, 
                    'Has_Sustain_Damage': has_sustain_damage, 'Cost': cost, 'Combat': combat, 'Combat_Value': combat_value, 'Shots': shots, 'Move': move, 
                        'Capacity': capacity, 'Unit_Type': ship}
        ships.append(ship_stats)

ships_df = pd.DataFrame(ships)
# NOTE(review): this drops by row position, so it silently removes a different
# unit if the scrape order or table count ever changes — confirm after re-scraping.
ships_df = ships_df.drop(index=22).reset_index(drop=True)   # Drop Unbuildable War_Sun

In [239]:
# GROUND FORCES
# Scrape the Infantry and Mechs pages into one record per unit table.

ground_forces_list = ['Infantry', 'Mechs']
ground_forces = []

for ground_force in ground_forces_list:
    URL = f'https://twilight-imperium.fandom.com/wiki/{ground_force}'

    # Fetch the webpage
    page = requests.get(URL, timeout=30)    # bound the wait so a stalled connection cannot hang the notebook
    page.raise_for_status()                 # stop on an HTTP error instead of parsing the error page
    soup = BeautifulSoup(page.content, 'html.parser')

    # TODO: jumping straight to the tables means the faction name of
    # faction-specific units is currently not captured.
    tables = soup.find_all('table', class_="article-table")

    for table in tables:
        name = table.find('th').get_text(strip=True)
        unit_table = pd.read_html(StringIO(str(table)))[0]

        # Ability text is taken from the first row, second column
        ability = unit_table.iloc[0, 1]
        # A missing ability is not a string, so only search real text
        has_sustain_damage = isinstance(ability, str) and 'Sustain Damage' in ability

        # Stats are read from the second-to-last row of the table
        row_data = unit_table.iloc[-2].dropna()
        cost = row_data.iloc[0]
        combat = row_data.iloc[1]
        # Combat value = the leading number. Chained character stripping also
        # removed legitimate digits (a combat of '2' became '').
        combat_text = combat.strip('^').strip()
        leading_number = re.match(r'\d+', combat_text)
        combat_value = leading_number.group() if leading_number else combat_text
        try:
            # Shots = the text after the first space, with parentheses and 'x' removed
            shots = combat.strip('^').split(' ')[1].strip('()').strip('x')
        except IndexError:
            shots = 1               # nothing after the combat value

        ground_force_stats = {'Unit_Name': name, 'Cost': cost, 'Combat': combat, 'Combat_Value': combat_value, 'Shots': shots,
                              'Ability': ability, 'Has_Sustain_Damage': has_sustain_damage, 'Unit_Type': ground_force}
        ground_forces.append(ground_force_stats)

ground_forces_df = pd.DataFrame(ground_forces)

In [240]:
# Combine flagships and the other ships into one space-unit frame. With no
# `on=` given, the outer merge joins on the columns the two frames share.
space_units_df = pd.merge(flagships_df, ships_df, how='outer')
# Ground units need no combining, so the scraped frame is reused as-is.
ground_units_df = ground_forces_df

In [241]:
# Preview the assembled ground-unit table.
ground_units_df

Unnamed: 0,Unit_Name,Cost,Combat,Combat_Value,Shots,Ability,Has_Sustain_Damage,Unit_Type
0,Infantry,1 (x2),8^,8,1,,False,Infantry
1,Infantry II,1 (x2),7,7,1,"After this unit is destroyed, roll 1 die. If t...",False,Infantry
2,Spec Ops I,1 (x2),7^,7,1,,False,Infantry
3,Spec Ops II,1 (x2),6,6,1,"After this unit is destroyed, roll 1 die. If t...",False,Infantry
4,Letani Warrior I,1 (x2),8^,8,1,Production 1,False,Infantry
5,Letani Warrior II,1 (x2),7,7,1,"After this unit is destroyed, roll 1 die. If t...",False,Infantry
6,Crimson Legionnaire I,1 (x2),8^,8,1,"After this unit is destroyed, gain 1 commodity...",False,Infantry
7,Crimson Legionnaire II,1 (x2),7,7,1,"After this unit is destroyed, gain 1 commodity...",False,Infantry
8,Letani Behemoth,2,6,6,1,DEPLOY: When you use MITOSIS faction ability y...,True,Mechs
9,Aerie Sentinel,2,6,6,1,This unit does not count against capacity if i...,True,Mechs


In [242]:
# Write both unit tables to CSV, creating the output folder first if needed.
output_dir = '../Data'
os.makedirs(output_dir, exist_ok=True)   # no separate exists() check needed; safe on re-runs

space_units_df.to_csv(f'{output_dir}/space_units_df.csv', index=False)
ground_units_df.to_csv(f'{output_dir}/ground_units_df.csv', index=False)

In [243]:
# Scratch check (not part of the scrape): a list wrapped in another list.
test = [['apples']]
test

[['apples']]