In [1]:
from bs4 import BeautifulSoup as bs
from splinter import Browser
import time
import datetime
import pandas as pd
import lxml

In [87]:
# utility functions 

# gets alt tags for images that are otherwise given in table as NaN, for levels and country
def getAlt(x):
    """Copy each image's ``alt`` attribute into the image's ``string``.

    Every ``img`` found under ``x`` that has a truthy ``alt`` attribute gets
    that value assigned to its ``string``; images without one are left as-is.
    The element is modified in place and nothing is returned.
    """
    for image_tag in x.find_all('img'):
        alt_text = image_tag.get("alt")
        if not alt_text:
            # no usable alt text on this image; leave it untouched
            continue
        image_tag.string = alt_text
    
def getLinkData(x):
    """Run ``getAlt`` on every ``a`` element found under ``x``.

    Nothing is returned; any changes are made in place by ``getAlt``.
    """
    anchors = x.find_all('a')
    for anchor in anchors:
        getAlt(anchor)
        
# gets the table data from the raw scraping, organizing lists of data for the icons and links
def getTableData(row):
    """Extract the data held by one table row.

    Parameters
    ----------
    row : element exposing ``get_attribute_list``, ``find_all``, ``td`` and
        ``get_text`` (a BeautifulSoup ``tr`` tag in this notebook).

    Returns
    -------
    str
        For a row whose ``class`` list contains ``'tr_header'``: the text of
        its first ``td`` (or of the whole row when it has no ``td``).
    list of dict
        Otherwise one ``{"link", "text", "image"}`` dict per ``div`` that
        contains an anchor. ``"image"`` is ``None`` when the anchor has no
        ``img`` child.
    """
    if 'tr_header' in row.get_attribute_list('class'):
        header_cell = row.td
        # A header row without a <td> would otherwise raise AttributeError.
        if header_cell is None:
            return row.get_text()
        return header_cell.get_text()

    results = []
    for div in row.find_all('div'):
        anchor = div.find('a')
        if anchor is None:
            # div without a link carries no item data; skip it instead of crashing
            continue
        img = anchor.find('img')
        results.append({
            "link": anchor.get('href'),
            "text": anchor.get_text(),
            "image": img.get('src') if img is not None else None,
        })

    return results
            
# reorganizes the raw scraped data into a list of dictionaries
# with identical keys (to become columns)
def collapseHeaderRows(row_list, condensor_str):
    """Flatten header-delimited rows into one list of tagged dicts.

    ``row_list`` mixes header strings with lists of item dicts. Each item is
    returned as a new dict carrying an extra ``condensor_str`` key whose
    value is the most recent header string seen before it.

    Parameters
    ----------
    row_list : list
        Sequence of ``str`` headers and lists of dicts. Neither the list nor
        the dicts inside it are modified.
    condensor_str : str
        Key under which the current header is stored on every item.

    Returns
    -------
    list of dict
        All items in their original order. Items that appear before any
        header get ``None`` as their group. An empty ``row_list`` gives ``[]``.
    """
    curr_grouping = None
    updated_data_list = []
    for entry in row_list:
        if isinstance(entry, str):
            # a header row: it names the group for the items that follow
            curr_grouping = entry
            continue
        for item in entry:
            # copy so the caller's dicts are not modified
            tagged = dict(item)
            tagged[condensor_str] = curr_grouping
            updated_data_list.append(tagged)

    return updated_data_list
        
# creates a DF from the reorganized data, with custom names for
# the values to combine (from the "header" row above the list of items in the table) and
# the "text" of those items (may be "Factions", "Unit Types", etc.)
def createDFfromRowData(row_data, condensor_str, text_name):
    """Build a DataFrame from scraped row data.

    Parameters
    ----------
    row_data : list
        Scraped rows. The first element is skipped as the table title; the
        rest are header strings and lists of item dicts with ``"text"``,
        ``"link"`` and ``"image"`` keys. The list itself is not modified, so
        the call can be repeated on the same list.
    condensor_str : str
        Column name for the header each item falls under.
    text_name : str
        Column name for each item's ``"text"`` value (e.g. "Faction").

    Returns
    -------
    pandas.DataFrame
        Columns ``condensor_str``, ``text_name``, ``"link"``, ``"image"``,
        one row per item. Empty when there is nothing after the title.
    """
    # Slice instead of pop(0) so the caller's list keeps its first element.
    body_rows = list(row_data[1:])
    row_data_condensed = collapseHeaderRows(body_rows, condensor_str) if body_rows else []

    return pd.DataFrame(data={
        condensor_str: [a[condensor_str] for a in row_data_condensed],
        text_name: [a["text"] for a in row_data_condensed],
        "link": [a["link"] for a in row_data_condensed],
        "image": [a["image"] for a in row_data_condensed],
    })

In [93]:

# start by getting data for each playable faction, then add unit scraping on top of it (helper function later)
def getFactionsData(url="https://www.honga.net/totalwar/rome2/?l=en&v=rome2",
                    driver_path='chromedriver.exe',
                    load_delay=6,
                    headless=False):
    """Scrape the first table on ``url`` into a list of row data.

    Parameters
    ----------
    url : str
        Page to visit.
    driver_path : str
        Path to the chromedriver executable handed to splinter.
    load_delay : int or float
        Seconds to wait after navigation so the page can finish loading.
    headless : bool
        Passed through to ``Browser``.

    Returns
    -------
    list
        The stripped text of the table's first row, followed by
        ``getTableData(row)`` for each later row. If parsing fails, the
        error is printed and whatever was collected so far is returned.
    """
    factions = []
    browser = Browser('chrome', executable_path=driver_path, headless=headless)

    try:
        browser.visit(url)

        # add a delay so page fully loads
        time.sleep(load_delay)

        try:
            page_soup = bs(browser.html, "html.parser")
            playable_factions = page_soup.find_all("table")[0]

            for i, row in enumerate(playable_factions.find_all('tr')):
                if i == 0:
                    # first row is kept as plain text
                    factions.append(row.get_text().strip())
                else:
                    factions.append(getTableData(row))
        except Exception as err:
            # best-effort scrape: report the cause and return what was collected
            print("Something went wrong!", repr(err))
    finally:
        # always close the browser, even if navigation raised
        browser.quit()

    return factions

IndentationError: expected an indented block (<ipython-input-93-361ccae8bd16>, line 33)

In [67]:
# Get basic playable faction data (by scraping honga.net).
# NOTE: despite its name, output_df holds the raw list returned by
# getFactionsData(), not a DataFrame.
output_df = getFactionsData()

# Display the raw scraped list.
output_df

['Playable Factions',
 'Grand Campaign',
 [{'link': 'https://www.honga.net/totalwar/rome2/faction.php?l=en&v=rome2&f=rom_ardiaei',
   'text': 'Ardiaei',
   'image': 'https://www.honga.net/totalwar/rome2/images/rome2/flags/ardiaei/mon_64.png'},
  {'link': 'https://www.honga.net/totalwar/rome2/faction.php?l=en&v=rome2&f=rom_arevaci',
   'text': 'Arevaci',
   'image': 'https://www.honga.net/totalwar/rome2/images/rome2/flags/arevaci/mon_64.png'},
  {'link': 'https://www.honga.net/totalwar/rome2/faction.php?l=en&v=rome2&f=rom_armenia',
   'text': 'Armenia',
   'image': 'https://www.honga.net/totalwar/rome2/images/rome2/flags/armenia/mon_64.png'},
  {'link': 'https://www.honga.net/totalwar/rome2/faction.php?l=en&v=rome2&f=rom_arverni',
   'text': 'Arverni',
   'image': 'https://www.honga.net/totalwar/rome2/images/rome2/flags/arverni/mon_64.png'},
  {'link': 'https://www.honga.net/totalwar/rome2/faction.php?l=en&v=rome2&f=rom_athens',
   'text': 'Athens',
   'image': 'https://www.honga.net/to

In [92]:
# Shallow-copy the scraped list so the processing below works on its own list object.
factions = output_df.copy()
# One row per item, with the header column named "Campaign" and the item text column named "Faction".
check_df = createDFfromRowData(factions, "Campaign", "Faction")
# Write the CSV without the index column; assumes ./output already exists — TODO confirm.
check_df.to_csv("./output/Factions.csv", index=False)
check_df

Unnamed: 0,Campaign,Faction,link,image
0,Grand Campaign,Ardiaei,https://www.honga.net/totalwar/rome2/faction.p...,https://www.honga.net/totalwar/rome2/images/ro...
1,Grand Campaign,Arevaci,https://www.honga.net/totalwar/rome2/faction.p...,https://www.honga.net/totalwar/rome2/images/ro...
2,Grand Campaign,Armenia,https://www.honga.net/totalwar/rome2/faction.p...,https://www.honga.net/totalwar/rome2/images/ro...
3,Grand Campaign,Arverni,https://www.honga.net/totalwar/rome2/faction.p...,https://www.honga.net/totalwar/rome2/images/ro...
4,Grand Campaign,Athens,https://www.honga.net/totalwar/rome2/faction.p...,https://www.honga.net/totalwar/rome2/images/ro...
...,...,...,...,...
74,Rise of the Republic,Senones (Rise of the Republic),https://www.honga.net/totalwar/rome2/faction.p...,https://www.honga.net/totalwar/rome2/images/ro...
75,Rise of the Republic,Syracuse (Rise of the Republic),https://www.honga.net/totalwar/rome2/faction.p...,https://www.honga.net/totalwar/rome2/images/ro...
76,Rise of the Republic,Taras (Rise of the Republic),https://www.honga.net/totalwar/rome2/faction.p...,https://www.honga.net/totalwar/rome2/images/ro...
77,Rise of the Republic,Tarchuna (Rise of the Republic),https://www.honga.net/totalwar/rome2/faction.p...,https://www.honga.net/totalwar/rome2/images/ro...
