In [1]:
import numpy as np
import pandas as pd 
import requests
from bs4 import BeautifulSoup, SoupStrainer

In [250]:
import aiohttp
import asyncio
import pickle
# Pickle cache file for each kind of downloaded page, keyed by the `file_key`
# values used in this notebook ('overview' and 'player').
PICKLE_FILEPATHS = {'overview':'overview_htmls.pkl', 'player':'player_htmls.pkl'}

async def fetch(session, url, timeout=30):
    """Download one URL and return the response body as text.

    Parameters
    ----------
    session : object exposing ``get(url)`` as an async context manager
        (an ``aiohttp.ClientSession`` in this notebook).
    url : str
        Address to download.
    timeout : float, optional
        Maximum number of seconds allowed for the whole request (default 30).

    Returns
    -------
    str
        The decoded response body.

    Raises
    ------
    asyncio.TimeoutError
        If the request does not finish within ``timeout`` seconds.
    """
    async def _download():
        async with session.get(url) as response:
            return await response.text()

    # ``aiohttp.Timeout`` is not available in current aiohttp releases; the
    # standard-library ``asyncio.wait_for`` enforces the same overall limit.
    return await asyncio.wait_for(_download(), timeout=timeout)

async def fetch_all(session, urls, loop):
    """Download every URL concurrently and return the results in input order.

    ``loop`` is accepted but not used by this function. A failed download
    does not abort the batch: its exception object takes the place of the
    page text in the returned list.
    """
    pending_downloads = (fetch(session, address) for address in urls)
    # return_exceptions=True so that failures can be dealt with later
    return await asyncio.gather(*pending_downloads, return_exceptions=True)

def get_htmls_from_pickle(file_key):
    """Load previously saved HTML pages from the pickle cache.

    Parameters
    ----------
    file_key : str
        Key into ``PICKLE_FILEPATHS`` selecting which cache file to read.

    Returns
    -------
    object
        Whatever object was pickled into that file.

    Raises
    ------
    KeyError
        If ``file_key`` is not a key of ``PICKLE_FILEPATHS``.
    """
    # SECURITY: unpickling can execute arbitrary code -- only load cache files
    # that were written by this notebook.
    with open(PICKLE_FILEPATHS[file_key], 'rb') as f:
        return pickle.load(f)

def save_htmls_to_pickle(htmls, file_key):
    """Write downloaded HTML pages to the pickle cache.

    Parameters
    ----------
    htmls : object
        The pages to store (any picklable object).
    file_key : str
        Key into ``PICKLE_FILEPATHS`` selecting which cache file to write.

    Raises
    ------
    KeyError
        If ``file_key`` is not a key of ``PICKLE_FILEPATHS``.
    """
    with open(PICKLE_FILEPATHS[file_key], 'wb') as f:
        # Dump the argument, not whatever global happens to be in the kernel.
        pickle.dump(htmls, f)
    

def get_htmls(urls, from_file=False, file_key=None):
    """Return the HTML for ``urls``, either downloaded or read from the cache.

    Parameters
    ----------
    urls : sequence of str
        Pages to download (ignored when ``from_file`` is true).
    from_file : bool, optional
        When true, return the pickled cache instead of downloading.
    file_key : str, optional
        Key into ``PICKLE_FILEPATHS``; required when ``from_file`` is true.

    Returns
    -------
    dict or object
        When downloading: ``{url: html}``. Because ``fetch_all`` gathers with
        ``return_exceptions=True``, the value for a failed download is the
        exception object rather than text. When ``from_file`` is true: the
        unpickled cache object, returned unchanged.

    Raises
    ------
    ValueError
        If ``from_file`` is true but no ``file_key`` was given.
    """
    if from_file:
        if file_key is None:
            raise ValueError("file_key is required when from_file=True")
        return get_htmls_from_pickle(file_key)

    loop = asyncio.get_event_loop()

    async def _download_all():
        # aiohttp sessions must be entered with ``async with``.
        async with aiohttp.ClientSession() as session:
            return await fetch_all(session, urls, loop)

    # NOTE(review): run_until_complete() cannot be used while the loop is
    # already running (as in recent Jupyter kernels) -- confirm the target
    # environment.
    htmls = loop.run_until_complete(_download_all())
    return dict(zip(urls, htmls))

In [251]:
def get_overview_urls():
    """Build the URLs of the 225 paginated player-overview pages.

    Pages are addressed by an ``offset`` query value that grows in steps
    of 80, starting at 0.
    """
    page_size = 80
    prefix = "https://sofifa.com/players?offset="
    return [prefix + str(offset) for offset in range(0, 225 * page_size, page_size)]

In [252]:
def get_player_urls(IDs):
    """Return one player-page URL per entry of ``IDs``, in the same order.

    Each ID is converted with ``str()`` and appended to the player base URL.
    """
    prefix = 'https://sofifa.com/player/'
    return [prefix + str(player_id) for player_id in IDs]

In [264]:
def get_player_htmls(IDs, from_file=False):
    """Fetch the player pages for ``IDs`` via ``get_htmls``.

    ``from_file`` is forwarded unchanged; the cache key is ``'player'``.
    """
    return get_htmls(get_player_urls(IDs), from_file, file_key='player')

In [263]:
def get_overview_htmls(from_file=False):
    """Fetch all overview pages via ``get_htmls``.

    ``from_file`` is forwarded unchanged; the cache key is ``'overview'``.
    """
    return get_htmls(get_overview_urls(), from_file, file_key='overview')

In [6]:
#%time overview_htmls = get_htmls(get_overview_urls())

Wall time: 17.4 s


In [8]:
# with open(PICKLE_FILEPATH, 'wb') as f:
#     pickle.dump(overview_htmls, f)

In [241]:
# Reload the cached overview pages. The single PICKLE_FILEPATH constant this
# cell originally used is not defined; the cache paths live in the
# PICKLE_FILEPATHS dict, keyed by page type.
# SECURITY: only unpickle files written by this notebook.
with open(PICKLE_FILEPATHS['overview'], 'rb') as f:
    x = pickle.load(f)

In [253]:
def parse_single_row(overview_table_row):
    """Extract one player's overview fields from a table row.

    The row is expected to expose ``find_all('td')``; fields are read from
    fixed cell positions (0-5, 7, 8 and 17). Age, overall and potential are
    whitespace-stripped; the other text fields are returned as found.

    Returns a dict with the keys photo, ID, nationality, flag, name, age,
    overall, potential, club, club_logo, value, wage and special.
    """
    cells = overview_table_row.find_all('td')
    photo_img = cells[0].find('img')
    return {
        'photo': photo_img.get('data-src'),
        'ID': photo_img.get('id'),
        'nationality': cells[1].find('a').get('title'),
        'flag': cells[1].find('img').get('data-src'),
        # the second link in the cell holds the player name
        'name': cells[1].find_all('a')[1].text,
        'age': cells[2].find('div').text.strip(),
        'overall': cells[3].text.strip(),
        'potential': cells[4].text.strip(),
        'club': cells[5].find('a').text,
        'club_logo': cells[5].find('img').get('data-src'),
        'value': cells[7].text,
        'wage': cells[8].text,
        'special': cells[17].text,
    }

In [254]:
def parse_single_overview_page(html, strainer):
    """Parse one overview page into a list of per-player dicts.

    ``strainer`` is passed to BeautifulSoup as ``parse_only``. Every ``<tr>``
    in the parsed result is handed to ``parse_single_row``.
    """
    page = BeautifulSoup(html, 'lxml', parse_only=strainer)
    return [parse_single_row(table_row) for table_row in page.find_all('tr')]

In [255]:
def parse_player_overview_data(overview_htmls):
    """Parse every overview page and combine the rows into one DataFrame.

    ``overview_htmls`` is a mapping whose values are page HTML strings; the
    keys are not used.
    """
    tbody_only = SoupStrainer('tbody')  # perf: we only want to parse the tbody
    records = [
        record
        for page_html in overview_htmls.values()
        for record in parse_single_overview_page(page_html, tbody_only)
    ]
    return pd.DataFrame.from_dict(records)

In [14]:
# doesn't work in notebook but should work otherwise
# import multiprocessing as mp
# num_workers = mp.cpu_count()
# pool = mp.Pool(num_workers)

In [None]:
# %%time

# def square(x):
#     return x**2

# if __name__ == '__main__':
#     pool.map(square, [1,3,5,7])

In [None]:
# %%time
# x = pool.map(parse_single_overview_page, overview_htmls)

In [256]:
def convert_currency(curr_col):
    """Convert currency strings such as '€95.5M' or '€565K' to plain numbers.

    Parameters
    ----------
    curr_col : pandas.Series of str
        Each value is one leading currency-symbol character, a number, and
        an optional 'M' (millions) or 'K' (thousands) suffix.

    Returns
    -------
    pandas.Series of float
        The amounts with the suffix multiplier applied, on the same index as
        ``curr_col``. Missing inputs stay NaN.
    """
    multiplier_by_suffix = {'M': 1e6, 'K': 1e3}

    without_symbol = curr_col.str[1:]  # drop the leading currency symbol
    last_char = without_symbol.str[-1]
    has_suffix = last_char.isin(list(multiplier_by_suffix))

    # Only strip the last character when it really is a unit suffix, so that
    # suffix-less amounts such as '€0' or '€500' keep all of their digits.
    numeric_text = without_symbol.where(~has_suffix, without_symbol.str[:-1])
    multipliers = last_char.map(multiplier_by_suffix).fillna(1.0).astype(float)

    return pd.to_numeric(numeric_text) * multipliers

def clean_overview_data(df):
    """Replace the text ``value``/``wage`` columns with numeric EUR columns.

    Adds ``EUR_value`` and ``EUR_wage`` (computed by ``convert_currency``)
    and drops the original ``value`` and ``wage`` columns. Returns a new
    DataFrame.
    """
    with_numeric_amounts = df.assign(
        EUR_value=lambda frame: convert_currency(frame['value']),
        EUR_wage=lambda frame: convert_currency(frame['wage']),
    )
    return with_numeric_amounts.drop(['value', 'wage'], axis=1)

#player_personal_data = df.pipe(clean_personal_data)

In [260]:
def get_overview_data(from_file=False):
    """Build the cleaned player-overview DataFrame.

    Parameters
    ----------
    from_file : bool, optional
        Forwarded to ``get_overview_htmls``: when true the overview pages are
        read from the pickle cache instead of being downloaded.

    Returns
    -------
    pandas.DataFrame
        Parsed overview rows after ``clean_overview_data`` has been applied.
    """
    # Forward the flag -- without this the pages were always re-downloaded.
    overview_htmls = get_overview_htmls(from_file=from_file)
    return parse_player_overview_data(overview_htmls).pipe(clean_overview_data)

In [None]:
def get_player_data(from_file=False, IDs=()):
    """Fetch the per-player HTML pages (parsing is not implemented yet).

    Parameters
    ----------
    from_file : bool, optional
        Forwarded to ``get_player_htmls``.
    IDs : iterable, optional
        Player IDs whose pages should be fetched. ``get_player_htmls``
        requires this argument, so it is passed explicitly; the default is
        an empty tuple.

    Returns
    -------
    None
        The fetched pages are not yet parsed or returned.
    """
    player_htmls = get_player_htmls(IDs, from_file=from_file)
    # TODO: parse `player_htmls` into a DataFrame and return it.

In [265]:
%time player_overview_data = get_overview_data(from_file=True)

Wall time: 58 s


In [259]:
# NOTE(review): `player_personal_data` is not assigned in any visible cell (the
# timed cell above assigns `player_overview_data`) -- confirm which frame is
# meant before relying on this cell after a kernel restart.
player_personal_data

Unnamed: 0,ID,age,club,club_logo,flag,name,nationality,overall,photo,potential,special,EUR_value,EUR_wage
0,20801,32,Real Madrid CF,https://cdn.sofifa.org/24/18/teams/243.png,https://cdn.sofifa.org/flags/38.png,Cristiano Ronaldo,Portugal,94,https://cdn.sofifa.org/48/18/players/20801.png,94,2228,95500000.0,565000.0
1,158023,30,FC Barcelona,https://cdn.sofifa.org/24/18/teams/241.png,https://cdn.sofifa.org/flags/52.png,L. Messi,Argentina,93,https://cdn.sofifa.org/48/18/players/158023.png,93,2154,105000000.0,565000.0
2,190871,25,Paris Saint-Germain,https://cdn.sofifa.org/24/18/teams/73.png,https://cdn.sofifa.org/flags/54.png,Neymar,Brazil,92,https://cdn.sofifa.org/48/18/players/190871.png,94,2100,123000000.0,280000.0
3,176580,30,FC Barcelona,https://cdn.sofifa.org/24/18/teams/241.png,https://cdn.sofifa.org/flags/60.png,L. Suárez,Uruguay,92,https://cdn.sofifa.org/48/18/players/176580.png,92,2291,97000000.0,510000.0
4,167495,31,FC Bayern Munich,https://cdn.sofifa.org/24/18/teams/21.png,https://cdn.sofifa.org/flags/21.png,M. Neuer,Germany,92,https://cdn.sofifa.org/48/18/players/167495.png,92,1493,61000000.0,230000.0
5,188545,28,FC Bayern Munich,https://cdn.sofifa.org/24/18/teams/21.png,https://cdn.sofifa.org/flags/37.png,R. Lewandowski,Poland,91,https://cdn.sofifa.org/48/18/players/188545.png,91,2146,92000000.0,355000.0
6,193080,26,Manchester United,https://cdn.sofifa.org/24/18/teams/11.png,https://cdn.sofifa.org/flags/45.png,De Gea,Spain,90,https://cdn.sofifa.org/48/18/players/193080.png,92,1458,64500000.0,215000.0
7,183277,26,Chelsea,https://cdn.sofifa.org/24/18/teams/5.png,https://cdn.sofifa.org/flags/7.png,E. Hazard,Belgium,90,https://cdn.sofifa.org/48/18/players/183277.png,91,2096,90500000.0,295000.0
8,182521,27,Real Madrid CF,https://cdn.sofifa.org/24/18/teams/243.png,https://cdn.sofifa.org/flags/21.png,T. Kroos,Germany,90,https://cdn.sofifa.org/48/18/players/182521.png,90,2165,79000000.0,340000.0
9,167664,29,Juventus,https://cdn.sofifa.org/24/18/teams/45.png,https://cdn.sofifa.org/flags/52.png,G. Higuaín,Argentina,90,https://cdn.sofifa.org/48/18/players/167664.png,90,1961,77000000.0,275000.0


In [64]:
#player_personal_data.to_csv('Complete/PlayerPersonalData.csv', index=False)

In [21]:
def headline_attribute_from_line(line):
    """Parse one ``<name> = <number>`` script line into a name/value dict.

    The attribute name is the three characters that end one character before
    the first ``=`` (lower-cased). The value is the run of digits that
    follows the ``=``, after optional whitespace, so values of any length
    are read in full.

    Parameters
    ----------
    line : str
        A script line such as ``var pointPAC = 89;``.

    Returns
    -------
    dict
        ``{'name': <str>, 'value': <int>}``.

    Raises
    ------
    ValueError
        If the line has no ``=`` or no digits follow it.
    """
    import re  # local import: only this helper needs it

    equals_sign_loc = line.find('=')
    if equals_sign_loc == -1:
        raise ValueError("no '=' found in line: %r" % (line,))

    attribute_name = line[equals_sign_loc - 4: equals_sign_loc - 1].lower()

    # Read all consecutive digits rather than a fixed two-character slice.
    value_match = re.match(r'\s*(\d+)', line[equals_sign_loc + 1:])
    if value_match is None:
        raise ValueError("no numeric value after '=' in line: %r" % (line,))

    return {'name': attribute_name, 'value': int(value_match.group(1))}

In [22]:
def standardise_spelling(player_attribute_name):
    """Return the name lower-cased with every space replaced by an underscore."""
    lowered = player_attribute_name.lower()
    return '_'.join(lowered.split(' '))

In [125]:
# Example player page used by the single-player cells of this notebook.
player_data_url = 'https://sofifa.com/player/20801'
# skill_names = ['ID', 'crossing', 'finishing', 'heading_accuracy','short_passing', 'volleys', 'dribbling', 'curve',
#                'free_kick_accuracy', 'long_passing', 'ball_control', 'acceleration', 'sprint_speed', 'agility',
#                'reactions', 'balance', 'shot_power', 'jumping', 'stamina', 'strength', 'long_shots', 'aggression',
#                'interceptions', 'positioning', 'vision', 'penalties', 'composure', 'marking', 'standing_tackle',
#                'sliding_tackle', 'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes']
# headline_attribute_names = ['PAC', 'SHO', 'PAS', 'DRI', 'DEF', 'PHY']
# all_attribute_names = skill_names + headline_attribute_names
# Seed record for that player; the ID matches the number at the end of the URL above.
player_attribute_dict = {'ID': 20801}

In [24]:
# Without a timeout requests can wait indefinitely on an unresponsive server;
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
source_code = requests.get(player_data_url, timeout=30)
source_code.raise_for_status()

In [190]:
# Parse the downloaded player page, restricted to <section> and <script>
# elements via the SoupStrainer passed as parse_only.
plain_text = source_code.text
strainer = SoupStrainer(['section', 'script'])
soup = BeautifulSoup(plain_text, 'lxml', parse_only=strainer)

In [172]:
def parse_main_attributes(soup):
    """Collect the skill values listed in the page's ``col-3`` divs.

    Each ``<li>`` text is split on whitespace: the first token is the value
    (kept as a string) and the remaining tokens form the skill name, which
    is normalised with ``standardise_spelling``.

    Returns a dict mapping skill name to value string.
    """
    attribute_dict = {}
    # last div is empty, so it is left out
    category_divs = soup.find_all('div', {'class': 'col-3'})[:-1]
    for category_div in category_divs:
        for list_item in category_div.find_all('li'):
            tokens = list_item.text.split()
            value = tokens[0]
            name = standardise_spelling(' '.join(tokens[1:]))
            attribute_dict[name] = value
    return attribute_dict

Headline attributes such as PHY appear to be related to Ultimate Team.

In [196]:
def parse_headline_attributes(soup):
    """Read the headline attributes from the page's second ``<script>`` tag.

    The script text is split on ``'\\r\\n'``; every line containing
    ``'point'`` is parsed with ``headline_attribute_from_line``.

    Returns a dict mapping attribute name to its value.
    """
    script_tag = soup.find_all('script')[1]
    parsed_lines = (
        headline_attribute_from_line(script_line)
        for script_line in script_tag.text.split('\r\n')
        if 'point' in script_line
    )
    return {parsed['name']: parsed['value'] for parsed in parsed_lines}

Parse the meta section at the top of the player page.

In [198]:
def parse_player_metadata(soup):
    """Extract positions, birth date, height and weight from the meta block.

    Works on the first ``<span>`` inside the ``div`` with class ``meta``.
    Its nested ``<span>`` texts are the preferred positions. Its last child
    is split on whitespace: tokens 2-4 form the birth date, token 5 is the
    height and the final token is the weight.

    Returns a dict with the keys preferred_positions, birth_date, height_cm
    and weight_kg (all values are strings or lists of strings).
    """
    info_span = soup.find('div', class_='meta').find('span')
    # nationality, age and flag were found in player overview
    positions = [child.text for child in info_span.find_all('span')]
    tokens = info_span.contents[-1].split()
    raw_birth_date = ' '.join(tokens[2:5])
    birth_date = raw_birth_date.replace(',', '').strip('(').strip(')')
    return {
        'preferred_positions': positions,
        'birth_date': birth_date,
        # str.strip removes the characters 'c'/'m' (or 'k'/'g') from both ends
        'height_cm': tokens[5].strip('cm'),
        'weight_kg': tokens[-1].strip('kg'),
    }

In [211]:
# All <ul> elements with class 'pl' on the parsed player page.
page_uls = soup.find_all('ul', class_='pl')

In [208]:
def standardise_ul(ul):
    """Return the stripped strings of ``ul`` as a list, each normalised with ``standardise_spelling``."""
    return [standardise_spelling(entry) for entry in ul.stripped_strings]

def parse_traits_and_specialities(page_uls):
    """Read traits from the last list and specialities from the second-to-last.

    Returns ``{'traits': [...], 'specialities': [...]}`` with every entry
    normalised by ``standardise_ul``.
    """
    traits = standardise_ul(page_uls[-1])
    specialities = standardise_ul(page_uls[-2])
    return {'traits': traits, 'specialities': specialities}

In [327]:
parse_traits_and_specialities(page_uls)

{'specialities': ['power_free_kick',
  'flair',
  'long_shot_taker',
  'skilled_dribbling'],
 'traits': ['speedster',
  'dribbler',
  'distance_shooter',
  'acrobat',
  'clinical_finisher',
  'complete_forward']}

In [210]:
def parse_player_miscellaneous_data(page_uls):
    """Parse the first list in ``page_uls`` into a dict of attributes.

    The list's stripped strings are read as alternating key/value pairs.
    Keys are normalised with ``standardise_spelling``. The ``work_rate``
    entry (of the form ``'<att> / <def>'``) is replaced by two entries,
    ``work_rate_att`` and ``work_rate_def``.

    Parameters
    ----------
    page_uls : sequence
        Elements exposing ``stripped_strings``; only the first one is read.

    Returns
    -------
    dict
        Attribute name -> value string. A trailing key with no value is
        stored with ``None``.

    Raises
    ------
    KeyError
        If the list has no ``work_rate`` entry.
    IndexError
        If the work rate does not split into two parts on ``' / '``.
    """
    strings = iter(page_uls[0].stripped_strings)
    attribute_dict = {}
    for key in strings:
        # Consume the next string as this key's value. The default keeps an
        # odd number of strings from raising StopIteration out of the loop.
        attribute_dict[standardise_spelling(key)] = next(strings, None)
    work_rates = attribute_dict.pop('work_rate').split(' / ')
    attribute_dict['work_rate_att'] = work_rates[0]
    attribute_dict['work_rate_def'] = work_rates[1]
    return attribute_dict

In [313]:
def get_position_ratings(soup):
    """Return a ``{position: overall rating}`` dict from the ratings table.

    The table's ``Position`` cells may list several positions separated by
    whitespace; each of them is mapped to that row's ``OVA`` value.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed player page containing a ``table`` with class
        ``table table-hover``.

    Returns
    -------
    dict
        Position code -> ``OVA`` value as parsed by ``pandas.read_html``.
    """
    from io import StringIO  # local import: only this function needs it

    ratings_table = soup.find('table', class_='table table-hover')
    # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
    position_ratings_df = (pd.read_html(StringIO(str(ratings_table)))[0][['Position', 'OVA']]
                           .rename(columns=standardise_spelling))

    split_positions = position_ratings_df['position'].str.split(expand=True)
    # Use however many columns the split produced instead of assuming three.
    position_columns = list(split_positions.columns)
    split_df = split_positions.assign(ova=position_ratings_df['ova'])

    stacked = pd.concat(
        [split_df[[column, 'ova']].rename(columns={column: 'position'})
         for column in position_columns]
    )
    position_ratings_dict = (stacked
                             .dropna()
                             .set_index('position')
                             .to_dict()['ova'])
    return position_ratings_dict

In [315]:
def get_unique_positions(position_ratings):
    """Return the keys view of the ``position_ratings`` mapping."""
    positions = position_ratings.keys()
    return positions

In [320]:
# Position ratings for the parsed example page, and the position keys they contain.
position_ratings = get_position_ratings(soup)
unique_positions = get_unique_positions(position_ratings)

In [325]:
def get_full_position_preferences(preferred_positions_list, unique_positions):
    """Build one boolean ``prefers_<position>`` flag per known position.

    A flag is True when the position appears in ``preferred_positions_list``.
    """
    preferences = {}
    for position in unique_positions:
        preferences['prefers_' + position] = position in preferred_positions_list
    return preferences

In [None]:
def main():
    """Pipeline entry point (work in progress).

    Only the first step is implemented: building the overview DataFrame with
    ``get_overview_data()``. The remaining steps are outlined in the comments
    below. The result is assigned to a local and not returned.
    """
    # download overview htmls. Parse these into a dataframe and save this into a variable.
    # Use the df's ID column to get urls for player personal data.
    # for the first player url only, use the position ratings table to get a sequence of unique positions, and save this as a variable
    
    player_overview_data = get_overview_data()

In [None]:
full_data.to_csv('Allplayer.csv', encoding='utf-8')

In [None]:
master_data.to_csv('Complete/PlayerAttributeData.csv', encoding='utf-8')

In [None]:
full_data.to_csv('Complete/Dataset.csv', encoding='utf-8')

In [None]:
full_data

In [None]:
# `axis` must be passed by keyword: pandas >= 2.0 rejects it as a positional argument.
full_data.drop('Unnamed: 0', axis=1, inplace=True)

In [None]:
full_data

In [None]:
# `axis` must be passed by keyword: pandas >= 2.0 rejects it as a positional argument.
full_data.drop('ID_x', axis=1, inplace=True)

In [None]:
full_data['ID_y']

In [None]:
f = full_data.rename(index=str, columns={"ID_y": "ID"})

In [None]:
f['ID']

In [None]:
f.to_csv('Complete/Dataset.csv', encoding='utf-8')

In [None]:
f

In [179]:
# One million identical height strings, used as benchmark input.
heights = ['1cm'] * 1000000

In [201]:
%%time
# Variant 1: strip the 'c'/'m' characters with the vectorised .str accessor,
# then cast the resulting Series to int.
s = pd.Series(heights)
s2 = s.str.strip('cm').astype('int')

Wall time: 694 ms


In [203]:
%%time
# Variant 2: strip each string in a Python list comprehension, then build the
# Series and cast it to int.
heights2 = [item.strip('cm') for item in heights]
s2 = pd.Series(heights2).astype('int')

Wall time: 453 ms


The `.strip` method is faster when applied to individual strings in a Python loop than when applied through the pandas Series `.str` accessor, for some reason. Type conversion from str to int is still faster with a Series.