In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from scipy import stats

import random
import urllib.request
import requests
from bs4 import BeautifulSoup
import warnings

# WebScraper for FIFA 19 data
A WebScraper that scraps data from https://sofifa.com, a site with information and statistic about the footballers in one of the most successful sport simulation games of all times - FIFA. This project is focused on FIFA 19 so the scraper will only scrap data for FIFA 19.


The web pages with the players are accessible via "offset" as a parameter in the link. So lets define our base url get info about the players __ID, Name, Age, Photo, Nationality, Flag, Overall, Potential, Club, Club Logo, Value, Wage and Special__ and put that info into a DataFrame called "basic_player_data.csv".

In [4]:
base_url = "https://sofifa.com/players?offset="
offset = 0
columns = ['ID', 'Name', 'Age', 'Photo', 'Nationality', 'Flag', 'Overall', 'Potential', 'Club', 
           'Club Logo', 'Value', 'Wage', 'Special']

data = pd.DataFrame(columns=columns)

for offset in range(500): #we will get only the first 500 pages of footballers and it will need a lot of time :(
    url = base_url + str(offset)
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text,"html.parser")
    table_body = soup.find('tbody')
    for row in table_body.findAll('tr'):
        td = row.findAll('td')
        picture = td[0].find('img').get('data-src')
        pid = td[0].find('img').get('id')
        nationality = td[1].find('a').get('title')
        flag_img = td[1].find('img').get('data-src')
        name = td[1].findAll('a')[1].text
        age = td[2].find('div').text.strip()
        overall = td[3].text.strip()
        potential = td[4].text.strip()
        club = td[5].find('a').text
        club_logo = td[5].find('img').get('data-src')
        value = td[6].text.strip()
        wage = td[7].text.strip()
        special = td[8].text.strip()
        player_data = pd.DataFrame([[pid, name, age, picture, nationality, flag_img, overall, 
                                  potential, club, club_logo, value, wage, special]])
        player_data.columns = columns
        data = data.append(player_data, ignore_index=True)
    
    offset+=1
    #data.to_csv('data/full_player_data.csv', encoding='utf-8')

In [5]:
data.to_csv('data/basic_player_data.csv', encoding='utf-8')

In [7]:
basic_player_data = pd.read_csv("data/basic_player_data.csv")
basic_player_data.shape

(25500, 14)

We have now a dataset with over 25000 players. Now let scrap more detailed data for each player. We will get the rating in each category like for example: shoot, pace, drible, GK reflexes and etc.
Also we will scrap how good is a player(overall) in a certain position.

We will save the data in a dataset called "player_ratings_data.csv".

In [None]:
player_data_url = 'https://sofifa.com/player/'
master_data = pd.DataFrame()
r = 0
for index, row in data.iterrows():
    skill_names = []
    skill_map = {'ID' : str(row['ID'])}
    url = player_data_url + str(row['ID'])
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text,"html.parser")
    categories = soup.findAll('div', {'class': 'column col-4 mb-20'})
    for category in categories[:-1]:
        skills = category.findAll('li')
        for skill in skills:
            name_value_pair = skill.text.split()
            name_value_pair.reverse()
            value = name_value_pair.pop()
            name_value_pair.reverse()
            n = ' '.join(name_value_pair)
            skill_names.append(n)
            skill_map[str(n)] = value
    attr_data = pd.DataFrame(columns=skill_names)
    for key in skill_map.keys():
        attr_data.loc[r,key] = skill_map[key]
    r = r + 1
    master_data = pd.concat([master_data, attr_data])


In [None]:

master_data.to_csv('data/player_ratings_data.csv', encoding='utf-8')