# Data extraction

## FIFA teams web scraping

Web scraping of <a href='https://www.fifaindex.com/es/teams/'>FIFAindex</a> in order to find the following information on each team:
* Team name
* League
* Rival team
* Attack
* Midfield
* Defence 
* Transfer Budget

### First step
The site www.fifaindex.com/es/teams was scraped to find the links to each team's description page. In order to do so, Chrome WebDriver was used to iterate through the several listing pages that contain the links to each team's details.

In [1]:
# Dependencies
from bs4 import BeautifulSoup as bs
import requests
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
import pandas as pd

In [2]:
# Launch a visible (non-headless) Chrome session through splinter, using the
# chromedriver binary referenced by the relative path below.
executable_path = dict(executable_path='chromedriver.exe')
browser = Browser('chrome', headless=False, **executable_path)

In [3]:
# Open the Spanish team listing. The detail-page parser further down matches
# Spanish labels ('Equipo Rival', 'Ataque', ...), so the collected links must
# be the '/es/' ones rather than those of the English listing.
url = 'https://www.fifaindex.com/es/teams/'
browser.visit(url)

In [4]:
# Walk the paginated team listing and collect every team-detail link once,
# keeping the order in which the links appear on the pages.
links = []
seen_links = set()  # constant-time duplicate check; `links` keeps the order
for page in range(25):  # upper bound on the number of listing pages visited
    html = browser.html
    soup = bs(html, 'html.parser')

    teams = soup.find_all('a', class_="link-team")

    for team in teams:
        link = team['href']
        if link not in seen_links:
            seen_links.add(link)
            links.append(link)

    print(f"Page {page} complete")

    # Click the 'Next' button on each page
    try:
        browser.click_link_by_partial_text('Next Page')
    except ElementDoesNotExist:
        # No 'Next Page' link left: the last listing page has been processed.
        print("Scraping Complete")
        break
    except Exception as exc:
        # Any other click failure also ends the crawl, but the cause is
        # reported instead of being hidden. Unlike a bare `except`, this
        # lets KeyboardInterrupt/SystemExit propagate.
        print("Scraping Complete")
        print(f"(stopped because the 'Next Page' click failed: {exc!r})")
        break


teams_url_list = ['http://www.fifaindex.com' + link for link in links]

Page 0 complete
Page 1 complete
Page 2 complete
Page 3 complete
Page 4 complete
Page 5 complete
Page 6 complete
Page 7 complete
Page 8 complete
Page 9 complete
Page 10 complete
Page 11 complete
Page 12 complete
Page 13 complete
Page 14 complete
Page 15 complete
Page 16 complete
Page 17 complete
Page 18 complete
Page 19 complete
Page 20 complete
Page 21 complete
Scraping Complete


In [5]:
# Report how many links were collected and preview the first few.
print(f"Total number of links obtained: {len(links)}\n")
print("The first 5 links are:")
# Slicing (instead of indexing 0..4) also works when fewer than 5 links exist.
for team_url in teams_url_list[:5]:
    print(team_url)


Total number of links obtained: 652

The first 5 links are:
http://www.fifaindex.com/es/team/241/fc-barcelona/
http://www.fifaindex.com/es/team/243/real-madrid/
http://www.fifaindex.com/es/team/45/juventus/
http://www.fifaindex.com/es/team/10/manchester-city/
http://www.fifaindex.com/es/team/21/fc-bayern/


### Second step
Each collected URL was scraped in order to obtain the details of the teams.

*Trial with one team to observe the HTML structure*

In [6]:
# Fetch a single team page and dump its HTML to inspect the structure.
url = 'https://www.fifaindex.com/es/team/241/fc-barcelona/'
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=30)
soup = bs(response.text, 'lxml')
print(soup.prettify())

<!DOCTYPE html>
<html lang="es">
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <link href="/favicon.ico" rel="icon"/>
  <link href="/ar/team/241/fc-barcelona/" hreflang="ar" rel="alternate"/>
  <link href="/cs/team/241/fc-barcelona/" hreflang="cs" rel="alternate"/>
  <link href="/da/team/241/fc-barcelona/" hreflang="da" rel="alternate"/>
  <link href="/de/team/241/fc-barcelona/" hreflang="de" rel="alternate"/>
  <link href="/team/241/fc-barcelona/" hreflang="en" rel="alternate"/>
  <link href="/es/team/241/fc-barcelona/" hreflang="es" rel="alternate"/>
  <link href="/es-mx/team/241/fc-barcelona/" hreflang="es-mx" rel="alternate"/>
  <link href="/fr/team/241/fc-barcelone/" hreflang="fr" rel="alternate"/>
  <link href="/it/team/241/fc-barcellona/" hreflang="it" rel="alternate"/>
  <link href="/ja/team/241/fc%EF%BE%8A%EF%BE%9E%EF%BE%99%EF%BD%BE%EF%BE%9B%EF%BE%85/" hreflan

Loop through all URLs and build a list of dictionaries with the information, in order to later convert it to a pandas DataFrame

In [7]:
# Scrape every team page and collect one record (dict) per team.
list_of_dict = []
for j, link in enumerate(teams_url_list, start=1):
    # A timeout keeps one unresponsive page from stalling the whole crawl.
    response = requests.get(link, timeout=30)
    soup = bs(response.text, 'lxml')

    team_info = soup.find('div', class_='pl-3')
    team_name = team_info.h1.text
    team_league = team_info.h2.text

    # Reset the per-team fields on every page. Without this, a page that
    # lacks one of the entries would silently reuse the previous team's
    # value (or raise NameError if it happened on the very first page).
    equipo_rival = None
    ataque = None
    mediocampo = None
    defensa = None
    presupuesto_de_traspasos_euro = None

    for result in soup.find_all('li', class_='list-group-item'):
        # The strings of the <li> are joined with '/', so the text before
        # the first '/' is the label and the remainder is the value.
        item = result.getText(separator='/')
        descriptor, _, value = item.partition('/')
        if descriptor == 'Equipo Rival':
            equipo_rival = value
        elif descriptor == 'Ataque':
            ataque = value
        elif descriptor == 'Mediocampo':
            mediocampo = value
        elif descriptor == 'Defensa':
            defensa = value
        else:
            # The transfer-budget entry is recognised by the text that
            # precedes the first newline instead.
            descriptor = item[:item.find('\n')]
            if descriptor == 'Presupuesto De Traspasos':
                # Keep the text between the '€' sign and 3 characters
                # before the '$' sign (the euro amount).
                presupuesto_de_traspasos_euro = item[item.find('€')+1:item.find('$')-3]

    miDict = {'team_name': team_name,
              'team_league': team_league,
              'rival_team': equipo_rival,
              'attack': ataque,
              'midfield': mediocampo,
              'defence': defensa,
              'transfer_budget': presupuesto_de_traspasos_euro}

    list_of_dict.append(miDict)
    print(f"Link {j} completed")
    


Link 1 completed
Link 2 completed
Link 3 completed
Link 4 completed
Link 5 completed
Link 6 completed
Link 7 completed
Link 8 completed
Link 9 completed
Link 10 completed
Link 11 completed
Link 12 completed
Link 13 completed
Link 14 completed
Link 15 completed
Link 16 completed
Link 17 completed
Link 18 completed
Link 19 completed
Link 20 completed
Link 21 completed
Link 22 completed
Link 23 completed
Link 24 completed
Link 25 completed
Link 26 completed
Link 27 completed
Link 28 completed
Link 29 completed
Link 30 completed
Link 31 completed
Link 32 completed
Link 33 completed
Link 34 completed
Link 35 completed
Link 36 completed
Link 37 completed
Link 38 completed
Link 39 completed
Link 40 completed
Link 41 completed
Link 42 completed
Link 43 completed
Link 44 completed
Link 45 completed
Link 46 completed
Link 47 completed
Link 48 completed
Link 49 completed
Link 50 completed
Link 51 completed
Link 52 completed
Link 53 completed
Link 54 completed
Link 55 completed
Link 56 completed
L

Link 438 completed
Link 439 completed
Link 440 completed
Link 441 completed
Link 442 completed
Link 443 completed
Link 444 completed
Link 445 completed
Link 446 completed
Link 447 completed
Link 448 completed
Link 449 completed
Link 450 completed
Link 451 completed
Link 452 completed
Link 453 completed
Link 454 completed
Link 455 completed
Link 456 completed
Link 457 completed
Link 458 completed
Link 459 completed
Link 460 completed
Link 461 completed
Link 462 completed
Link 463 completed
Link 464 completed
Link 465 completed
Link 466 completed
Link 467 completed
Link 468 completed
Link 469 completed
Link 470 completed
Link 471 completed
Link 472 completed
Link 473 completed
Link 474 completed
Link 475 completed
Link 476 completed
Link 477 completed
Link 478 completed
Link 479 completed
Link 480 completed
Link 481 completed
Link 482 completed
Link 483 completed
Link 484 completed
Link 485 completed
Link 486 completed
Link 487 completed
Link 488 completed
Link 489 completed
Link 490 com

In [27]:
# Turn the list of per-team records into a DataFrame (one row per team)
# and preview the first rows.
teams = pd.DataFrame(list_of_dict)
teams.head()

Unnamed: 0,attack,defence,midfield,rival_team,team_league,team_name,transfer_budget
0,87,85,86,Real Madrid,LaLiga Santander,FC Barcelona,188.000.000
1,83,86,88,FC Barcelona,LaLiga Santander,Real Madrid,188.500.000
2,89,85,84,Inter de Milán,Serie A TIM,Juventus,90.000.000
3,86,83,88,Manchester Utd,Premier League,Manchester City,170.000.000
4,85,85,85,Bor. Dortmund,Bundesliga,FC Bayern,100.000.000


# Data Transformation
Transform and normalize the scraped data

## FIFA teams data exploration and re-arrangement

In [28]:
# Summarise the columns, their dtypes and non-null counts.
teams.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 652 entries, 0 to 651
Data columns (total 7 columns):
attack             652 non-null object
defence            652 non-null object
midfield           652 non-null object
rival_team         652 non-null object
team_league        652 non-null object
team_name          652 non-null object
transfer_budget    652 non-null object
dtypes: object(7)
memory usage: 35.7+ KB


In [29]:
# Count distinct team names and distinct league names in the scraped table.
print(f"Number of teams: {teams['team_name'].nunique()}")
print(f"Number of leagues: {teams['team_league'].nunique()}")

Number of teams: 651
Number of leagues: 38


There seems to be a duplicated team

In [30]:
# List every row whose team_name occurs more than once
# (keep=False marks all occurrences, not only the repeats).
teams.loc[teams['team_name'].duplicated(keep=False)]

Unnamed: 0,attack,defence,midfield,rival_team,team_league,team_name,transfer_budget
106,74,72,75,Guadalajara,LIGA Bancomer MX,América,25.000.000
211,71,71,72,Atlético Mineiro,Liga do Brasil,América,750.000


There are two teams with the same name, but they are not duplicated

In [31]:
# Give every team a 1-based surrogate key derived from its row position and
# move the key to the front of the table.
column_order = ['team_id', 'team_name', 'team_league', 'rival_team',
                'attack', 'midfield', 'defence', 'transfer_budget']
teams = teams.assign(team_id=teams.index + 1)[column_order]
teams.head()

Unnamed: 0,team_id,team_name,team_league,rival_team,attack,midfield,defence,transfer_budget
0,1,FC Barcelona,LaLiga Santander,Real Madrid,87,86,85,188.000.000
1,2,Real Madrid,LaLiga Santander,FC Barcelona,83,88,86,188.500.000
2,3,Juventus,Serie A TIM,Inter de Milán,89,84,85,90.000.000
3,4,Manchester City,Premier League,Manchester Utd,86,88,83,170.000.000
4,5,FC Bayern,Bundesliga,Bor. Dortmund,85,85,85,100.000.000


Replace the name of the rival team with its team_id

In [32]:
# Plain name -> id lookup. Team names are NOT unique (two different clubs are
# both called "América"), so for a repeated name this keeps the last team.
teams_dict = teams[['team_name','team_id']].set_index('team_name')
teams_dict = teams_dict.to_dict()['team_id']

# (league, name) -> id lookup, used to disambiguate repeated team names:
# a rival is first looked up among the teams of the same league.
league_team_ids = {
    (league, name): team_id
    for league, name, team_id in zip(teams['team_league'],
                                     teams['team_name'],
                                     teams['team_id'])
}

# Resolve each rival name to a team_id: same-league match first, then the
# name-only lookup; names that match no scraped team are left unchanged.
rival_ids = [
    league_team_ids.get((league, rival), teams_dict.get(rival, rival))
    for league, rival in zip(teams['team_league'], teams['rival_team'])
]
teams = teams.assign(rival_team=rival_ids).rename(columns={'rival_team':'rival_team_id'})
teams.head()

Unnamed: 0,team_id,team_name,team_league,rival_team_id,attack,midfield,defence,transfer_budget
0,1,FC Barcelona,LaLiga Santander,2,87,86,85,188.000.000
1,2,Real Madrid,LaLiga Santander,1,83,88,86,188.500.000
2,3,Juventus,Serie A TIM,13,89,84,85,90.000.000
3,4,Manchester City,Premier League,12,86,88,83,170.000.000
4,5,FC Bayern,Bundesliga,15,85,85,85,100.000.000


### Add the country of the team and replace the team_league by the league_id

In [33]:
# Build the leagues table: one row per distinct league name, numbered from 1
# in order of first appearance in `teams`.
league_names = teams['team_league'].unique()
leagues = pd.DataFrame({'league_name': league_names})
leagues['league_id'] = leagues.index + 1

# This dictionary will be later used to change the team_league for the league_id
leagues_dict = leagues.set_index('league_name')['league_id'].to_dict()

leagues = leagues[['league_id', 'league_name']]
leagues.head()


Unnamed: 0,league_id,league_name
0,1,LaLiga Santander
1,2,Serie A TIM
2,3,Premier League
3,4,Bundesliga
4,5,Ligue 1 Conforama


In [34]:
# Country in which each league is mainly played, keyed by the league name as
# it appears on the scraped pages. 'Resto del mundo' and 'Agentes Libres' are
# not tied to a country and map to an empty string.
league_country_dict = {
    'LaLiga Santander': 'Spain',
    'Serie A TIM': 'Italy',
    'Premier League': 'England',
    'Bundesliga': 'Germany',
    'Ligue 1 Conforama': 'France',
    'Liga NOS': 'Portugal',
    'Eredivisie': 'Netherlands',
    'Süper Lig': 'Turkey',
    'SAF': 'Argentina',
    'Resto del mundo': '',
    'Bundesliga 2': 'Germany',
    'LIGA Bancomer MX': 'Mexico',
    'Pro League': 'Belgium',
    'Liga do Brasil': 'Brazil',  # English spelling, like every other country
    'Saudi Professional League': 'Saudi Arabia',
    'Ö. Bundesliga': 'Austria',
    'Scottish Prem': 'Scotland',
    'LaLiga 1 I 2 I 3': 'Spain',
    'EFL Championship': 'England',
    'RSL': 'Switzerland',
    'MLS': 'United States',
    'Liga Dimayor': 'Colombia',
    'Camp. Scotiabank': 'Chile',
    'CSL': 'China',
    "Domino's Ligue 2": 'France',
    'Calcio B': 'Italy',
    'Superliga': 'Denmark',
    'Meiji Yasuda J1': 'Japan',
    'Allsvenskan': 'Sweden',
    'Eliteserien': 'Norway',
    'Ekstraklasa': 'Poland',
    'K-League 1': 'South Korea',
    'EFL League One': 'England',
    'Hyundai A-League': 'Australia',
    '3. Liga': 'Germany',
    'EFL League Two': 'England',
    'SSE Airtricity Lge': 'Ireland',
    'Agentes Libres': '',
}

In [35]:
# Derive each team's country from its league name.
teams = teams.assign(country=teams['team_league']).replace({'country': league_country_dict})

# Build the countries table: one row per distinct country name, numbered
# from 1 in order of first appearance in `teams`.
country_names = teams['country'].unique()
countries = pd.DataFrame({'country_name': country_names})
countries['country_id'] = countries.index + 1

# This dictionary will be used to change the country for the country_id
countries_dict = countries.set_index('country_name')['country_id'].to_dict()

countries = countries[['country_id', 'country_name']]
countries.head()


Unnamed: 0,country_id,country_name
0,1,Spain
1,2,Italy
2,3,England
3,4,Germany
4,5,France


Handle exceptions

In [36]:
# Teams whose real country differs from the main country of their league:
# (team names, country assigned via the league, actual country).
country_overrides = [
    (['Montreal Impact', 'Toronto FC', 'Whitecaps FC'], 'United States', 'Canada'),
    (['AS Monaco'], 'France', 'Monaco'),
]
for team_names, league_country, real_country in country_overrides:
    # Only rows still carrying the league's country are rewritten.
    is_exception = teams['team_name'].isin(team_names) & (teams['country'] == league_country)
    teams.loc[is_exception, 'country'] = real_country

# The countries table and lookup were built before these overrides, so the
# newly introduced countries have no id yet. Append them with fresh ids
# (existing ids stay unchanged) so they can later be replaced by an id too.
missing_countries = [name for name in teams['country'].unique()
                     if name not in countries_dict]
if missing_countries:
    first_new_id = max(countries_dict.values(), default=0) + 1
    new_ids = list(range(first_new_id, first_new_id + len(missing_countries)))
    extra_countries = pd.DataFrame({'country_id': new_ids,
                                    'country_name': missing_countries})
    countries = pd.concat([countries, extra_countries],
                          ignore_index=True)[['country_id', 'country_name']]
    countries_dict.update(zip(missing_countries, new_ids))

Replace country name and league name by country_id and league_id

In [20]:
# Rename the two descriptive columns to their *_id names, then swap the
# names stored in them for the numeric ids from the lookup dictionaries.
id_column_names = {'country': 'country_id', 'team_league': 'league_id'}
id_lookups = {'country_id': countries_dict, 'league_id': leagues_dict}
teams = teams.rename(columns=id_column_names)
teams = teams.replace(id_lookups)

In [21]:
# Attach to every league the id of its main country:
# league name -> country name -> country id.
league_country_names = leagues['league_name'].replace(league_country_dict)
leagues = leagues.assign(main_country_id=league_country_names.replace(countries_dict))

leagues.head()

Unnamed: 0,league_id,league_name,main_country_id
0,1,LaLiga Santander,1
1,2,Serie A TIM,2
2,3,Premier League,3
3,4,Bundesliga,4
4,5,Ligue 1 Conforama,5


## Resulting tables

In [37]:
# Preview the first rows of the teams table.
teams.head()

Unnamed: 0,team_id,team_name,team_league,rival_team_id,attack,midfield,defence,transfer_budget,country
0,1,FC Barcelona,LaLiga Santander,2,87,86,85,188.000.000,Spain
1,2,Real Madrid,LaLiga Santander,1,83,88,86,188.500.000,Spain
2,3,Juventus,Serie A TIM,13,89,84,85,90.000.000,Italy
3,4,Manchester City,Premier League,12,86,88,83,170.000.000,England
4,5,FC Bayern,Bundesliga,15,85,85,85,100.000.000,Germany


In [23]:
# Preview the first rows of the leagues table.
leagues.head()

Unnamed: 0,league_id,league_name,main_country_id
0,1,LaLiga Santander,1
1,2,Serie A TIM,2
2,3,Premier League,3
3,4,Bundesliga,4
4,5,Ligue 1 Conforama,5


In [24]:
# Preview the first rows of the countries table.
countries.head()

Unnamed: 0,country_id,country_name
0,1,Spain
1,2,Italy
2,3,England
3,4,Germany
4,5,France


In [38]:
# Write the three resulting tables to CSV files (UTF-8, without the index).
exports = (
    (teams, "../Resources/teams.csv"),
    (leagues, "../Resources/leagues.csv"),
    (countries, "../Resources/countries.csv"),
)
for table, csv_path in exports:
    table.to_csv(csv_path, index=False, encoding='UTF-8')