# Scraping MLSSoccer.com

This notebook scrapes the historical season data that MLSSoccer.com stores for the majority of active players in the league. It was written against, and worked with, the website as it appeared on April 24th, 2018.

In [None]:
import requests
import csv
import re
import pandas as pd
from bs4 import BeautifulSoup
from time import sleep

### Grab All Team Links

In [None]:
# Download the 2018 roster index page and parse it; the next cell reads the
# team drop-down from `soup`.
url = "https://www.mlssoccer.com/rosters/2018"
# The response must be stored under the same name it is read from below;
# a timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=30)
# Fail here with a clear HTTP error instead of parsing an error page later on.
response.raise_for_status()
page = response.text
soup = BeautifulSoup(page, "html5lib")

In [None]:
# Build one roster URL per entry of the first <optgroup> on the roster index page.
teams = soup.optgroup
team_links = []
# find_all("option") yields only <option> tags. Iterating the <optgroup> tag
# directly would also yield any whitespace text nodes between the options,
# and those cannot be indexed by attribute name.
for option in teams.find_all("option"):
    value = option.get("value", "")
    # Skip options with no value and any option whose value ends in "8"
    # (presumably the league-wide ".../2018" entry rather than a club - TODO confirm).
    if not value or value.endswith("8"):
        continue
    # Only the last path segment of the option value is appended to the roster URL.
    team_links.append("https://www.mlssoccer.com/rosters/2018/" + value.split("/")[-1])

In [None]:
# Occasionally, URLs grabbed from a roster page do not properly link to a player's page.
# Add each one here as it is discovered so that the scraping loop skips it
# instead of breaking on it.
exclude_list = [
    "https://www.mlssoccer.com/players/Shaft-Brewer",
    "https://www.mlssoccer.com/players/Maximiano",
    "https://www.mlssoccer.com/players/Bertrand-Owundi-Eko",
    "https://www.mlssoccer.com/players/Zakaria-Diallo",
    "https://www.mlssoccer.com/players/Thomas-Meilleur-Giguere",
    "https://www.mlssoccer.com/players/Antonio-Delamea-Mlinar",
    "https://www.mlssoccer.com/players/Mark-Segbers",
    "https://www.mlssoccer.com/players/Ismael--Tajouri-Shradi",
    "https://www.mlssoccer.com/players/Earl-Edwards-Jr.",
    "https://www.mlssoccer.com/players/Amro-Tarek",
    "https://www.mlssoccer.com/players/Eryk-Williamson",
    "https://www.mlssoccer.com/players/Paul-Marie",
    "https://www.mlssoccer.com/players/Danny-Musovski",
    "https://www.mlssoccer.com/players/Mohamed-Thiaw",
    "https://www.mlssoccer.com/players/Aiden-Daniels",
    "https://www.mlssoccer.com/players/Simon-Colyn",
    "https://www.mlssoccer.com/players/Justin-Fiddes",
    "https://www.mlssoccer.com/players/David-Norman-Jr.",
]

In [None]:
# Scrape every player's season table, team by team.
# The first row of each list is the header; each scraped season row is appended
# below it. Goalkeepers have different stat columns, so they get their own list.
stats = [["Player", "Position", "Year", "Club", "GP", "GS", "G", "MINS", "A", "SHTS", "SOG", "FC", "OFF", "Y", "R"]]
keepers = [["Player", "Position", "Year", "Club", "GP", "GS", "MINS", "SHO", "SV", "GA", "PKG", "PKA", "W", "L", "T"]]

# A season row is complete once it holds one value per header column
# (both headers have the same number of columns).
row_width = len(stats[0])
# Pages that could not be parsed are recorded here rather than aborting the scrape.
failed_links = []

for team in team_links:
    sleep(1)  # pause between team pages
    response_team = requests.get(team, timeout=30)
    team_soup = BeautifulSoup(response_team.text, "html5lib")
    roster = team_soup.table  # first <table> on the team page
    if roster is None:
        print("No roster table found, skipping team page:", team)
        failed_links.append(team)
        continue
    # href=True ignores anchors without a link target, which would raise KeyError.
    player_links = ["https://www.mlssoccer.com" + anchor["href"]
                    for anchor in roster.find_all("a", href=True)]

    for link in player_links:
        # Replace any whitespace in the link with hyphens before requesting it.
        link = re.sub(r'\s', '-', link)
        if link in exclude_list:
            continue
        sleep(2)  # pause between player pages
        player_response = requests.get(link, timeout=30)
        # Kept in its own name so the roster index `soup` built in the earlier
        # cell is not overwritten by a player page.
        player_soup = BeautifulSoup(player_response.text, "html5lib")
        title_tag = player_soup.find("title")
        position_tag = player_soup.find('span', {"class": "position"})
        if title_tag is None or position_tag is None:
            # Not a usable player page: record it and keep scraping.
            print("Skipping unparseable player page:", link)
            failed_links.append(link)
            continue
        # The name is the part of the page title before the first "|";
        # strip() drops the space that precedes the separator.
        name = title_tag.text.split("|")[0].strip()
        pos = position_tag.text
        row = [name, pos]
        for td in player_soup.find_all("td"):
            data_title = td.get("data-title")
            if data_title is None:
                # Cells without a data-title attribute are not stat cells.
                continue
            if data_title == "Date":
                # Stop at the first "Date" cell - presumably the start of a
                # per-game table that follows the season rows (TODO confirm).
                break
            row.append(td.text)
            if len(row) == row_width:
                if pos == "Goalkeeper":
                    keepers.append(row)
                else:
                    stats.append(row)
                # Start the next season row for the same player.
                row = [name, pos]

If an error occurs while scraping a URL, run the following code block to see where it occurred: it prints the table cell, the player link, and the team page that were being processed at the time.

In [None]:
# Print, one per line, the table cell, player link and team page that the
# scraping loop was last working on.
print(td, link, team, sep="\n")

In [None]:
# Write data on field players and goalkeepers to separate .csv files.
# encoding="utf-8" is explicit because scraped player names may contain
# non-ASCII characters, which the platform default encoding may not be able to write.
# newline="" leaves line endings to the csv module.

with open("output.csv", "w", newline="", encoding="utf-8") as out_file:
    writer = csv.writer(out_file)
    writer.writerows(stats)

with open("keepers.csv", "w", newline="", encoding="utf-8") as keep_outfile:
    writer_keep = csv.writer(keep_outfile)
    writer_keep.writerows(keepers)