In [77]:
import requests
import re
import bs4
from bs4 import BeautifulSoup 
import numpy as np
import pandas as pd
from dateutil.parser import parse
from datetime import datetime 

In [105]:

class PlayerInfo:
    """Fetch basic Premier League player information into a pandas DataFrame.

    1. This class web-scrapes the current Premier League clubs page to find
       each club's number, then queries the 'footballapi' API once per club
       so that only Premier League player data is fetched.
    2. The data is converted into a pandas DataFrame (``self.df``), indexed by
       player id, to be used for analysis or to be joined with player stats
       data based on player id.
    3. Modifications will be made to allow the user to choose which soccer
       league's player data they want to grab.

    Note: constructing an instance performs live HTTP requests; errors raised
    by ``requests`` (including timeouts) or by JSON decoding propagate to the
    caller.
    """

    # seconds to wait for each HTTP response before giving up
    REQUEST_TIMEOUT = 30
    # captures the club number and the club-name slug from a club link,
    # e.g. '/clubs/131/Brighton-and-Hove-Albion/overview'
    _CLUB_LINK = re.compile(r'clubs/([0-9]+)/([^/?#]+)')

    def __init__(self):
        # dictionary of lists to store data for all players
        # (every list must always have the same length: one entry per player)
        self.player_info = {
            'id' : [],
            'name' : [],
            'role' : [],
            'position': [],
            'shirtNum': [],
            'country' : [],
            'club' : [],
            'dob' : []
        }
        # list for relevant club names
        self.clubs = []
        # list of (club number, club name) tuples, filled by get_clubs()
        self.club_and_num = []
        # link for football api ('?' is replaced by the club number)
        self.players_url = "https://footballapi.pulselive.com/football/teams/?/compseasons/274/staff?pageSize=30&compSeasons=274&altIds=true&page=0&type=player"
        # link for current premier league clubs
        self.clubs_url = "https://www.premierleague.com/clubs"
        # initialize pandas DataFrame object
        self.df = pd.DataFrame()
        # web-scrape clubs
        self.get_clubs()
        # fetch player info based on clubs
        self.get_player_info()
        # convert the data from dict to dataframe and clean
        self.convert_to_df()

    def get_clubs(self):
        """Scrape the clubs page and record every club found on it.

        Populates ``self.club_and_num`` with ``(club number, club name)``
        string tuples and ``self.clubs`` with the club names.
        """
        response = requests.get(self.clubs_url, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.content, 'html.parser')
        # href=True skips anchors that carry no href attribute at all
        hrefs = [anchor['href'] for anchor in soup.find_all('a', href=True)]
        self.club_and_num = self._parse_club_links(hrefs)
        self.clubs = [name for _, name in self.club_and_num]

    @staticmethod
    def _parse_club_links(hrefs):
        """Turn an iterable of href strings into unique (number, name) tuples.

        Links that do not contain ``clubs/<number>/<name>`` are ignored.
        Club names are tidied: '-and-' becomes ' & ' and remaining hyphens
        become spaces. First-seen order is preserved.
        """
        unique = {}
        for href in hrefs:
            match = PlayerInfo._CLUB_LINK.search(href)
            if match is None:
                continue
            number, slug = match.groups()
            name = slug.replace('-and-', '-&-').replace('-', ' ')
            # a dict keeps insertion order and drops repeated links, which
            # would otherwise make every player of that club appear twice
            unique.setdefault((number, name), None)
        return list(unique)

    def get_player_info(self):
        """Query the API for every club in ``self.club_and_num``.

        Each usable player is appended to the lists in ``self.player_info``.
        A club whose response has no usable 'players' list is skipped.
        """
        for club_number, club_name in self.club_and_num:
            url = self.players_url.replace('teams/?/', f'teams/{club_number}/')
            payload = requests.get(url, timeout=self.REQUEST_TIMEOUT).json()
            players = payload.get('players') if isinstance(payload, dict) else None
            if not isinstance(players, list):
                # best-effort: nothing to read for this club
                continue
            for player in players:
                self._add_player(player, club_name)

    def _add_player(self, player, club_name):
        """Append one API player entry to ``self.player_info``.

        The whole record is built first and appended in one go, so a
        malformed entry can never leave the column lists with different
        lengths. Returns True if the player was added, False if a required
        field (id, name, role, position) was missing or invalid.
        """
        try:
            record = {
                'id': int(player['id']),
                'name': player['name']['display'],
                'role': player['info']['position'],
                'position': player['info']['positionInfo'],
            }
        except (KeyError, TypeError, ValueError, OverflowError):
            return False
        record['club'] = club_name
        # optional fields fall back to the sentinels that convert_to_df()
        # later turns into NaN (0 for shirtNum, 'N/A' for text fields)
        try:
            record['shirtNum'] = int(player['info']['shirtNum'])
        except (KeyError, TypeError, ValueError, OverflowError):
            record['shirtNum'] = 0
        try:
            record['country'] = player['birth']['country']['country']
        except (KeyError, TypeError):
            record['country'] = 'N/A'
        try:
            label = player['birth']['date']['label']
            record['dob'] = str(parse(label).date())
        except (KeyError, TypeError, ValueError, OverflowError):
            record['dob'] = 'N/A'
        for column, value in record.items():
            self.player_info[column].append(value)
        return True

    def convert_to_df(self):
        """Build ``self.df`` from ``self.player_info`` and clean it.

        The frame is indexed by player id, the 'N/A' and 0 (shirt number)
        sentinels become NaN, and column names are title-cased
        (e.g. 'shirtNum' becomes 'Shirtnum').
        """
        #convert to pandas dataframe and use unique player id as index
        frame = pd.DataFrame.from_dict(self.player_info, orient='columns').set_index('id')
        #clean some data
        frame = frame.replace('N/A', np.nan)
        # assign the result back: an in-place replace on a selected column
        # does not reliably modify the parent frame
        frame['shirtNum'] = frame['shirtNum'].replace(0, np.nan)
        #capitalize first letter of column names
        # NOTE(review): an earlier rename to "Shirt Number" ran after the
        # title-casing and never matched; existing names are kept for callers
        self.df = frame.rename(columns=str.title)

In [113]:
# Build the player table; the constructor scrapes the clubs page and calls
# the player API for each club, so this cell performs live HTTP requests.
pl = PlayerInfo()


In [114]:
# Show only the players whose club is Liverpool
pl.df.loc[pl.df['Club'].eq("Liverpool")]


Unnamed: 0_level_0,Club,Country,Dob,Name,Position,Role,Shirtnum
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
20559,Liverpool,Brazil,1992-10-02,Alisson,Goalkeeper,G,1.0
14723,Liverpool,Ireland,1998-11-23,Caoimhin Kelleher,Goalkeeper,G,62.0
5140,Liverpool,Netherlands,1991-07-08,Virgil van Dijk,Centre Central Defender,D,4.0
4813,Liverpool,Bosnia And Herzegovina,1989-07-05,Dejan Lovren,Centre Central Defender,D,6.0
10651,Liverpool,England,1997-05-23,Joseph Gomez,Left/Centre/Right Central Defender,D,12.0
10458,Liverpool,Scotland,1994-03-11,Andrew Robertson,Left Full Back,D,26.0
5375,Liverpool,Germany,1991-08-08,Joel Matip,Centre Central Defender,D,32.0
52951,Liverpool,Netherlands,2002-01-18,Ki-Jana Hoever,Centre/Right Full Back,D,51.0
14732,Liverpool,England,1998-10-07,Trent Alexander-Arnold,Right Full Back,D,66.0
11247,Liverpool,Brazil,1993-10-23,Fabinho,Centre Defensive Midfielder,M,3.0
