In [1]:
from datetime import datetime
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

import bs4
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from dateutil.parser import parse

In [3]:

class PlayerInfo:
    """Collect basic player information for the current Premier League clubs.

    On construction the class:

    1. scrapes the club names from ``clubs_url`` (``get_clubs``),
    2. pages through the players API at ``players_url`` and keeps only the
       players whose current team is one of those clubs (``get_player_info``),
    3. converts the collected columns into ``self.df``, a pandas DataFrame
       indexed by player id (``convert_to_df``), ready to be joined with
       player stats on that id.

    The league and season are fixed by the hard-coded URLs; choosing another
    league is not supported yet.

    Attributes:
        player_info: dict mapping column name to a list of values, one entry
            per kept player. All lists always have the same length.
        clubs: club names scraped from ``clubs_url``.
        players_url: first-page URL of the players API; never modified.
        clubs_url: page listing the current clubs.
        df: resulting DataFrame (empty until ``convert_to_df`` has run).
    """

    # Seconds to wait on either remote endpoint before giving up, so a
    # stalled server cannot hang the constructor forever.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        # dictionary of lists to store data for all players
        self.player_info = {
            'id': [],
            'name': [],
            'role': [],
            'position': [],
            'shirtNum': [],
            'country': [],
            'club': [],
            'dob': [],
        }
        # list for relevant club names
        self.clubs = []
        # link for football api (first page; other pages are derived from it)
        self.players_url = "https://footballapi.pulselive.com/football/players?pageSize=30&compSeasons=274&altIds=true&page=0&type=player&id=-1&compSeasonId=274"
        # link for current premier league clubs
        self.clubs_url = "https://www.premierleague.com/clubs"
        # initialize pandas DataFrame object
        self.df = pd.DataFrame()
        # web-scrape clubs
        self.get_clubs()
        # fetch player info based on clubs
        self.get_player_info()
        # convert the data from dict to dataframe and clean
        self.convert_to_df()

    def get_clubs(self):
        """Scrape the club names from ``clubs_url`` into ``self.clubs``.

        The list is rebuilt on every call, so calling this twice does not
        produce duplicate entries.

        Raises:
            requests.RequestException: on timeout, connection failure or a
                non-2xx HTTP status.
        """
        response = requests.get(self.clubs_url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        self.clubs = [
            tag.text for tag in soup.find_all("h4", {"class": "clubName"})
        ]

    def list_content_empty(self, li):
        """Return True when a decoded API page carries no players.

        Args:
            li: decoded JSON payload of one API page.

        Returns:
            True if ``li['content']`` is empty or cannot be read at all
            (missing key, payload that is not a mapping); False otherwise.
        """
        try:
            return not li['content']
        except (KeyError, TypeError):
            return True

    def _page_url(self, page):
        """Return ``players_url`` with its ``page`` query parameter set to *page*.

        The URL is rebuilt from the stored first-page URL on every call
        instead of being edited in place, so ``players_url`` keeps pointing
        at the first page and ``get_player_info`` can be run again.
        """
        parts = urlsplit(self.players_url)
        pairs = parse_qsl(parts.query, keep_blank_values=True)
        if any(key == 'page' for key, _ in pairs):
            pairs = [
                (key, str(page) if key == 'page' else value)
                for key, value in pairs
            ]
        else:
            pairs.append(('page', str(page)))
        return urlunsplit(parts._replace(query=urlencode(pairs)))

    def _extract_player(self, p):
        """Build one complete row for player *p*.

        Args:
            p: one element of an API page's ``content`` list.

        Returns:
            A dict with exactly the keys of ``player_info``, or ``None`` when
            the player is not at one of ``self.clubs`` or a required field
            (id, name, role, position, club, country) is missing or
            malformed. A missing shirt number becomes ``0`` and an unreadable
            birth date becomes ``'N/A'``; ``convert_to_df`` turns both
            placeholders into NaN.
        """
        try:
            club = p['currentTeam']['name']
            if club not in self.clubs:
                return None
            record = {
                'id': int(p['playerId']),
                'name': p['name']['display'],
                'role': p['info']['position'],
                'position': p['info']['positionInfo'],
                'club': club,
                'country': p['birth']['country']['country'],
            }
        except (KeyError, TypeError, ValueError):
            # Required data is absent: skip the player as a whole rather
            # than leaving some columns one entry longer than others.
            return None

        try:
            record['shirtNum'] = int(p['info']['shirtNum'])
        except (KeyError, TypeError, ValueError):
            record['shirtNum'] = 0

        try:
            record['dob'] = str(parse(p['birth']['date']['label']).date())
        except (KeyError, TypeError, ValueError, OverflowError):
            record['dob'] = 'N/A'

        return record

    def get_player_info(self):
        """Page through the players API and fill ``self.player_info``.

        Pages are requested starting at 0 until one comes back without
        content. Previously collected rows are discarded first, so the method
        can be called again to refresh the data.

        Raises:
            requests.RequestException: on timeout, connection failure or a
                non-2xx HTTP status.
            ValueError: if a response body is not valid JSON.
        """
        for values in self.player_info.values():
            values.clear()

        page = 0
        while True:
            response = requests.get(
                self._page_url(page), timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()
            payload = response.json()

            # if no more content
            if self.list_content_empty(payload):
                break

            for p in payload['content']:
                record = self._extract_player(p)
                if record is None:
                    continue
                # Append every column in one go so the lists stay aligned.
                for column, value in record.items():
                    self.player_info[column].append(value)

            page += 1

    def convert_to_df(self):
        """Turn ``self.player_info`` into ``self.df``, indexed by player id.

        The ``'N/A'`` and shirt number ``0`` placeholders are replaced by NaN.
        """
        # convert to pandas dataframe
        frame = pd.DataFrame.from_dict(self.player_info, orient='columns')
        # use unique player id as index
        frame = frame.set_index('id')
        # clean some data
        frame = frame.replace('N/A', np.nan)
        # Assign the column back instead of replacing in place on a selected
        # column, which does not update the frame under pandas Copy-on-Write.
        frame['shirtNum'] = frame['shirtNum'].replace(0, np.nan)
        self.df = frame
        
# Build the dataset. The constructor scrapes the club list, pages through the
# players API and assembles the DataFrame, so this line performs network I/O.
pl = PlayerInfo()
# Bare expression on the last line of the cell: a notebook displays its value,
# here the player DataFrame indexed by player id.
pl.df

Unnamed: 0_level_0,club,country,dob,name,position,role,shirtNum
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
133868,Norwich City,England,2000-01-04,Max Aarons,Right Full Back,D,2.0
161456,Southampton,England,1996-07-13,Che Adams,Centre Attacking Midfielder,M,
23097,Everton,England,1999-01-02,Dennis Adeniran,Centre Central Midfielder,M,
161346,Liverpool,Spain,1987-01-03,Adrián,Goalkeeper,G,13.0
6009,Manchester City,Argentina,1988-06-02,Sergio Agüero,Centre Striker,F,10.0
50119,Aston Villa,Egypt,1987-09-09,Ahmed El Mohamady,Right Full Back,D,27.0
161559,West Ham United,Switzerland,1997-02-26,Albian Ajeti,Centre Striker,F,27.0
47172,AFC Bournemouth,Netherlands,1995-02-18,Nathan Aké,Left/Centre Central Defender,D,5.0
13981,Leicester City,England,1989-11-18,Marc Albrighton,Left/Right Winger,M,11.0
20234,Tottenham Hotspur,Belgium,1989-03-02,Toby Alderweireld,Centre/Right Central Defender,D,4.0
