In [1]:
#RUNNING ALL FUNCTIONS AND IMPORTS
from bs4 import BeautifulSoup as bs
import requests
import re
import pandas as pd
import numpy as np
from time import sleep
from random import randint

#Making Soup function
def soup_making(url):
    my_page = requests.get(url)
    soup = bs(my_page.text, "html.parser")
    return soup

#getting the text function
def get_text(my_list):
    data = []
    for item in my_list:
        data.append(item.getText())
    return data

#finds all classes for a given tag
def find_all_classes(my_soup,tag):
    tags = my_soup.find_all(tag)
    all_td_classes = set()
    for tag in tags:
        for c in tag.attrs['class']:
            all_td_classes.add(c)
    return list(all_td_classes)

#creating a dict with classes names as Keys and the texts as values
def create_dict(my_soup,classes):
    data_dict = {}
    for item in classes:
        values = get_text(my_soup.find_all('td',{'class':item}))
        data_dict.update({item:values})
    return data_dict

#clean given string
def clean_string(my_list,string):
    data =[]
    for item in my_list:
        a = re.sub(string,'',item)
        data.append(a)
    return data

#remove undesired classes
def remove_from_my_classes(my_classes,classes_to_remove):
    for x in classes_to_remove:
        my_classes.remove(x)
    return my_classes

#creates a list of urls to iterate
def create_urls_list(url,indexes):
    urls_list = []
    for index in indexes:
        current_url = url + str(index)
        urls_list.append(current_url)
    return urls_list


In [2]:
#CREATING all the URLS
#generating url indexes
indexes = np.arange(0,17400,60)
url = 'https://sofifa.com/players? showCol%5B0%5D=pi&showCol%5B1%5D=ae&showCol%5B2%5D=oa&showCol%5B3%5D=pt&showCol%5B4%5D=bp&showCol%5B5%5D=gu&showCol%5B6%5D=le&showCol%5B7%5D=vl&showCol%5B8%5D=wg&showCol%5B9%5D=tt&showCol%5B10%5D=pac&showCol%5B11%5D=sho&showCol%5B12%5D=pas&showCol%5B13%5D=dri&showCol%5B14%5D=def&showCol%5B15%5D=phy&offset='
urls = create_urls_list(url,indexes)



In [3]:
#FUNCTION THAT SCRAPES ALL SOFIFA PAGES
def create_final_df(urls):
    df_list = []
    for url in urls:
        my_dict = {}
        soup = soup_making(url)
        my_classes = find_all_classes(soup,'td')
        classes_to_remove = ['col','col-avatar','col-comment','col-name']
        my_classes = remove_from_my_classes(my_classes,classes_to_remove)
        my_dict = create_dict(soup, my_classes)
        #~~~~~~getting team, name and contract
        #CONTRACT
        contract = soup.find_all('div',{'class':'sub'})
        contract = get_text(contract)
        contract = clean_string(contract,'\n*')
        #NAME AND TEAM
        name_team_contract = soup.find_all('div',{'class':'bp3-text-overflow-ellipsis'})
        name_team_contract = get_text(name_team_contract)
        #NAME
        name = name_team_contract[::2]
        name = clean_string(name,'\\d|\~')
        #TEAM
        team = name_team_contract[1::2]
        team = clean_string(team,'\n*|\\d|\~')
        #adding them to the dict
        my_dict['Contract'] = contract
        my_dict['Name'] = name
        my_dict['Team'] = team
        df = pd.DataFrame(my_dict)
        df_list.append(df)
        sleep(randint(2,10))
    return pd.concat(df_list)

final_df = create_final_df(urls)

In [16]:
print(final_df)

   col-pt col-pas col-bp        col-le col-gu col-vl col-sho col-tt col-pac  \
0      81      64     ST  Jun 30, 2021      7  €8.5M      73   1873      82   
1      71      61     ST           N/A      2     €0      67   1716      73   
2      84      69    CAM           N/A     10  €9.5M      64   1873      79   
3      75      57     GK           N/A      4    €2M      66   1141      78   
4      83      79    CAM           N/A      5   €21M      71   2052      74   
..    ...     ...    ...           ...    ...    ...     ...    ...     ...   
55     73      60     ST           N/A      5  €1.8M      69   1725      77   
56     69      58    CDM           N/A      3  €1.1M      50   1740      70   
57     70      57    CDM           N/A     10  €550K      46   1586      52   
58     76      65    CAM           N/A     16  €650K      51   1547      65   
59     70      40     CB           N/A      4  €1.1M      25   1384      53   

   col-oa  col-pi col-def col-phy col-wg col-ae col

In [20]:
#saving df into a csv datbase
final_df.to_csv("./FIFAPlayersAnalisis/FIFAPlayersData.csv")

In [18]:
#ANALYSIS OF SOFIFA URLS
#1-https://sofifa.com/players?&showCol%5B %5D=pi&showCol%5B %5D=ae&showCol%5B %5D=oa&showCol%5B %5D=pt&showCol%5B %5D=bp&showCol%5B %5D=gu&showCol%5B%5D=le&showCol%5B%5D=vl&showCol%5B%5D=wg&showCol%5B%5D=tt&showCol%5B%5D=pac&showCol%5B%5D=sho&showCol%5B%5D=pas&showCol%5B%5D=dri&showCol%5B%5D=def&showCol%5B%5D=phy
#2-https://sofifa.com/players? showCol%5B0%5D=pi&showCol%5B1%5D=ae&showCol%5B2%5D=oa&showCol%5B3%5D=pt&showCol%5B4%5D=bp&showCol%5B5%5D=gu&showCol%5B6%5D=le&showCol%5B7%5D=vl&showCol%5B8%5D=wg&showCol%5B9%5D=tt&showCol%5B10%5D=pac&showCol%5B11%5D=sho&showCol%5B12%5D=pas&showCol%5B13%5D=dri&showCol%5B14%5D=def&showCol%5B15%5D=phy&offset=60
#3-https://sofifa.com/players? showCol%5B0%5D=pi&showCol%5B1%5D=ae&showCol%5B2%5D=oa&showCol%5B3%5D=pt&showCol%5B4%5D=bp&showCol%5B5%5D=gu&showCol%5B6%5D=le&showCol%5B7%5D=vl&showCol%5B8%5D=wg&showCol%5B9%5D=tt&showCol%5B10%5D=pac&showCol%5B11%5D=sho&showCol%5B12%5D=pas&showCol%5B13%5D=dri&showCol%5B14%5D=def&showCol%5B15%5D=phy&offset=120

In [4]:
#TESTING
# url = 'https://sofifa.com/players? showCol%5B0%5D=pi&showCol%5B1%5D=ae&showCol%5B2%5D=oa&showCol%5B3%5D=pt&showCol%5B4%5D=bp&showCol%5B5%5D=gu&showCol%5B6%5D=le&showCol%5B7%5D=vl&showCol%5B8%5D=wg&showCol%5B9%5D=tt&showCol%5B10%5D=pac&showCol%5B11%5D=sho&showCol%5B12%5D=pas&showCol%5B13%5D=dri&showCol%5B14%5D=def&showCol%5B15%5D=phy&offset=6000'
# soup = soup_making(url)

# my_classes = find_all_classes(soup,'td')
# classes_to_remove = ['col','col-avatar','col-comment','col-name']
# my_classes = remove_from_my_classes(my_classes,classes_to_remove)

# my_dict = create_dict(my_classes)

# #~~~~~~getting team, name and contract
# #CONTRACT
# contract = soup.find_all('div',{'class':'sub'})
# contract = get_text(contract)
# contract = clean_string(contract,'\n*')
# #NAME AND TEAM
# name_team_contract = soup.find_all('div',{'class':'bp3-text-overflow-ellipsis'})
# name_team_contract = get_text(name_team_contract)
# #NAME
# name = name_team_contract[::2]
# name = clean_string(name,'\\d|\~')
# #TEAM
# team = name_team_contract[1::2]
# team = clean_string(team,'\n*|\\d|\~')
# #adding them to the dict
# my_dict['Contract'] = contract
# my_dict['Name'] = name
# my_dict['Team'] = team
# #~~~~Creating data frame with all the data
# df = pd.DataFrame(my_dict)
#df.head()

In [17]:
# #TESTING OF DATA SOTRING METHOD USING DF's
# dff_list = []
# d1 = {'key1': 'x1', 'key2': 'y1'}  
# d2 = {'key1': 'x2', 'key2': 'y2'}  
# d3 = {'key1': 'x3', 'key2': 'y3'}  
# d4 = {'key1': 'x4', 'key2': 'y4'}  


# dff = pd.DataFrame(d1,index=[0])
# #print(dff)
# dff_list.append(dff)
# #print(dff_list)

# dff = pd.DataFrame(d2,index=[0])
# #print(dff)
# dff_list.append(dff)
# dff = pd.DataFrame(d3,index=[0])
# dff_list.append(dff)
# dff = pd.DataFrame(d4,index=[0])
# dff_list.append(dff)
# de = pd.concat(dff_list)

# print(de)
# print(type(de))