In [1]:
#RUNNING ALL FUNCTIONS AND IMPORTS
from bs4 import BeautifulSoup as bs
import requests
import re
import pandas as pd
import numpy as np
from time import sleep
from random import randint

#Making Soup function
def soup_making(url):
    my_page = requests.get(url)
    soup = bs(my_page.text, "html.parser")
    return soup

#getting the text function
def get_text(my_list):
    data = []
    for item in my_list:
        data.append(item.getText())
    return data

#finds all classes for a given tag
def find_all_classes(my_soup,tag):
    tags = my_soup.find_all(tag)
    all_td_classes = set()
    for tag in tags:
        for c in tag.attrs['class']:
            all_td_classes.add(c)
    return list(all_td_classes)

#creating a dict with classes names as Keys and the texts as values
def create_dict(my_soup,classes):
    data_dict = {}
    for item in classes:
        values = get_text(my_soup.find_all('td',{'class':item}))
        data_dict.update({item:values})
    return data_dict

#clean given string
def clean_string(my_list,string):
    data =[]
    for item in my_list:
        a = re.sub(string,'',item)
        data.append(a)
    return data

#remove undesired classes
def remove_from_my_classes(my_classes,classes_to_remove):
    for x in classes_to_remove:
        my_classes.remove(x)
    return my_classes

#creates a list of urls to iterate
def create_urls_list(url,indexes):
    urls_list = []
    for index in indexes:
        current_url = url + str(index)
        urls_list.append(current_url)
    return urls_list


In [2]:
#CREATING all the URLS
#generating url indexes
indexes = np.arange(0,17400,60)
url = 'https://sofifa.com/players? showCol%5B0%5D=pi&showCol%5B1%5D=ae&showCol%5B2%5D=oa&showCol%5B3%5D=pt&showCol%5B4%5D=bp&showCol%5B5%5D=gu&showCol%5B6%5D=le&showCol%5B7%5D=vl&showCol%5B8%5D=wg&showCol%5B9%5D=tt&showCol%5B10%5D=pac&showCol%5B11%5D=sho&showCol%5B12%5D=pas&showCol%5B13%5D=dri&showCol%5B14%5D=def&showCol%5B15%5D=phy&offset='
urls = create_urls_list(url,indexes)



In [3]:
#FUNCTION THAT SCRAPES ALL SOFIFA PAGES
def create_final_df(urls):
    df_list = []
    for url in urls:
        my_dict = {}
        soup = soup_making(url)
        my_classes = find_all_classes(soup,'td')
        classes_to_remove = ['col','col-avatar','col-comment','col-name']
        my_classes = remove_from_my_classes(my_classes,classes_to_remove)
        my_dict = create_dict(soup, my_classes)
        #~~~~~~getting team, name and contract
        #CONTRACT
        contract = soup.find_all('div',{'class':'sub'})
        contract = get_text(contract)
        contract = clean_string(contract,'\n*')
        #NAME AND TEAM
        name_team_contract = soup.find_all('div',{'class':'bp3-text-overflow-ellipsis'})
        name_team_contract = get_text(name_team_contract)
        #NAME
        name = name_team_contract[::2]
        name = clean_string(name,'\\d|\~')
        #TEAM
        team = name_team_contract[1::2]
        team = clean_string(team,'\n*|\\d|\~')
        #adding them to the dict
        my_dict['Contract'] = contract
        my_dict['Name'] = name
        my_dict['Team'] = team
        df = pd.DataFrame(my_dict)
        df_list.append(df)
        sleep(randint(2,10))
    return pd.concat(df_list)

final_df = create_final_df(urls)

In [5]:
print(final_df)

    col-pi col-tt col-gu        col-le col-pac col-sho  col-vl col-bp col-wg  \
0   245940   1901     10           N/A      89      71  €12.5M     ST   €17K   
1   232488   1674      6  Jun 30, 2021      68      35  €20.5M     CB   €55K   
2   211302   1762      0           N/A      72      70   €1.8M     ST   €13K   
3   235212   2150      5           N/A      94      71    €56M     RM   €90K   
4   202811   1442      2           N/A      83      83    €33M     GK   €64K   
..     ...    ...    ...           ...     ...     ...     ...    ...    ...   
55  246454   1330     17           N/A      63      27   €550K     CB    €4K   
56  246549   1560      7           N/A      54      41   €750K     CB    €4K   
57  246657    975      7           N/A      71      68   €2.4M     GK    €3K   
58  246667   1515      6           N/A      63      34   €2.3M     CB    €8K   
59  246709   1376      8           N/A      56      31   €675K     CB    €2K   

   col-def col-phy col-ae col-pas col-d

In [4]:
#saving df into a csv datbase
final_df.to_csv("./FIFAPlayersData.csv")

In [18]:
#ANALYSIS OF SOFIFA URLS
#1-https://sofifa.com/players?&showCol%5B %5D=pi&showCol%5B %5D=ae&showCol%5B %5D=oa&showCol%5B %5D=pt&showCol%5B %5D=bp&showCol%5B %5D=gu&showCol%5B%5D=le&showCol%5B%5D=vl&showCol%5B%5D=wg&showCol%5B%5D=tt&showCol%5B%5D=pac&showCol%5B%5D=sho&showCol%5B%5D=pas&showCol%5B%5D=dri&showCol%5B%5D=def&showCol%5B%5D=phy
#2-https://sofifa.com/players? showCol%5B0%5D=pi&showCol%5B1%5D=ae&showCol%5B2%5D=oa&showCol%5B3%5D=pt&showCol%5B4%5D=bp&showCol%5B5%5D=gu&showCol%5B6%5D=le&showCol%5B7%5D=vl&showCol%5B8%5D=wg&showCol%5B9%5D=tt&showCol%5B10%5D=pac&showCol%5B11%5D=sho&showCol%5B12%5D=pas&showCol%5B13%5D=dri&showCol%5B14%5D=def&showCol%5B15%5D=phy&offset=60
#3-https://sofifa.com/players? showCol%5B0%5D=pi&showCol%5B1%5D=ae&showCol%5B2%5D=oa&showCol%5B3%5D=pt&showCol%5B4%5D=bp&showCol%5B5%5D=gu&showCol%5B6%5D=le&showCol%5B7%5D=vl&showCol%5B8%5D=wg&showCol%5B9%5D=tt&showCol%5B10%5D=pac&showCol%5B11%5D=sho&showCol%5B12%5D=pas&showCol%5B13%5D=dri&showCol%5B14%5D=def&showCol%5B15%5D=phy&offset=120

In [4]:
#TESTING
# url = 'https://sofifa.com/players? showCol%5B0%5D=pi&showCol%5B1%5D=ae&showCol%5B2%5D=oa&showCol%5B3%5D=pt&showCol%5B4%5D=bp&showCol%5B5%5D=gu&showCol%5B6%5D=le&showCol%5B7%5D=vl&showCol%5B8%5D=wg&showCol%5B9%5D=tt&showCol%5B10%5D=pac&showCol%5B11%5D=sho&showCol%5B12%5D=pas&showCol%5B13%5D=dri&showCol%5B14%5D=def&showCol%5B15%5D=phy&offset=6000'
# soup = soup_making(url)

# my_classes = find_all_classes(soup,'td')
# classes_to_remove = ['col','col-avatar','col-comment','col-name']
# my_classes = remove_from_my_classes(my_classes,classes_to_remove)

# my_dict = create_dict(my_classes)

# #~~~~~~getting team, name and contract
# #CONTRACT
# contract = soup.find_all('div',{'class':'sub'})
# contract = get_text(contract)
# contract = clean_string(contract,'\n*')
# #NAME AND TEAM
# name_team_contract = soup.find_all('div',{'class':'bp3-text-overflow-ellipsis'})
# name_team_contract = get_text(name_team_contract)
# #NAME
# name = name_team_contract[::2]
# name = clean_string(name,'\\d|\~')
# #TEAM
# team = name_team_contract[1::2]
# team = clean_string(team,'\n*|\\d|\~')
# #adding them to the dict
# my_dict['Contract'] = contract
# my_dict['Name'] = name
# my_dict['Team'] = team
# #~~~~Creating data frame with all the data
# df = pd.DataFrame(my_dict)
#df.head()

In [17]:
# #TESTING OF DATA SOTRING METHOD USING DF's
# dff_list = []
# d1 = {'key1': 'x1', 'key2': 'y1'}  
# d2 = {'key1': 'x2', 'key2': 'y2'}  
# d3 = {'key1': 'x3', 'key2': 'y3'}  
# d4 = {'key1': 'x4', 'key2': 'y4'}  


# dff = pd.DataFrame(d1,index=[0])
# #print(dff)
# dff_list.append(dff)
# #print(dff_list)

# dff = pd.DataFrame(d2,index=[0])
# #print(dff)
# dff_list.append(dff)
# dff = pd.DataFrame(d3,index=[0])
# dff_list.append(dff)
# dff = pd.DataFrame(d4,index=[0])
# dff_list.append(dff)
# de = pd.concat(dff_list)

# print(de)
# print(type(de))