In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
def fetch_page(url):
    """ Fetch the HTML content of a page """
    response = requests.get(url)
    if response.status_code == 200:
        return BeautifulSoup(response.text, 'html.parser')
    else:
        return None

def parse_table(soup):
    """ Parse the HTML soup to extract table data """
    data = []
    table = soup.find('table')
    headers = [th.get_text(strip=True) for th in table.find('thead').find_all('th')]
    
    for row in table.find('tbody').find_all('tr'):
        cols = row.find_all('td')
        print(cols)
        row_data = [col.get_text(strip=True) for col in cols]
        data.append(row_data)
    
    return pd.DataFrame(data, columns=headers)

def get_data_from_url(base_url, start_page, end_page):
    all_data = pd.DataFrame()
    for page_number in range(start_page, end_page):
        url = f"{base_url}/{page_number}"
        soup = fetch_page(url)
        if soup:
            new_data = parse_table(soup)
            print(new_data.head())
            all_data = pd.concat([all_data, new_data], ignore_index=True)
        else:
            print(f"Failed to fetch data from {url}")
    return all_data

# URL setup
base_url = 'https://www.footballtransfers.com/nl/spelers/europa'
start_page = 1
end_page = 50  # you can adjust this based on how many pages you need to scrape

# Get the data
data = get_data_from_url(base_url, start_page, end_page)

[<td class="td-skill">
<div class="table-skill">
<div class="table-skill__indicator" style="background-color: #EBECEF;"></div>
<div class="table-skill__skill-pot">
<span class="placeholder-skill__skill"></span>
<span class="placeholder-skill__pot"></span>
</div>
</div> </td>, <td>
<div aria-hidden="true" class="d-flex align-items-center">
<div class="player-image-holder">
<i class="plyr-prfl-img"><span class="player-image"></span></i>
<figure class="small-icon-image"><span class="player-country-image"></span></figure>
</div>
<div class="text">
<span class="player-name"></span>
<span class="player-sub-text"></span>
</div>
</div> </td>, <td class="m-hide age">
<span class="player-age"></span> </td>, <td class="m-hide" scope="row">
<div class="d-flex justify-center align-items-center">
<span class="team-image"></span>
<span class="team-name"></span>
</div> </td>, <td class="text-center">
<div class="d-flex justify-content-center">
<span class="player-value-placeholder"></span> </div>
</td

In [3]:
get_data_from_url

<function __main__.get_data_from_url(base_url, start_page, end_page)>

In [25]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

def setup_driver():
    """ Set up the Selenium WebDriver """
    options = webdriver.ChromeOptions()
    options.headless = True  # Run in headless mode
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)
    return driver

def fetch_data_with_selenium(url):
    driver = setup_driver()
    driver.get(url)
    time.sleep(5)  # Wait for JavaScript to load the data

    # Extract data using Selenium
    data = []
    rows = driver.find_elements(By.CSS_SELECTOR, "table tbody tr")
    for row in rows:
        cols = row.find_elements(By.TAG_NAME, 'td')
        row_data = [col.text for col in cols if col.text != '']  # Get text, skip empty
        if row_data:
            data.append(row_data)
    
    driver.quit()
    return pd.DataFrame(data, columns=['Skill/Potential', 'Player', 'Age', 'Team', 'ETV'])

# URL setup
url = 'https://www.footballtransfers.com/nl/spelers/nl-eredivisie'
data = fetch_data_with_selenium(url)

   Skill/Potential                                Player Age       Team  \
0       70.6\n85.0              Johan Bakayoko\nAM (CRL)  20        PSV   
1       78.1\n78.9        Hirving Lozano\nAM (RL), M (R)  28        PSV   
2       69.8\n78.8             Santiago Gimenez\nA (CRL)  22  Feyenoord   
3       71.5\n82.5           Quilindschy Hartman\nV (CL)  22  Feyenoord   
4       72.7\n81.2          Sergiño Dest\nV (RL), AM (R)  23        PSV   
5       69.0\n79.7                  Jordan Teze\nV (CRL)  24        PSV   
6       67.7\n72.6              Noa Lang\nAM (L), A (RL)  24        PSV   
7       67.6\n74.5                  Dávid Hancko\nV (CL)  26  Feyenoord   
8       69.9\n78.7  Lutsharel Geertruida\nV (CR), VM (C)  23  Feyenoord   
9       69.6\n75.2          Joey Veerman\nVM (L), M, AMC  25        PSV   
10      75.3\n77.1             Steven Bergwijn\nAM (CRL)  26       Ajax   
11      55.2\n87.1                   Jorrel Hato\nV (CL)  18       Ajax   
12      66.1\n74.4       

In [26]:
def fetch_data_with_selenium(base_url, total_pages):
    driver = setup_driver()
    all_data = pd.DataFrame()

    for page in range(1, total_pages + 1):
        # Construct the URL for each page
        url = f"{base_url}/{page}"
        driver.get(url)
        time.sleep(5)  # Adjust timing based on your observation
        
        # Extract data using Selenium
        data = []
        rows = driver.find_elements(By.CSS_SELECTOR, "table tbody tr")
        for row in rows:
            cols = row.find_elements(By.TAG_NAME, 'td')
            row_data = [col.text for col in cols if col.text != '']  # Get text, skip empty
            if row_data:
                data.append(row_data)
        
        # Convert page data to DataFrame and append to all_data
        page_data = pd.DataFrame(data, columns=['Skill/Potential', 'Player', 'Age', 'Team', 'ETV'])
        all_data = pd.concat([all_data, page_data], ignore_index=True)

    driver.quit()
    return all_data

# URL and pages setup
base_url = 'https://www.footballtransfers.com/nl/spelers/nl-eredivisie'
total_pages = 50  # Total number of pages you need to scrape

# Fetch all data
data = fetch_data_with_selenium(base_url, total_pages)
print(data)

    Skill/Potential                          Player Age              Team  \
0        70.6\n85.0        Johan Bakayoko\nAM (CRL)  20               PSV   
1        78.1\n78.9  Hirving Lozano\nAM (RL), M (R)  28               PSV   
2        69.8\n78.8       Santiago Gimenez\nA (CRL)  22         Feyenoord   
3        71.5\n82.5     Quilindschy Hartman\nV (CL)  22         Feyenoord   
4        72.7\n81.2    Sergiño Dest\nV (RL), AM (R)  23               PSV   
..              ...                             ...  ..               ...   
441      36.5\n36.5        Ilias Takidine\nAM (CRL)  23          Waalwijk   
442        0.0\n0.0                Mats Rots\nV (C)  18            Twente   
443      43.8\n43.8             Mike Hauptmeijer\nK  27        PEC Zwolle   
444      42.8\n42.8                Pascal Kuiper\nK  20         Excelsior   
445      32.6\n52.4         Sergi Rosanas\nM, V (R)  23  Sparta Rotterdam   

        ETV  
0    €63.2M  
1    €34.6M  
2    €31.1M  
3    €29.8M  
4    

In [30]:
data[['Skill', 'Potential']] = data['Skill/Potential'].str.split('\n', expand=True)

# Split 'Player' into 'Name' and 'Position'
data[['Name', 'Position']] = data['Player'].str.split('\n', expand=True)

# Drop the original 'Skill/Potential' and 'Player' columns
data.drop(columns=['Skill/Potential', 'Player'], inplace=True)

# Show the updated DataFrame
data.head()

Unnamed: 0,Age,Team,ETV,Skill,Potential,Name,Position
0,20,PSV,€63.2M,70.6,85.0,Johan Bakayoko,AM (CRL)
1,28,PSV,€34.6M,78.1,78.9,Hirving Lozano,"AM (RL), M (R)"
2,22,Feyenoord,€31.1M,69.8,78.8,Santiago Gimenez,A (CRL)
3,22,Feyenoord,€29.8M,71.5,82.5,Quilindschy Hartman,V (CL)
4,23,PSV,€28.8M,72.7,81.2,Sergiño Dest,"V (RL), AM (R)"


In [31]:
data.to_csv('data/football_data_preprocessed.csv', index=False)

In [105]:
from googlesearch import search

data['instagram_url'] = None

def find_instagram_url(player_name):
    # Query Google for the Instagram page of the player
    query = f"{player_name} Instagram site:instagram.com"
    try:
        for j in search(query, num_results=1, sleep_interval=5):
            print(j)
            return j  # Return the first result that is an Instagram link
    except Exception as e:
        print(e)
        print(e.__hash__)

def extract_username(instagram_url):
    # Extract the username from the Instagram URL
    if 'instagram.com' in instagram_url:
        username = instagram_url.split('/')[-1]  # Assuming the URL is well-formed
        if '?' in username:
            username = username.split('?')[0]  # Clean up URL parameters if any
        return username
    return None

In [145]:
data = pd.read_csv('data/football_data_with_instagram.csv')
none_rows = data[data.isna().any(axis=1)]

for index, row in none_rows.iterrows():
    instagram_url = find_instagram_url(row['Name'])
    data.at[index, 'instagram_url'] = instagram_url

data.to_csv('data/football_data_with_instagram.csv', index=False)


429 Client Error: Too Many Requests for url: https://www.google.com/sorry/index?continue=https://www.google.com/search%3Fq%3DJoshua%252BKitolano%252B%252BInstagram%252Bsite%253Ainstagram.com%26num%3D3%26hl%3Den%26start%3D0&hl=en&q=EgQfiZyWGLaq9bAGIjC6lpNRJKmivFkbfEH-Ysu27o141I_9-0cLSXbW2cqLTFL0J7YOAbQsljfLtTfBKg8yAXJaAUM
<method-wrapper '__hash__' of HTTPError object at 0x17be1a8c0>
429 Client Error: Too Many Requests for url: https://www.google.com/sorry/index?continue=https://www.google.com/search%3Fq%3DSam%252BLammers%252BInstagram%252Bsite%253Ainstagram.com%26num%3D3%26hl%3Den%26start%3D0&hl=en&q=EgQfiZyWGLeq9bAGIjD1EGeZ6TCpPGKNxTJdesGNCvtbqLjqjzuD8Y1msAORi-8B1rBWtBIZtbYuNVqr0owyAXJaAUM
<method-wrapper '__hash__' of HTTPError object at 0x17be1a8c0>
429 Client Error: Too Many Requests for url: https://www.google.com/sorry/index?continue=https://www.google.com/search%3Fq%3DKristian%252BHlynsson%252BInstagram%252Bsite%253Ainstagram.com%26num%3D3%26hl%3Den%26start%3D0&hl=en&q=EgQfiZyWG

KeyboardInterrupt: 