In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.chrome.service import Service as ChromeService
import pandas as pd
from dotenv import load_dotenv
import pandas as pd
import os
import google.generativeai as genai
import numpy as np
from scipy.spatial.distance import cosine

In [2]:
# Start a headless Chrome session driven by a locally installed chromedriver.
# The resulting `driver` object is what the scraping cell uses to load pages.
webdriver_path = r'C:\Windows\chromedriver.exe'
chrome_options = ChromeOptions()
for flag in ("--headless", "--disable-gpu"):
    # Flags are applied in the same order as they are listed.
    chrome_options.add_argument(flag)
chrome_service = ChromeService(executable_path=webdriver_path)
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)


In [3]:
def clean_ratings(element):
    """Parse the first two numeric ratings out of a newline-separated cell text.

    Args:
        element: Raw cell text; each non-blank line must be parseable by ``float``.

    Returns:
        A ``(first, second)`` tuple of floats taken from the first two
        non-blank lines (the caller stores them as Skill and Pot).

    Raises:
        ValueError: If any non-blank line is not a valid float.
        IndexError: If fewer than two non-blank lines are present.
    """
    ratings = []
    for piece in element.strip().split('\n'):
        # Blank / whitespace-only lines are layout noise, not values.
        if piece.strip():
            ratings.append(float(piece))
    first, second = ratings[0], ratings[1]
    return first, second

In [4]:
def clean_transfer_value(element):
    """Convert a scraped transfer-value string to a float expressed in millions.

    The euro sign is removed and the magnitude suffix is interpreted:
    ``M`` (millions) is returned as-is, ``K`` (thousands) is divided by 1000
    so that every returned value shares the same unit. A bare number is
    returned unchanged, as before.

    Args:
        element: Text such as ``'€180.5M'`` or ``'€500K'``.

    Returns:
        The value as a float, in millions.

    Raises:
        ValueError: If the remaining text is not a valid number.
    """
    text = element.replace('€', '').strip()
    suffix = text[-1:].upper()
    if suffix == 'K':
        # Thousands -> millions; previously this raised ValueError and the
        # caller silently dropped the whole row.
        return float(text[:-1]) / 1000
    if suffix == 'M':
        return float(text[:-1])
    return float(text)

In [5]:
def clean_player_info(element):
    """Return the player name found on the first non-blank line of a cell.

    Args:
        element: Raw multi-line cell text.

    Returns:
        The first non-blank line with non-breaking spaces replaced by regular
        spaces and surrounding whitespace removed.

    Raises:
        IndexError: If the text contains no non-blank line.
    """
    non_blank = [stripped for stripped in map(str.strip, element.split('\n')) if stripped]
    # Indexing (rather than a default) keeps the IndexError the caller relies on.
    return non_blank[0].replace('\xa0', ' ').strip()

In [18]:
# Scrape pages 1..25 of the "most valuable players" listing and collect one
# dict per player row into `data`. Requires the `driver` session and the
# clean_* helpers defined in earlier cells.
url = "https://www.footballtransfers.com/us/values/players/most-valuable-soccer-players/"
data = []
titles = ['Skill', 'Pot', 'Rank', 'Player', 'Age', 'Team', 'Transfer_Value']
skipped_rows = 0  # rows that looked like data but failed to parse
for i in range(1, 26):
    page_url = url + str(i)
    driver.get(page_url)
    print("Scraping data from page " + str(i) + "...")
    html_content = driver.page_source
    soup = BeautifulSoup(html_content, 'html.parser')
    table = soup.find('table')
    if table is None:
        # The page source may not contain the table (e.g. not rendered yet or
        # layout changed); skip the page instead of crashing on None.
        print("No table found on page " + str(i) + ", skipping.")
        continue
    # The first two <tr> elements are skipped -- presumably header rows;
    # verify against the live page if the layout changes.
    rows = table.find_all('tr')[2:]
    for row in rows:
        cells = row.find_all('td')
        row_data = [cell.text for cell in cells]
        # Ensure the row has the correct number of cells before processing
        if len(row_data) >= 6 and row_data[1] != '\n':
            try:
                skill, pot = clean_ratings(row_data[0])
                rank = int(row_data[1])
                name = clean_player_info(row_data[2])
                age = int(row_data[3])
                team = row_data[4].strip()
                transfer_value = clean_transfer_value(row_data[5])
                data.append({
                    'Skill': skill,
                    'Pot': pot,
                    'Rank': rank,
                    'Player': name,
                    'Age': age,
                    'Team': team,
                    'Transfer_Value': transfer_value
                })
            except (ValueError, IndexError):
                # Best-effort scrape: keep going, but count the loss so it is
                # visible instead of silently disappearing.
                skipped_rows += 1
                continue
if skipped_rows:
    print("Skipped " + str(skipped_rows) + " rows that could not be parsed.")
                


Scraping data from page 1...
Scraping data from page 2...
Scraping data from page 3...
Scraping data from page 4...
Scraping data from page 5...
Scraping data from page 6...
Scraping data from page 7...
Scraping data from page 8...
Scraping data from page 9...
Scraping data from page 10...
Scraping data from page 11...
Scraping data from page 12...
Scraping data from page 13...
Scraping data from page 14...
Scraping data from page 15...
Scraping data from page 16...
Scraping data from page 17...
Scraping data from page 18...
Scraping data from page 19...
Scraping data from page 20...
Scraping data from page 21...
Scraping data from page 22...
Scraping data from page 23...
Scraping data from page 24...
Scraping data from page 25...


In [19]:
# Persist the scraped rows and shut the browser down.
dataframe = pd.DataFrame(data, columns= titles)
try:
    dataframe.to_csv('transfer_value.csv', index= False)
    print("Result has been saved to transfer_value.csv")
finally:
    # Always release the Chrome process, even if writing the CSV fails.
    driver.quit()

Result has been saved to transfer_value.csv


In [21]:
def get_gemini_embedding(texts, model = 'text-embedding-004', task_type = 'retrieval_document'):
    """Request an embedding for ``texts`` from ``genai.embed_content``.

    Args:
        texts: Content forwarded unchanged as the ``content`` argument.
        model: Embedding model name passed to the API.
        task_type: Task type passed to the API.

    Returns:
        The value stored under the ``'embedding'`` key of the API response.
    """
    request = {'model': model, 'content': texts, 'task_type': task_type}
    response = genai.embed_content(**request)
    return response['embedding']

In [22]:
# Configure the Gemini client from the environment (.env is loaded first);
# a missing GEMINI_API_KEY fails fast with KeyError.
load_dotenv()
gemini_api_key = os.environ["GEMINI_API_KEY"]
genai.configure(api_key= gemini_api_key)

# df1: player statistics; df2: scraped transfer values.
df1 = pd.read_csv('result.csv')
df2= pd.read_csv('transfer_value.csv')
df2 = df2.drop(columns=['Age', 'Rank', 'Team'])
# Keep only players with more than 900 minutes. The explicit copy makes df1 an
# independent frame, so adding columns to it later is unambiguous and cannot
# trip pandas' chained-assignment (SettingWithCopy) checks.
df1 = df1[df1['Minutes'] > 900].copy()
print("Getting embedding...")
# One embedding request per player name, in row order.
embeddings1 = [get_gemini_embedding(name) for name in df1['Player'].tolist()]
embeddings2 = [get_gemini_embedding(name) for name in df2['Player'].tolist()]
print("Done")

Getting embedding...
Done


In [23]:
def find_best_match(embeddings1, embeddings2, names2, threshold=0.8):
    """Match each query embedding to the most similar candidate by cosine similarity.

    Args:
        embeddings1: Sequence of query vectors (one per row to be matched).
        embeddings2: Sequence of candidate vectors, aligned with ``names2``.
        names2: Candidate names; ``names2[i]`` belongs to ``embeddings2[i]``.
        threshold: Minimum cosine similarity for a match to be accepted.

    Returns:
        A list with one entry per query: the name of the best candidate, or
        ``None`` when the best similarity is below ``threshold`` or there are
        no candidates at all. When several candidates tie, the first wins.
    """
    if len(embeddings1) == 0:
        return []
    if len(embeddings2) == 0:
        # argmax over an empty candidate set is undefined; nothing can match.
        return [None] * len(embeddings1)

    def _unit_rows(vectors):
        # Scale each row to unit length; zero vectors are left as zeros so
        # their similarity is 0 instead of NaN.
        matrix = np.asarray(vectors, dtype=float)
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        return matrix / norms

    # Dot products of unit vectors are cosine similarities: shape (queries, candidates).
    similarities = _unit_rows(embeddings1) @ _unit_rows(embeddings2).T
    best_indices = similarities.argmax(axis=1)
    best_scores = similarities[np.arange(len(best_indices)), best_indices]
    return [
        names2[idx] if score >= threshold else None
        for idx, score in zip(best_indices, best_scores)
    ]

# Attach to every df1 row the df2 player name chosen by find_best_match
# (None where no candidate reached the default threshold).
candidate_names = df2['Player'].tolist()
df1['matched_name'] = find_best_match(embeddings1, embeddings2, candidate_names)

In [24]:
# Join the transfer values onto the player statistics via the matched name.
df2 = df2.rename(columns= {'Player' : 'Player_Name'})
# A left merge emits one output row per matching right-hand row, so repeated
# names in df2 would duplicate players from df1. Keep only the first
# occurrence of each name (rows are in file order) so the row count of df1 is
# preserved.
transfer_lookup = df2.drop_duplicates(subset='Player_Name', keep='first')
merged_df = (
    pd.merge(df1, transfer_lookup, left_on='matched_name', right_on='Player_Name', how='left')
    # Drop the join helper columns plus four columns from the stats file that
    # are not wanted in the final output.
    .drop(columns=['Player_Name', 'matched_name', 'GA90', 'Save%', "Penalty_Save%", 'CS%'])
)
merged_df.to_csv('football_data.csv', index= False)
print("Result has been saved in football_data.csv")

Result has been saved in football_data.csv
