In [None]:
pip install requests beautifulsoup4 pandas

In [None]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Wikipedia URL containing the list of football clubs in England
url = "https://en.wikipedia.org/wiki/List_of_football_clubs_in_England"

# Browser-like User-Agent header sent with the request to avoid being blocked
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Fetch the webpage content. The timeout stops a stalled connection from
# hanging the cell indefinitely.
response = requests.get(url, headers=headers, timeout=30)
if response.status_code != 200:
    # Raise rather than call exit(): inside a Jupyter kernel exit() asks the
    # kernel to shut down instead of simply stopping this cell.
    raise RuntimeError(f"Error: Unable to fetch page. Status code {response.status_code}")

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, "html.parser")

# The club lists on the page are laid out as "wikitable" tables
tables = soup.find_all("table", class_="wikitable")

club_data = []

# Walk every data row of every table; the first <tr> of each table is its header
for table in tables:
    for row in table.find_all("tr")[1:]:
        cells = row.find_all("td")
        if len(cells) <= 1:
            continue  # not a data row (e.g. a sub-header or a spanning cell)

        name_cell = cells[0]
        club_name = name_cell.text.strip()

        # Only accept anchors that actually carry an href; an <a> without one
        # would otherwise raise KeyError.
        anchor = name_cell.find("a", href=True)
        if anchor is not None:
            club_link = "https://en.wikipedia.org" + anchor["href"]
        else:
            club_link = "No link available"

        club_data.append({
            "Club Name": club_name,
            "Wikipedia Link": club_link
        })

# Convert list to DataFrame. Naming the columns explicitly keeps the CSV header
# intact even when no rows were scraped, so later cells can still read the file.
df = pd.DataFrame(club_data, columns=["Club Name", "Wikipedia Link"])

# Save to CSV file
output_file = "uk_football_clubs.csv"
df.to_csv(output_file, index=False)

print(f"Scraped {len(club_data)} football clubs and saved to {output_file}")

In [None]:
# Preview the first rows of the scraped club table
df.head()

In [None]:
# Print a summary of the scraped table's columns and non-null counts
df.info()

In [None]:
!pip install geopy

In [None]:
import requests
import pandas as pd
import time
from bs4 import BeautifulSoup

import os  # local import so this cell does not depend on an earlier cell's imports

# Google Geocoding API key, read from the environment so the secret is never
# stored in the notebook itself. Export GOOGLE_API_KEY before starting the kernel.
# NOTE(review): a key was previously hardcoded in this cell; treat that key as
# compromised and rotate it.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise RuntimeError(
        "GOOGLE_API_KEY environment variable is not set. "
        "Set it to your Google Geocoding API key before running this cell."
    )

df = pd.read_csv('uk_football_clubs.csv')

# Check that the columns written by the scraping stage are present. Raise
# instead of calling exit(), which would ask the Jupyter kernel to shut down.
if "Club Name" not in df.columns or "Wikipedia Link" not in df.columns:
    raise ValueError("⚠️ CSV is missing required columns. Ensure 'Club Name' and 'Wikipedia Link' exist.")

# Add the output columns if they are not present yet
for column in ("Stadium Name", "Latitude", "Longitude"):
    if column not in df.columns:
        df[column] = None
# Function to scrape stadium name from Wikipedia
def get_stadium_name(wikipedia_url):
    """Return the stadium/ground name from a club's Wikipedia infobox.

    Args:
        wikipedia_url: URL of the club's Wikipedia article.

    Returns:
        The stripped text of the first infobox row whose label contains
        "Ground" or "Stadium" and whose value cell is non-empty, or None when
        the page cannot be fetched, has no infobox table, or has no such row.
    """
    try:
        # The timeout keeps one stalled request from blocking the whole batch.
        response = requests.get(
            wikipedia_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30
        )
    except requests.RequestException:
        # Best effort: network failures and malformed URLs count as "not found".
        return None

    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    infobox = soup.find("table", {"class": "infobox"})
    if infobox is None:
        return None

    for row in infobox.find_all("tr"):
        value_cell = row.find("td")
        if value_cell is None:
            # Header-only row: nothing to extract, keep looking instead of
            # failing on a missing <td>.
            continue

        # Match on the label cell when there is one, so a value that merely
        # mentions "Stadium" or "Ground" is not mistaken for the stadium row.
        label_cell = row.find("th")
        label = label_cell.text if label_cell is not None else row.text
        if "Ground" in label or "Stadium" in label:
            # Drop citation markers such as "[1]" from the value.
            for reference in value_cell.find_all("sup", class_="reference"):
                reference.decompose()
            stadium = value_cell.text.strip()
            if stadium:
                return stadium
    return None

# Function to get coordinates from Google Geocoding API
def get_coordinates(stadium, club_name):
    """Look up a stadium's coordinates with the Google Geocoding API.

    Args:
        stadium: Stadium name used as the first part of the address query.
        club_name: Club name, added to the query and written to the failure log.

    Returns:
        A (latitude, longitude) tuple taken from the first result when the API
        reports status "OK"; otherwise (None, None), after appending a line to
        failed_clubs.log.
    """
    base_url = "https://maps.googleapis.com/maps/api/geocode/json"
    address = f"{stadium}, {club_name}, UK"

    params = {
        "address": address,
        "key": GOOGLE_API_KEY
    }

    try:
        # The timeout keeps one stalled request from blocking the whole batch.
        response = requests.get(base_url, params=params, timeout=30)
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Record only the exception type: the exception message can contain the
        # request URL, which includes the API key.
        data = {}
        status = f"REQUEST FAILED ({type(exc).__name__})"
    else:
        status = data.get("status")

    results = data.get("results")
    if status == "OK" and results:
        location = results[0]["geometry"]["location"]
        return location["lat"], location["lng"]

    # Explicit UTF-8 so names with non-ASCII characters can always be logged.
    with open("failed_clubs.log", "a", encoding="utf-8") as log_file:
        log_file.write(f"{club_name} - {stadium} returned NO RESULT (status: {status})\n")
    return None, None

print(f"Processing {len(df)} clubs...")

# Defined before the loop so the `finally` clause below can write whatever has
# been collected if the run is interrupted or an error is raised.
output_file = "uk_football_clubs_with_coordinates.csv"

try:
    # Process all clubs
    for i, row in df.iterrows():
        club_name = row["Club Name"]
        wiki_link = row["Wikipedia Link"]

        # Skip rows that already have a stadium and both coordinates
        if pd.notna(row["Stadium Name"]) and pd.notna(row["Latitude"]) and pd.notna(row["Longitude"]):
            continue

        # Extract stadium name from Wikipedia
        stadium = get_stadium_name(wiki_link)
        if not stadium:
            # Explicit UTF-8 so names with non-ASCII characters can always be logged.
            with open("failed_clubs.log", "a", encoding="utf-8") as log_file:
                log_file.write(f"{club_name} - NO STADIUM FOUND\n")
            continue

        df.at[i, "Stadium Name"] = stadium

        # Get coordinates
        lat, lon = get_coordinates(stadium, club_name)
        df.at[i, "Latitude"] = lat
        df.at[i, "Longitude"] = lon

        # Print progress every 20 clubs
        if i % 20 == 0:
            print(f"✅ Processed {i}/{len(df)} clubs...")

        time.sleep(1.5)  # Sleep to prevent API rate limits
finally:
    # Always persist the results gathered so far; this run makes one or two
    # network calls per club and should not lose everything on a late failure.
    df.to_csv(output_file, index=False)

print(f"\n🎯 Process complete for ALL clubs! Saved as {output_file}")
print("⚠️ Check 'failed_clubs.log' for any missing stadiums.")