In [12]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from time import sleep

In [14]:
url = 'https://raw.githubusercontent.com/Ndio-S/Grassroots/refs/heads/main/FC_with_coordinates.csv'

df = pd.read_csv(url)

print(df.head(5))

            Club Name                                     Wikipedia Link  \
0      1874 Northwich  https://en.wikipedia.org/wiki/1874_Northwich_F.C.   
1  A.F.C. Aldermaston   https://en.wikipedia.org/wiki/A.F.C._Aldermaston   
2    A.F.C. Blackpool     https://en.wikipedia.org/wiki/A.F.C._Blackpool   
3   A.F.C Bournemouth      https://en.wikipedia.org/wiki/AFC_Bournemouth   
4   A.F.C. Bridgnorth    https://en.wikipedia.org/wiki/A.F.C._Bridgnorth   

                           Stadium Name   Latitude  Longitude  
0                 Townfield, Barnton[1]  53.274171  -2.547447  
1              Waterside Park, Thatcham  51.383850  -1.153294  
2  The Mechanics, Jepson Way, Blackpool  53.778541  -3.019719  
3                            Dean Court  50.734832  -1.839078  
4                          Crown Meadow  52.536305  -2.420337  


In [18]:
#  geolocator
geolocator = Nominatim(user_agent="grassroots_locator")

if "Postcode" not in df.columns:
    df["Postcode"] = None

# Function to fetch only the postcode
def get_postcode(lat, lon):
    try:
        location = geolocator.reverse((lat, lon), exactly_one=True)
        return location.raw.get("address", {}).get("postcode", "N/A")
    except:
        return "N/A"

# Function to fetch the club website from the "External Links" section
def get_club_website(wiki_url):
    if wiki_url == "N/A":
        return "N/A"

    try:
        response = requests.get(wiki_url)
        soup = BeautifulSoup(response.text, "html.parser")

        # Find the "External links" section
        external_links_section = None
        for h2 in soup.find_all("h2"):
            if "External links" in h2.text:
                external_links_section = h2.find_next_sibling("ul")
                break

        # Extract the first valid external link (assuming it's the official website)
        if external_links_section:
            website_link = external_links_section.find("a", href=True)
            return website_link["href"] if website_link else "N/A"

    except:
        return "N/A"

    return "N/A"

# Apply functions to each row
for i, row in df.iterrows():
    # Fetch postcode if missing
    if pd.isna(row["Postcode"]) and pd.notna(row["Latitude"]) and pd.notna(row["Longitude"]):
        df.at[i, "Postcode"] = get_postcode(row["Latitude"], row["Longitude"])
        sleep(1)  # Avoid hitting API rate limit

    # Fetch club website from Wikipedia External Links section
    if "Website" not in df.columns or pd.isna(row.get("Website")):
        df.at[i, "Website"] = get_club_website(row["Wikipedia Link"])
        sleep(0.5)  # Prevent Wikipedia blocking

# Save enhanced dataset
df.to_csv("FC_with_websites_postcodes.csv", index=False)
print("✅ Postcodes & websites added successfully!")

✅ Postcodes & websites added successfully!


In [22]:
df_1 = pd.read_csv('FC_with_websites_postcodes.csv')
df_1.head()

Unnamed: 0,Club Name,Wikipedia Link,Stadium Name,Latitude,Longitude,Postcode,Website
0,1874 Northwich,https://en.wikipedia.org/wiki/1874_Northwich_F.C.,"Townfield, Barnton[1]",53.274171,-2.547447,CW8 4NJ,
1,A.F.C. Aldermaston,https://en.wikipedia.org/wiki/A.F.C._Aldermaston,"Waterside Park, Thatcham",51.38385,-1.153294,RG7 4LX,
2,A.F.C. Blackpool,https://en.wikipedia.org/wiki/A.F.C._Blackpool,"The Mechanics, Jepson Way, Blackpool",53.778541,-3.019719,FY4 5FD,
3,A.F.C Bournemouth,https://en.wikipedia.org/wiki/AFC_Bournemouth,Dean Court,50.734832,-1.839078,BH7 7AF,
4,A.F.C. Bridgnorth,https://en.wikipedia.org/wiki/A.F.C._Bridgnorth,Crown Meadow,52.536305,-2.420337,WV16 4DB,


In [30]:
df_1.info('Website')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1084 entries, 0 to 1083
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Club Name       1084 non-null   object 
 1   Wikipedia Link  1084 non-null   object 
 2   Stadium Name    1084 non-null   object 
 3   Latitude        1084 non-null   float64
 4   Longitude       1084 non-null   float64
 5   Postcode        1084 non-null   object 
 6   Website         0 non-null      float64
dtypes: float64(3), object(4)
memory usage: 59.4+ KB


In [36]:
#Websites did not work so will remove
df_1.drop(columns='Website')

Unnamed: 0,Club Name,Wikipedia Link,Stadium Name,Latitude,Longitude,Postcode
0,1874 Northwich,https://en.wikipedia.org/wiki/1874_Northwich_F.C.,"Townfield, Barnton[1]",53.274171,-2.547447,CW8 4NJ
1,A.F.C. Aldermaston,https://en.wikipedia.org/wiki/A.F.C._Aldermaston,"Waterside Park, Thatcham",51.383850,-1.153294,RG7 4LX
2,A.F.C. Blackpool,https://en.wikipedia.org/wiki/A.F.C._Blackpool,"The Mechanics, Jepson Way, Blackpool",53.778541,-3.019719,FY4 5FD
3,A.F.C Bournemouth,https://en.wikipedia.org/wiki/AFC_Bournemouth,Dean Court,50.734832,-1.839078,BH7 7AF
4,A.F.C. Bridgnorth,https://en.wikipedia.org/wiki/A.F.C._Bridgnorth,Crown Meadow,52.536305,-2.420337,WV16 4DB
...,...,...,...,...,...,...
1079,Yateley United,https://en.wikipedia.org/wiki/Yateley_United_F.C.,Sean Devereux Park,51.349822,-0.828742,GU46 7ST
1080,Yaxley,https://en.wikipedia.org/wiki/Yaxley_F.C.,"Leading Drove, Yaxley",52.509370,-0.257874,PE7 3NA
1081,Yeovil Town,https://en.wikipedia.org/wiki/Yeovil_Town_F.C.,Huish Park,50.950243,-2.673963,BA22 8YF
1082,York City,https://en.wikipedia.org/wiki/York_City_F.C.,York Community Stadium,53.984703,-1.051384,YO32 9JS


In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from time import sleep
from datetime import datetime

# Load your existing club dataset
df = pd.read_csv("FC_updated.csv")

# Ensure "League" column exists
if "League" not in df.columns:
    df["League"] = None

# Wikipedia URL for club leagues
WIKI_URL = "https://en.wikipedia.org/wiki/List_of_football_clubs_in_England"

def fetch_league_data():
    """Scrapes league names for clubs from Wikipedia."""
    response = requests.get(WIKI_URL)
    soup = BeautifulSoup(response.text, "html.parser")

    tables = soup.find_all("table", {"class": "wikitable"})
    club_league_map = {}

    for table in tables:
        rows = table.find_all("tr")[1:]  # Skip headers

        for row in rows:
            cols = row.find_all("td")
            if len(cols) >= 2:  # Ensure there are at least two columns (Club, League)
                club_name = cols[0].get_text(strip=True)  # First column: Club name
                league_name = cols[1].get_text(strip=True)  # Second column: League name

                club_league_map[club_name.lower()] = league_name  # Store in dictionary

    return club_league_map

# Fetch latest club leagues
club_league_map = fetch_league_data()

# Assign leagues to clubs in dataset
for i, row in df.iterrows():
    club_name = row["Club Name"].strip().lower()
    df.at[i, "League"] = club_league_map.get(club_name, "Unknown")

In [74]:
df_1.head()

Unnamed: 0,Club Name,Wikipedia Link,Stadium Name,Latitude,Longitude,Postcode,Website,League
0,1874 Northwich,https://en.wikipedia.org/wiki/1874_Northwich_F.C.,"Townfield, Barnton[1]",53.274171,-2.547447,CW8 4NJ,,Midland League Premier Division
1,A.F.C. Aldermaston,https://en.wikipedia.org/wiki/A.F.C._Aldermaston,"Waterside Park, Thatcham",51.38385,-1.153294,RG7 4LX,,We League Division Two
2,A.F.C. Blackpool,https://en.wikipedia.org/wiki/A.F.C._Blackpool,"The Mechanics, Jepson Way, Blackpool",53.778541,-3.019719,FY4 5FD,,North West Counties League Division One North
3,A.F.C Bournemouth,https://en.wikipedia.org/wiki/AFC_Bournemouth,Dean Court,50.734832,-1.839078,BH7 7AF,,Premier League
4,A.F.C. Bridgnorth,https://en.wikipedia.org/wiki/A.F.C._Bridgnorth,Crown Meadow,52.536305,-2.420337,WV16 4DB,,Midland League Division One


In [84]:
df_1['League'].nunique()

60

In [89]:
df_1.drop(columns='Website')

Unnamed: 0,Club Name,Wikipedia Link,Stadium Name,Latitude,Longitude,Postcode,League
0,1874 Northwich,https://en.wikipedia.org/wiki/1874_Northwich_F.C.,"Townfield, Barnton[1]",53.274171,-2.547447,CW8 4NJ,Midland League Premier Division
1,A.F.C. Aldermaston,https://en.wikipedia.org/wiki/A.F.C._Aldermaston,"Waterside Park, Thatcham",51.383850,-1.153294,RG7 4LX,We League Division Two
2,A.F.C. Blackpool,https://en.wikipedia.org/wiki/A.F.C._Blackpool,"The Mechanics, Jepson Way, Blackpool",53.778541,-3.019719,FY4 5FD,North West Counties League Division One North
3,A.F.C Bournemouth,https://en.wikipedia.org/wiki/AFC_Bournemouth,Dean Court,50.734832,-1.839078,BH7 7AF,Premier League
4,A.F.C. Bridgnorth,https://en.wikipedia.org/wiki/A.F.C._Bridgnorth,Crown Meadow,52.536305,-2.420337,WV16 4DB,Midland League Division One
...,...,...,...,...,...,...,...
1079,Yateley United,https://en.wikipedia.org/wiki/Yateley_United_F.C.,Sean Devereux Park,51.349822,-0.828742,GU46 7ST,Combined Counties League Division One
1080,Yaxley,https://en.wikipedia.org/wiki/Yaxley_F.C.,"Leading Drove, Yaxley",52.509370,-0.257874,PE7 3NA,United Counties League Premier Division South
1081,Yeovil Town,https://en.wikipedia.org/wiki/Yeovil_Town_F.C.,Huish Park,50.950243,-2.673963,BA22 8YF,National League
1082,York City,https://en.wikipedia.org/wiki/York_City_F.C.,York Community Stadium,53.984703,-1.051384,YO32 9JS,National League


In [91]:
df.to_csv(f"FC_with_leagues_.csv", index=False)