In [None]:
import re
from datetime import datetime, timedelta

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [None]:
# Weekday name -> index, using the same numbering as datetime.weekday() (Monday is 0).
day_map = {
    name: index
    for index, name in enumerate(
        ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')
    )
}

In [None]:
def get_weekdays_between(start, end, day_of_week='Thursday'):
    """Return every date between ``start`` and ``end`` that falls on ``day_of_week``.

    Args:
        start: First date to consider (inclusive).
        end: Last date to consider (inclusive).
        day_of_week: English weekday name used as a key into ``day_map``;
            defaults to 'Thursday'.

    Returns:
        A list of dates in ascending order, each ``start`` plus a whole number
        of days. The list is empty when no matching day lies in the range.

    Raises:
        KeyError: If ``day_of_week`` is not a key of ``day_map``.
    """
    target_weekday = day_map[day_of_week]

    # Jump straight to the first matching weekday on or after `start`, then
    # advance a week at a time instead of testing every calendar day.
    days_until_first = (target_weekday - start.weekday()) % 7
    current = start + timedelta(days=days_until_first)

    days = []
    while current <= end:
        days.append(current)
        current += timedelta(days=7)
    return days

def get_gameweek_number(urls):
    """Work out the single gameweek number referenced by a set of article URLs.

    A gameweek number is any run of digits that directly follows ``gameweek-``
    in a URL (for example ``.../fpl-gameweek-12-team-news/`` gives 12). The
    match is made anywhere in the URL, so slugs that begin with ``gameweek-``
    or end right after the number are recognised as well.

    Args:
        urls: Iterable of URL strings.

    Returns:
        The gameweek number as an ``int`` when exactly one distinct number is
        found, the string ``"uncertain"`` when several different numbers are
        found, or ``""`` when none is found. Diagnostics are printed in the
        latter two cases.
    """
    found = set()
    for url in urls:
        # Converting to int up front makes '03' and '3' count as the same gameweek.
        found.update(int(number) for number in re.findall(r'gameweek-(\d+)', url))

    if not found:
        print("No gameweek number found in URLs.")
        print(urls)
        return ""

    if len(found) > 1:
        print(f"Multiple gameweek numbers found: {found}")
        return "uncertain"

    return found.pop()


def list_articles_for_date(date: datetime):
    """List article URLs published under the site's archive page for ``date``.

    The archive page for the given day is fetched and every anchor whose href
    starts with that day's base URL is collected. Of those, only links that
    contain 'gameweek' or 'fpl' and do not contain 'comments' are kept.

    Args:
        date: Day whose archive page should be fetched.

    Returns:
        A sorted list of unique URL strings. An empty list is returned (and a
        message printed) if fetching or parsing fails for any reason.
    """
    base_url = f"https://www.fantasyfootballscout.co.uk/{date.year}/{date.strftime('%m')}/{date.strftime('%d')}/"
    try:
        response = requests.get(base_url, timeout=10)
        response.raise_for_status()
        page = BeautifulSoup(response.text, 'html.parser')

        # A set removes links that appear more than once on the page.
        same_day_links = {
            anchor["href"]
            for anchor in page.find_all("a", href=True)
            if anchor["href"].startswith(base_url)
        }

        wanted = [
            link
            for link in same_day_links
            if ('gameweek' in link or 'fpl' in link) and 'comments' not in link
        ]
        wanted.sort()
        return wanted

    except Exception as e:
        print(f"Failed to fetch or parse {base_url}: {e}")
        return []

def scrape_article(url):
    """Download ``url`` and return the paragraph text of its ``<article>`` element.

    Args:
        url: Address of the page to fetch.

    Returns:
        The non-empty ``<p>`` texts found inside the first ``<article>`` tag,
        joined by blank lines. ``None`` is returned when the page has no
        ``<article>`` tag, or when fetching/parsing fails (a message is
        printed in the failure case).
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        article = BeautifulSoup(response.text, 'html.parser').find("article")
        if not article:
            return None

        chunks = []
        for paragraph in article.find_all("p"):
            piece = paragraph.get_text(strip=True)
            if piece:  # skip paragraphs that are empty after stripping
                chunks.append(piece)
        return "\n\n".join(chunks)
    except Exception as e:
        print(f"Failed to scrape {url}: {e}")
        return None

def get_season(date):
    """Return the season label (e.g. ``"24-25"``) that ``date`` belongs to.

    Dates in August or later count towards the season starting that calendar
    year; dates before August count towards the season that started the
    previous year.

    Args:
        date: Object exposing ``year`` and ``month`` attributes (such as a
            ``datetime``).

    Returns:
        A string ``"YY-YY"`` made of the two-digit start and end years. Both
        halves are zero-padded and wrap at the century, so 2009 gives
        ``"09-10"`` and the season spanning 1999/2000 gives ``"99-00"``.
    """
    start_year = date.year if date.month >= 8 else date.year - 1
    # Work from the full year and reduce modulo 100 so that century boundaries
    # and single-digit years still produce well-formed two-digit labels.
    return f"{start_year % 100:02d}-{(start_year + 1) % 100:02d}"

In [None]:
# Scraped article records, and the dates for which no article URL was found.
articles = []
missing_date_urls = []

# One pass per season, identified by the calendar year in which it starts.
for season_year in [2024]:
    print(season_year)
    season_start = datetime(season_year, 8, 1)
    season_end = datetime(season_year + 1, 5, 31)

    thursdays = get_weekdays_between(season_start, season_end)
    saturdays = get_weekdays_between(season_start, season_end, 'Saturday')
    update_dates = thursdays + saturdays

    # NOTE(review): only the first five entries are processed, and because the
    # Thursdays come first in the list these are all Thursdays. This looks like
    # a debugging limit — confirm before relying on the results.
    for date in tqdm(update_dates[0:5]):
        urls = list_articles_for_date(date)

        if not urls:
            missing_date_urls.append(date.strftime('%Y-%m-%d'))
            continue

        day_of_week = date.strftime('%A')
        season = get_season(date)
        # One gameweek value is derived from all of the day's URLs together.
        gameweek = get_gameweek_number(urls)

        for url in urls:
            content = scrape_article(url)
            if not content:
                continue
            articles.append({
                "date": date.strftime("%Y-%m-%d"),
                "gameweek": gameweek,
                "season": season,
                "day_of_week": day_of_week,
                "url": url,
                "content": content,
            })

print("Missing URLs for dates: ")
for missing_date in missing_date_urls:
    print(f"- {missing_date}")

In [None]:
# Build one row per scraped article from the list of record dicts.
df = pd.DataFrame(articles)

In [None]:
# Number of dates for which the URL listing came back empty.
len(missing_date_urls)

In [None]:
# (rows, columns) of the scraped-article frame.
df.shape

In [None]:
# Parse the 'date' strings into timestamps, then order the rows chronologically.
df = df.assign(date=pd.to_datetime(df['date'])).sort_values(by='date')

df.head()

In [None]:
# How many scraped articles fall on each weekday.
df['day_of_week'].value_counts()

In [None]:
import pandas as pd
import json
import subprocess
from tqdm import tqdm
import re
import ast

In [None]:
def query_ollama(prompt, model="llama3"):
    """Send ``prompt`` to a local Ollama model and return its raw text output.

    Runs ``ollama run <model>`` as a subprocess with the prompt supplied on
    standard input.

    Args:
        prompt: Text passed to the process on standard input.
        model: Model name given to ``ollama run``.

    Returns:
        The captured standard output of the process, or ``""`` when the
        command exits with a non-zero status (its captured stderr is printed
        in that case).
    """
    command = ["ollama", "run", model]
    try:
        completed = subprocess.run(
            command,
            input=prompt,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=True,  # a non-zero exit status raises CalledProcessError
        )
    except subprocess.CalledProcessError as e:
        print(f"Ollama error: {e.stderr}")
        return ""
    return completed.stdout

def extract_injuries_with_summary_ollama(content):
    """Ask the model for a summary and player availability lists for an article.

    The article text is embedded in a fixed prompt and sent through
    ``query_ollama``; the reply is then searched for a JSON object.

    Args:
        content: Article text to embed in the prompt.

    Returns:
        A 3-tuple ``(summary, missing_players, playing_players)`` taken from
        the parsed reply, with ``""``, ``[]`` and ``[]`` used for absent keys.
        If no object can be found or parsed, ``("", [], [])`` is returned and
        the error plus the raw reply are printed.
    """
    prompt = f"""
You are a helpful assistant reading a sports news article. Ignoring the comments, your task is to:

1. Summarise the article in 2–3 sentences.
2. Extract the names of any players mentioned as potentially injured/suspended/missing and assess the likelihood they will miss the game (as certain/high/low/unsure).
3. Extract the names of players who definitely will be playing

Return your answer as a JSON with the fields:
- "summary": <string>
- "missing_players": <list of dicts with keys "player", "team", "reason", and "absence_likelihood">
- "playing_players": <list of dicts with keys "player", "team">

Here is the article content:
\"\"\"
{content}
\"\"\"
"""
    response = query_ollama(prompt)

    try:
        # Greedy match: spans from the first '{' to the last '}' in the reply.
        json_match = re.search(r"\{.*\}", response, re.DOTALL)
        if json_match is None:
            raise ValueError("No JSON object found in response")
        json_text = json_match.group()

        try:
            parsed = json.loads(json_text)
        except json.JSONDecodeError:
            # Second attempt: read the text as a Python literal instead.
            parsed = ast.literal_eval(json_text)

        summary = parsed.get("summary", "")
        missing_players = parsed.get("missing_players", [])
        playing_players = parsed.get("playing_players", [])
        return summary, missing_players, playing_players

    except Exception as e:
        print(f"Parsing error: {e}\nRaw response:\n{response}")
        return "", [], []

In [None]:
# Preview the first rows of the article frame.
df.head()

In [None]:
# Number of articles per season label.
df['season'].value_counts()

In [None]:
# Subset to the rows labelled season '24-25'. No gameweek filter is applied here.
# .copy() gives an independent frame, so adding columns to df_sub afterwards
# neither warns about writing to a slice of df nor depends on view/copy behaviour.
df_sub = df[df['season'] == '24-25'].copy()
df_sub.shape

In [None]:
# Per-article model outputs, collected in the same order as the rows of df_sub.
summaries, missing, playing = [], [], []

for _, article in tqdm(df_sub.iterrows(), total=len(df_sub)):
    article_summary, absentees, starters = extract_injuries_with_summary_ollama(article['content'])

    summaries.append(article_summary)
    missing.append(absentees)
    playing.append(starters)

In [None]:
# Attach the model outputs as new columns. assign() builds a new frame rather
# than writing into df_sub in place, which avoids pandas' SettingWithCopyWarning
# when df_sub was produced by filtering another frame.
df_sub = df_sub.assign(
    summary=summaries,
    missing_players=missing,
    confirmed_players=playing,
)

In [None]:
# Display the season subset together with the model-derived columns.
df_sub

In [None]:
# (rows, columns) of the season subset.
df_sub.shape

In [None]:
# Working name for the frame used in the filtering steps; this binds the same object, not a copy.
df_use = df_sub

In [None]:
# Keep only rows where both player columns actually hold a list.
for column in ('missing_players', 'confirmed_players'):
    holds_list = df_use[column].apply(lambda value: isinstance(value, list))
    df_use = df_use[holds_list]

In [None]:
# Drop rows in which any missing-player entry lacks a usable name. Each cell is
# a list of dicts, so every entry is inspected individually (indexing the list
# itself with a string key raises TypeError).
# NOTE(review): interpreted from the original comment as "drop the whole row
# when an entry's 'player' is empty" — confirm that rows, rather than single
# entries, are what should be removed.
has_named_players = df_use['missing_players'].apply(
    lambda entries: isinstance(entries, list) and all(
        isinstance(entry, dict) and entry.get('player') not in (None, '')
        for entry in entries
    )
).astype(bool)
df_use = df_use[has_named_players]

In [None]:
# subset to where a given player is mentioned
def player_subset(df, filter_name):
    """Return the rows whose ``missing_players`` mention ``filter_name``.

    Rows are selected on the ``missing_players`` column only. In the returned
    frame both ``missing_players`` and ``confirmed_players`` are narrowed to the
    entries whose ``'player'`` value contains ``filter_name`` (case-sensitive
    substring match).

    Entries that are not dicts, or whose ``'player'`` value is missing or not a
    string, never match and are skipped instead of raising; a cell that is not
    a list is treated as having no entries.

    Args:
        df: DataFrame with ``missing_players`` and ``confirmed_players``
            columns holding lists of dicts.
        filter_name: Substring to look for in each entry's ``'player'`` value.

    Returns:
        A new DataFrame with a fresh 0..n-1 index; ``df`` is not modified.
    """
    def _matching(entries):
        # Keep only well-formed entries that mention the requested player.
        if not isinstance(entries, list):
            return []
        return [
            entry
            for entry in entries
            if isinstance(entry, dict)
            and isinstance(entry.get('player'), str)
            and filter_name in entry['player']
        ]

    # astype(bool) keeps the mask boolean even when df has no rows.
    mentioned = df['missing_players'].apply(lambda entries: bool(_matching(entries))).astype(bool)
    player_mentions = df.loc[mentioned].reset_index(drop=True)

    return player_mentions.assign(
        missing_players=player_mentions['missing_players'].apply(_matching),
        confirmed_players=player_mentions['confirmed_players'].apply(_matching),
    )


In [None]:
# Inspect one row's missing-player list.
# NOTE(review): .loc selects by index label, not position; label 1 may no longer
# exist after the earlier filtering, in which case this raises KeyError.
df_use.loc[1, 'missing_players']

In [None]:
# (rows, columns) remaining after the filters.
df_use.shape

In [None]:
# Show the missing-player lists of the remaining rows.
df_use['missing_players']

In [None]:
# NOTE(review): same expression as the previous cell — likely a duplicate that can be removed.
df_use['missing_players']

In [None]:
# NOTE(review): the three notes below name a different example than the
# 'Haaland' filter used in the call — they look like leftovers; confirm.
# ben white
# 25-30 2024
# arsenal

df_haaland = player_subset(df_use, 'Haaland')


In [None]:
# Rows whose missing-player entries matched the 'Haaland' filter.
df_haaland

In [None]:
# Print, for each matching article, the filtered player entries followed by its URL.
for missing_entries, confirmed_entries, article_url in zip(
    df_haaland['missing_players'],
    df_haaland['confirmed_players'],
    df_haaland['url'],
):
    print(missing_entries)
    print(confirmed_entries)
    print(article_url)
