In [1]:
%pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [3]:
from dotenv import load_dotenv
import os
load_dotenv()

API_KEY = os.getenv("API_KEY")

# 1. Crawl and Scrape

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

Scrape all sections dynamically


In [None]:
def scrape_dynamic_sections(url):
    """Scrape all sections from a project page except 'Contact'."""
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch {url}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    sections = soup.find_all('section')
    results = {"URL": url}

    for section in sections:
        # Locate the section heading
        heading = section.find('h2')
        if not heading:
            continue

        section_title = heading.text.strip()
        if "contact" in section_title.lower():  # Skip 'Contact' section
            continue

        # Extract section content
        heading.decompose()  # Remove heading from the content
        content = section.get_text(strip=True)
        results[section_title] = content

    return results


 Extract Related Resources Links

In [None]:
def scrape_related_resources(url):
    """Extract links from the 'Related Resources' section."""
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch {url}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    related_resources_section = None

    for h2 in soup.find_all('h2'):
        if "related resources" in h2.text.strip().lower():
            related_resources_section = h2.find_parent('section')
            break

    if not related_resources_section:
        return []

    # Extract links
    links = []
    for a_tag in related_resources_section.find_all('a', href=True):
        href = a_tag['href']
        full_url = href if href.startswith('http') else f"https://www.irex.org{href}"
        links.append(full_url)

    return links

def scrape_partners_links(url):
    """Extract links from the 'Partners' section."""
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch {url}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    partners_section = None

    for h2 in soup.find_all('h2'):
        if "partners" in h2.text.strip().lower():
            partners_section = h2.find_parent('section')
            break

    if not partners_section:
        return []

    # Extract links
    links = []
    for a_tag in partners_section.find_all('a', href=True):
        href = a_tag['href']
        full_url = href if href.startswith('http') else f"https://www.irex.org{href}"
        links.append(full_url)

    return links


In [None]:
def scrape_project_page(url):
    """Scrape all sections and related resources, including partners, from a project page."""
    sections = scrape_dynamic_sections(url)
    related_resources = scrape_related_resources(url)
    partners_links = scrape_partners_links(url)

    if sections:
        sections["Related Resources"] = ", ".join(related_resources) if related_resources else "None"
        sections["Partners Links"] = ", ".join(partners_links) if partners_links else "None"
    return sections


In [None]:
def fetch_project_links(base_url, max_pages):
    """Fetch all project links from paginated pages."""
    all_links = []

    for page in range(max_pages + 1):  # Iterate through pages 0 to max_pages
        page_url = f"{base_url}?page={page}"
        print(f"Fetching links from: {page_url}")

        response = requests.get(page_url)
        if response.status_code != 200:
            print(f"Failed to fetch {page_url}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all project links on the current page
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            if '/project/' in href:  # Filter links matching the project pattern
                full_url = href if href.startswith('http') else f"https://www.irex.org{href}"
                all_links.append(full_url)

    return list(set(all_links))  # Remove duplicates

def crawl_projects(base_url, max_pages):
    """Crawl project pages and extract structured data."""
    project_links = fetch_project_links(base_url, max_pages)
    print(f"Found {len(project_links)} project links.")

    all_project_data = []

    for idx, link in enumerate(project_links):
        print(f"Processing {idx + 1}/{len(project_links)}: {link}")
        try:
            project_data = scrape_project_page(link)
            if project_data:
                all_project_data.append(project_data)
        except Exception as e:
            print(f"Error processing {link}: {e}")

        time.sleep(1)  # Be polite and avoid overwhelming the server

    return all_project_data

# Crawl the site
MAX_PAGES = 12  # Adjust based on pagination
BASE_URL = "https://www.irex.org/our-work"
project_data = crawl_projects(BASE_URL, MAX_PAGES)

# Save the data to an Excel file
df = pd.DataFrame(project_data)
df.to_excel("irex_projects.xlsx", index=False)
print("Data saved to irex_projects_with_partners.xlsx")


Fetching links from: https://www.irex.org/our-work?page=0
Fetching links from: https://www.irex.org/our-work?page=1
Fetching links from: https://www.irex.org/our-work?page=2
Fetching links from: https://www.irex.org/our-work?page=3
Fetching links from: https://www.irex.org/our-work?page=4
Fetching links from: https://www.irex.org/our-work?page=5
Fetching links from: https://www.irex.org/our-work?page=6
Fetching links from: https://www.irex.org/our-work?page=7
Fetching links from: https://www.irex.org/our-work?page=8
Fetching links from: https://www.irex.org/our-work?page=9
Fetching links from: https://www.irex.org/our-work?page=10
Fetching links from: https://www.irex.org/our-work?page=11
Fetching links from: https://www.irex.org/our-work?page=12
Found 97 project links.
Processing 1/97: https://www.irex.org/project/usaid-media-program
Processing 2/97: https://www.irex.org/project/media-empowerment-democratic-sri-lanka-mend
Processing 3/97: https://www.irex.org/project/civil-society-and

# 2. Use LLM for classification and summarization

In [None]:
pip install requests



In [None]:
import requests

# Hugging Face Inference API details
API_URL = "https://api-inference.huggingface.co/models/mistralai/Mixtral-8x7B-Instruct-v0.1"
HEADERS = {"Authorization": f"Bearer {API_KEY}"}

def classify_relevance(text):
    """
    Classify relevance using the improved prompt with examples.
    """
    prompt = f"""
    Determine if the following project is relevant to digital development or ICT4D. Respond only with 'Yes' or 'No'—do not provide any explanation.

    Examples:
    1. A project that uses ICT tools to improve education outcomes for rural students is relevant. Answer: Yes
    2. A project focused only on providing food aid during a crisis is not relevant. Answer: No
    3. A project that trains teachers on using digital platforms to deliver lessons is relevant. Answer: Yes
    4. A project addressing gender-based violence without any digital component is not relevant. Answer: No

    Text: {text}
    """
    payload = {"inputs": prompt, "parameters": {"return_full_text": False }}
    response = requests.post(API_URL, headers=HEADERS, json=payload)
    if response.status_code == 200:
        result = response.json()
        return result[0]["generated_text"].strip()
    else:
        print(f"Error: {response.status_code} - {response.text}")
        return None

In [None]:
def generate_summary(text):
    """
    Use Hugging Face Inference API to generate a summary for relevant projects.
    """
    payload = {
        "inputs": f"""
        Summarize the following project, including:
        - Project Name
        - Description
        - Key Statistics
        - Additional details about people, partners, and resources (if available).

        Text: {text}
        """
    }
    response = requests.post(API_URL, headers=HEADERS, json=payload)
    if response.status_code == 200:
        result = response.json()
        return result[0]["generated_text"].strip()
    else:
        print(f"Error: {response.status_code} - {response.text}")
        return None


In [None]:
import pandas as pd

# Load the dataset
df = pd.read_excel("irex_projects.xlsx")

# Columns to include for classification and summarization
relevant_columns = [
    "Overview", "Goals", "Project Activities", "Quick Facts", "How It Works",
    "Activities", "Results", "Evidence and Results", "Success Stories",
    "Initiative Highlights", "Goals and Objectives", "Training Options",
    "Activities and Timeline", "Case Study: A Tech Company in Nigeria"
]

# Filter and combine relevant fields
df["Combined Text"] = df[relevant_columns].fillna("").apply(" ".join, axis=1)


In [None]:
# Apply classification to determine relevance
df["Relevance"] = df["Combined Text"].apply(classify_relevance)

# Save the results
df.to_excel("classified_projects.xlsx", index=False)
print("Relevance classification completed and saved!")


Relevance classification completed and saved!


In [None]:
test_text = "IREX’s Girls’ Learning Through Technology (GLTT) project in Kenya uses a technology-based approach to support girls’ education and development of ICT skills in low-tech schools."
response = classify_relevance(test_text)
print(f"Response: {response}")


Response: Answer: Yes


In [None]:
# Filter for relevant projects
relevant_projects = df[df["Relevance"] == "Yes"]

# Generate summaries for relevant projects
relevant_projects["Summary"] = relevant_projects["Combined Text"].apply(generate_summary)

# Save the summarized data
relevant_projects.to_excel("relevant_projects_with_summaries.xlsx", index=False)
print("Summarization completed and saved!")
