<a href="https://colab.research.google.com/github/JP7599/AI-Article-Data-Scraper/blob/main/AIdatascraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install openai beautifulsoup4
!pip install --upgrade openai

Collecting openai
  Using cached openai-1.57.4-py3-none-any.whl.metadata (24 kB)
Using cached openai-1.57.4-py3-none-any.whl (390 kB)
Installing collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.55.3
    Uninstalling openai-1.55.3:
      Successfully uninstalled openai-1.55.3
Successfully installed openai-1.57.4


In [None]:
  import os
  import requests
  from bs4 import BeautifulSoup
  import re
  import json
  import pandas as pd
  from openai import OpenAI
  from openai import ChatCompletion

In [None]:
# Print the metadata (version, location, requirements) of the installed openai package.
!pip show openai


Name: openai
Version: 1.55.3
Summary: The official Python library for the openai API
Home-page: https://github.com/openai/openai-python
Author: 
Author-email: OpenAI <support@openai.com>
License: Apache-2.0
Location: /usr/local/lib/python3.10/dist-packages
Requires: anyio, distro, httpx, jiter, pydantic, sniffio, tqdm, typing-extensions
Required-by: 


In [None]:

# Configuration

# Credentials are read from the environment when available so that real keys
# never have to be written into the notebook. "#" is the placeholder value
# used when the corresponding environment variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "#")  # your OpenAI API key
BING_API_KEY = os.environ.get("BING_API_KEY", "#")  # your Bing Search API key
BING_SEARCH_ENDPOINT = "https://api.bing.microsoft.com/v7.0/search"

# Queries used to discover relevant links.
# NOTE(review): these queries target AI patent forecasts, while create_prompt()
# asks about climate change and agriculture -- confirm which topic is intended.
queries = [
    "AI product patent growth for the future",
    "forecast for AI patent filings",
    "AI patent trend projections"
]
# Number of links fetched per query.
links_per_query = 10

In [None]:

# Functions

def search_links(query, count=10):
    """Search Bing for ``query`` and return the URLs of the web results.

    Args:
        query: Search string sent to the Bing Web Search endpoint.
        count: Number of results requested from the API.

    Returns:
        A list of result URLs in the order the API returned them. The list is
        empty when the request fails, the API answers with an error status,
        the body is not valid JSON, or the response contains no web pages.
    """
    headers = {"Ocp-Apim-Subscription-Key": BING_API_KEY}
    params = {
        "q": query,
        "count": count,
        "textDecorations": True,
        "textFormat": "HTML",
        "safeSearch": "Off"
    }
    try:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(
            BING_SEARCH_ENDPOINT, headers=headers, params=params, timeout=10
        )
        # Surface HTTP errors (e.g. an invalid key) instead of silently
        # treating the error body as "no results".
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        print(f"Bing search failed for {query!r}: {exc}")
        return []

    # "webPages" or "value" may be missing or null; treat both as no results.
    web_pages = (data.get("webPages") or {}).get("value") or []
    return [page["url"] for page in web_pages if page.get("url")]

def scrape_and_clean_text(url):
    """Download ``url`` and return its text content with whitespace collapsed.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page text as a single-spaced, stripped string, or ``None`` when
        the request fails, the status code is not 200, or parsing raises.
    """
    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            return None
        soup = BeautifulSoup(response.text, "html.parser")

        # Drop scripts, styles and page chrome before extracting the text.
        for tag in soup(["script", "style", "header", "footer", "nav"]):
            tag.decompose()

        text = soup.get_text(separator=" ")
        return re.sub(r"\s+", " ", text).strip()
    except Exception:
        # Best-effort scraping: any fetch or parse error means "no text".
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so a long scraping run can still be stopped.
        return None

def create_prompt(article_text):
    """Build the extraction prompt that wraps ``article_text``.

    The instructions, the JSON schema and the per-field descriptions are
    fixed; only the article text between the dashed separators varies.
    """
    # Plain (non-f) template: doubled braces render as literal JSON braces and
    # the single {article_text} placeholder is filled in by str.format below.
    template = """
        You are an expert in analyzing the impact of climate change on agriculture.
        From the provided text, extract the following numerical and boolean information.

        If information is not directly mentioned, return null for numeric fields and false for booleans.

        Return result as JSON (no extra text):
        {{
          "annual_crop_yield_change_percentage": null or a number,
          "projected_temperature_rise_by_year": [{{"year": 2025, "temperature_rise_celsius": number}}, ...],
          "projected_precipitation_change_by_year": [{{"year": 2025, "precipitation_change_percentage": number}}, ...],
          "dominant_region_impact_percentage": null or a number,
          "global_crop_failure_probability": null or a number,
          "major_affected_crops": ["crop1", "crop2", ...],
          "annual_soil_quality_degradation_rate": null or a number,
          "key_adaptation_strategy_count": number,
          "water_resource_availability_change": null or a number,
          "agriculture_sector_emissions_percentage": null or a number,
          "climate_policy_initiatives_count": number,
          "economic_loss_estimate_by_year": [{{"year": 2025, "loss_billion_usd": number}}, ...],
          "migration_due_to_agriculture_loss": null or a number,
          "technology_investment_growth_rate": null or a number,
          "food_security_index_change": null or a number,
          "biodiversity_loss_percentage": null or a number,
          "forecasted_population_impact_by_year": [{{"year": 2025, "population_affected": number}}, ...],
          "livelihood_dependency_change": null or a number,
          "region_with_highest_innovation_adoption": "region_name" or null,
          "farm_income_variation_by_year": [{{"year": 2025, "income_variation_percentage": number}}, ...],
          "carbon_sequestration_by_agriculture": null or a number
        }}

        - annual_crop_yield_change_percentage: Percentage change in crop yield annually.
        - projected_temperature_rise_by_year: Yearly rise in temperature forecasts.
        - projected_precipitation_change_by_year: Yearly change in precipitation forecasts.
        - dominant_region_impact_percentage: Percentage of impact in the most affected region.
        - global_crop_failure_probability: Probability of global crop failure in percentage.
        - major_affected_crops: Crops most impacted by climate change.
        - annual_soil_quality_degradation_rate: Rate of soil degradation per year.
        - key_adaptation_strategy_count: Count of strategies for adapting to impacts.
        - water_resource_availability_change: Change in water resources due to climate change.
        - agriculture_sector_emissions_percentage: Percentage of emissions from agriculture.
        - climate_policy_initiatives_count: Number of policies or initiatives mentioned.
        - economic_loss_estimate_by_year: Economic losses per year in billions of USD.
        - migration_due_to_agriculture_loss: Number of people displaced due to agriculture loss.
        - technology_investment_growth_rate: Annual growth rate of investment in agriculture technologies.
        - food_security_index_change: Change in food security index annually.
        - biodiversity_loss_percentage: Loss of biodiversity in agricultural regions.
        - forecasted_population_impact_by_year: Population forecasted to be impacted annually.
        - livelihood_dependency_change: Change in reliance on agriculture for livelihood.
        - region_with_highest_innovation_adoption: Region leading in climate-smart innovation.
        - farm_income_variation_by_year: Yearly variation in farm income in percentage.
        - carbon_sequestration_by_agriculture: Amount of carbon captured annually by agriculture.

        Text:
        ---------------------
        {article_text}
        ---------------------
        """
    return template.format(article_text=article_text)


def call_llm(prompt, model="gpt-3.5-turbo", max_tokens=1000):
    """Send ``prompt`` to the OpenAI chat completions API and return the reply.

    Args:
        prompt: Text sent as the user message.
        model: Name of the chat model to query.
        max_tokens: Upper bound on the number of tokens generated for the reply.

    Returns:
        The message content of the first choice in the response. The system
        message asks for clean JSON, but the reply is not validated here.
    """
    client = OpenAI(api_key=OPENAI_API_KEY)
    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": "You are a helpful assistant that returns clean JSON."},
            {"role": "user", "content": prompt}
        ],
        # Pass the caller's model through instead of a fixed model name, so
        # the ``model`` argument actually takes effect.
        model=model,
        max_tokens=max_tokens
    )
    return response.choices[0].message.content


In [None]:

# Test run 1 URL
# Runs the scrape -> prompt -> LLM -> JSON -> DataFrame chain on a single
# article so the output can be inspected before the full pipeline is enabled.

# Extracted fields in column order (after "source_url").
_TEST_FIELD_ORDER = (
    "annual_crop_yield_change_percentage",
    "projected_temperature_rise_by_year",
    "projected_precipitation_change_by_year",
    "dominant_region_impact_percentage",
    "global_crop_failure_probability",
    "major_affected_crops",
    "annual_soil_quality_degradation_rate",
    "key_adaptation_strategy_count",
    "water_resource_availability_change",
    "agriculture_sector_emissions_percentage",
    "climate_policy_initiatives_count",
    "economic_loss_estimate_by_year",
    "migration_due_to_agriculture_loss",
    "technology_investment_growth_rate",
    "food_security_index_change",
    "biodiversity_loss_percentage",
    "forecasted_population_impact_by_year",
    "livelihood_dependency_change",
    "region_with_highest_innovation_adoption",
    "farm_income_variation_by_year",
    "carbon_sequestration_by_agriculture",
)
# Fields that fall back to an empty list (rather than None) when missing.
_TEST_LIST_FIELDS = frozenset({
    "projected_temperature_rise_by_year",
    "projected_precipitation_change_by_year",
    "major_affected_crops",
    "economic_loss_estimate_by_year",
    "forecasted_population_impact_by_year",
    "farm_income_variation_by_year",
})

print("Starting test run...")
test_url = "https://www.nature.org/en-us/newsroom/growing-threats-how-climate-change-will-exacerbate-environmental-impacts-agriculture/"
test_article_text = scrape_and_clean_text(test_url)

if test_article_text:
    test_prompt = create_prompt(test_article_text)
    test_raw_output = call_llm(test_prompt)
    print("LLM Raw Output:\n", test_raw_output)

    try:
        test_data = json.loads(test_raw_output)
        print("Parsed JSON:\n", test_data)

        # Single-row DataFrame for inspection: the source URL first, then every
        # extracted field with its missing-value default.
        test_df = pd.DataFrame([{
            "source_url": test_url,
            **{
                name: (
                    test_data.get(name, [])
                    if name in _TEST_LIST_FIELDS
                    else test_data.get(name)
                )
                for name in _TEST_FIELD_ORDER
            },
        }])

        print("\nTest DataFrame:\n", test_df)
    except json.JSONDecodeError:
        print("JSON parsing failed for the test run. Check LLM output or prompt.")
else:
    print(f"{test_url} URL does not work.")

print("Test run completed. If this looks good, proceed with the full pipeline.")


Starting test run...
LLM Raw Output:
 {
    "annual_crop_yield_change_percentage": null,
    "projected_temperature_rise_by_year": [
        {
            "year": 2025,
            "temperature_rise_celsius": null
        }
    ],
    "projected_precipitation_change_by_year": [
        {
            "year": 2025,
            "precipitation_change_percentage": null
        }
    ],
    "dominant_region_impact_percentage": null,
    "global_crop_failure_probability": null,
    "major_affected_crops": [
        "rice",
        "soybean",
        "corn",
        "wheat"
    ],
    "annual_soil_quality_degradation_rate": null,
    "key_adaptation_strategy_count": null,
    "water_resource_availability_change": null,
    "agriculture_sector_emissions_percentage": null,
    "climate_policy_initiatives_count": null,
    "economic_loss_estimate_by_year": [
        {
            "year": 2025,
            "loss_billion_usd": null
        }
    ],
    "migration_due_to_agriculture_loss": null,
   

In [None]:

# Main Pipeline (run only after the test cell works, so API credits are not wasted)
# The pipeline is kept inside a string literal so that executing this cell makes
# no API calls; remove the triple quotes to run it.
'''
all_links = []
for q in queries:
    found_links = search_links(q, count=links_per_query)
    all_links.extend(found_links)

# Deduplicate links
all_links = list(set(all_links))

all_extracted_data = []
for url in all_links:
    print(f"Processing: {url}")
    article_text = scrape_and_clean_text(url)
    if not article_text or len(article_text) < 500:  #skip very short texts
        continue

    prompt = create_prompt(article_text)
    raw_output = call_llm(prompt)

    try:
        data = json.loads(raw_output)
    except json.JSONDecodeError:
        print(f"Failed to parse JSON from {url}. Output:\n{raw_output}")
        continue

    # Build the row from this article's own URL and parsed output
    # (not from the test cell's test_url / test_data).
    entry = {
            "source_url": url,
            "annual_crop_yield_change_percentage": data.get("annual_crop_yield_change_percentage"),
            "projected_temperature_rise_by_year": data.get("projected_temperature_rise_by_year", []),
            "projected_precipitation_change_by_year": data.get("projected_precipitation_change_by_year", []),
            "dominant_region_impact_percentage": data.get("dominant_region_impact_percentage"),
            "global_crop_failure_probability": data.get("global_crop_failure_probability"),
            "major_affected_crops": data.get("major_affected_crops", []),
            "annual_soil_quality_degradation_rate": data.get("annual_soil_quality_degradation_rate"),
            "key_adaptation_strategy_count": data.get("key_adaptation_strategy_count"),
            "water_resource_availability_change": data.get("water_resource_availability_change"),
            "agriculture_sector_emissions_percentage": data.get("agriculture_sector_emissions_percentage"),
            "climate_policy_initiatives_count": data.get("climate_policy_initiatives_count"),
            "economic_loss_estimate_by_year": data.get("economic_loss_estimate_by_year", []),
            "migration_due_to_agriculture_loss": data.get("migration_due_to_agriculture_loss"),
            "technology_investment_growth_rate": data.get("technology_investment_growth_rate"),
            "food_security_index_change": data.get("food_security_index_change"),
            "biodiversity_loss_percentage": data.get("biodiversity_loss_percentage"),
            "forecasted_population_impact_by_year": data.get("forecasted_population_impact_by_year", []),
            "livelihood_dependency_change": data.get("livelihood_dependency_change"),
            "region_with_highest_innovation_adoption": data.get("region_with_highest_innovation_adoption"),
            "farm_income_variation_by_year": data.get("farm_income_variation_by_year", []),
            "carbon_sequestration_by_agriculture": data.get("carbon_sequestration_by_agriculture"),
        }
    all_extracted_data.append(entry)

df = pd.DataFrame(all_extracted_data)
print(df.head())

# Save to CSV
df.to_csv("ai_patent_forecast_data.csv", index=False)
print("Full pipeline run completed. Data saved to ai_patent_forecast_data.csv.")

# Remove the triple quotes and run the full pipeline after verifying the test.
'''

'\nall_links = []\nfor q in queries:\n    found_links = search_links(q, count=links_per_query)\n    all_links.extend(found_links)\n\n# Deduplicate links\nall_links = list(set(all_links))\n\nall_extracted_data = []\nfor url in all_links:\n    print(f"Processing: {url}")\n    article_text = scrape_and_clean_text(url)\n    if not article_text or len(article_text) < 500:  #skip very short texts\n        continue\n\n    prompt = create_prompt(article_text)\n    raw_output = call_llm(prompt)\n\n    try:\n        data = json.loads(raw_output)\n    except json.JSONDecodeError:\n        print(f"Failed to parse JSON from {url}. Output:\n{raw_output}")\n        continue\n\n    entry = {\n            "source_url": test_url,\n            "annual_crop_yield_change_percentage": test_data.get("annual_crop_yield_change_percentage"),\n            "projected_temperature_rise_by_year": test_data.get("projected_temperature_rise_by_year", []),\n            "projected_precipitation_change_by_year": test_data