**Install packages.**

In [None]:
# Install the third-party packages used by the cells below.
# NOTE(review): lxml_html_clean is never imported directly in this notebook —
# presumably it is a runtime requirement of newspaper3k; confirm before removing.
!pip install spacy pandas rapidfuzz newspaper3k lxml_html_clean validators
# Download the English transformer pipeline that is later loaded with
# spacy.load("en_core_web_trf").
!python -m spacy download en_core_web_trf

**Mount Google Drive.**

In [None]:
# Mount Google Drive and derive the project directory used for all outputs.

import os
from google.colab import drive

# Mount point of Google Drive inside the Colab runtime.
base_path = '/content/drive'

# Project folder, relative to the mount point.
folder_path = 'MyDrive/Colab Notebooks/Crafting Tech'

# Mount the drive (force_remount=True is passed so an existing mount is redone).
drive.mount(base_path, force_remount=True)

# Full path of the project folder on the mounted drive; read by the run cell.
full_project_path = os.path.join(base_path, folder_path)

Mounted at /content/drive


**Taxonomies**

In [None]:
# Taxonomy of post-war problems: category name -> keywords that are
# fuzzy-matched against article sentences.
taxonomy = {
    "Post-War Problems": {
        "Security": [
            "landmines",
            "militia violence",
            "arms trafficking",
            "civil unrest",
            "violence",
        ],
        "Governance": [
            "corruption",
            "lack of elections",
            "political instability",
            "power vacuum",
        ],
        "Economy": [
            "unemployment",
            "collapsed economy",
            "poverty",
            "inflation",
        ],
        "Health": [
            "trauma",
            "disease outbreaks",
            "mental health",
            "lack of hospitals",
            "malnutrition",
        ],
        "Displacement": [
            "refugees",
            "internally displaced persons",
            "repatriation",
            "migration",
        ],
        "Infrastructure": [
            "destroyed infrastructure",
            "lack of electricity",
            "damaged roads",
            "communication breakdown",
        ],
    },
}

# Conflict-type labels that article text is matched against.
conflict_types = [
    "civil war",
    "ethnic conflict",
    "religious conflict",
    "invasion",
    "border war",
    "insurgency",
    "terrorism",
    "revolution",
    "proxy war",
    "occupation",
]

**Functions.**

In [None]:
# NLP, text extraction, and matching
import spacy
from newspaper import Article
from rapidfuzz import fuzz, process

# Data handling and utilities
import pandas as pd
import numpy as np
import os, re, json, datetime
from typing import List, Dict
from collections import Counter
import validators
from dateutil import parser as date_parser

# Load the SpaCy transformer model, using the GPU when one is available.
# spacy.prefer_gpu() returns False and keeps running on the CPU instead of
# raising, so the notebook also works (more slowly) on CPU-only runtimes.
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_trf")

# Extract article text from a URL
def extract_text_from_url(url: str) -> str:
    """
    Fetch a page with newspaper3k and return its parsed article text.

    Any exception raised while downloading or parsing is reported on stdout
    and an empty string is returned instead of the text.
    """
    page = Article(url)
    try:
        page.download()
        page.parse()
        extracted_text = page.text
    except Exception as err:
        print(f"❌ Failed to extract from URL: {url}\nReason: {err}")
        extracted_text = ""
    return extracted_text

def ensure_file_exists(filename):
    """
    Create ``filename`` if it does not exist yet; existing files are left untouched.

    Missing parent directories are created first, so the call also works for a
    target inside a folder that has not been created yet. A new file whose name
    ends in ".csv" starts with the CSV header row; any other new file is empty.
    The confirmation message is printed only after the file has been written.
    """
    if os.path.exists(filename):
        return

    # open() cannot create intermediate folders, so make them up front.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, 'w', encoding='utf-8') as f:
        if filename.endswith(".csv"):
            # First line of the .csv file.
            f.write("region,actors,problem,description,timeline,conflict_type,link\n")

    print(f"✅ Created file: {filename}")

def extractActors(doc, percentile=90, score_cutoff=85, top_n=5):
    """
    Return up to ``top_n`` prominent actor names found in ``doc``.

    Actors are the texts of entities labelled ORG, PERSON or NORP. A name is a
    candidate when its mention count reaches the given ``percentile`` of all
    mention counts (never less than 1). Each candidate is fuzzy-matched
    (``fuzz.partial_ratio``, minimum ``score_cutoff``) against the full mention
    list, and the matches are ranked by score, then by mention count.
    Returns ``["Unknown"]`` when nothing qualifies.
    """
    actor_labels = ("ORG", "PERSON", "NORP")
    mentions = [ent.text for ent in doc.ents if ent.label_ in actor_labels]
    mention_counts = Counter(mentions)

    if not mention_counts:
        return ["Unknown"]

    # Adaptive frequency threshold derived from the count distribution.
    min_count = max(1, int(np.percentile(list(mention_counts.values()), percentile)))

    # NOTE(review): every candidate is matched against the list it was taken
    # from — confirm that this is the intended way to merge name variants.
    scored = []
    for name, count in mention_counts.items():
        if count < min_count:
            continue
        best = process.extractOne(name, mentions, scorer=fuzz.partial_ratio, score_cutoff=score_cutoff)
        if best:
            scored.append((best[0], best[1]))

    ranked = sorted(scored, key=lambda pair: (pair[1], mention_counts[pair[0]]), reverse=True)

    # Keep the first top_n distinct names in ranking order.
    chosen = []
    for name, _score in ranked:
        if name in chosen:
            continue
        chosen.append(name)
        if len(chosen) == top_n:
            break

    return chosen or ["Unknown"]

def extractRegion(doc, percentile=90, score_cutoff=85, top_n=5):
    """
    Return up to ``top_n`` prominent geographical names found in ``doc``.

    Regions are the texts of entities labelled GPE or LOC. Only names whose
    mention count reaches the ``percentile`` of all counts (at least 1) are
    considered; each is fuzzy-matched (``fuzz.partial_ratio``, minimum
    ``score_cutoff``) against all mentions and the matches are ordered by
    score, then by mention count. Returns ``["Unknown"]`` if none remain.
    """
    place_mentions = [ent.text for ent in doc.ents if ent.label_ in ("GPE", "LOC")]
    tally = Counter(place_mentions)

    if not tally:
        return ["Unknown"]

    # Adaptive threshold: the requested percentile of the counts, floored at 1.
    cutoff_count = max(1, int(np.percentile(list(tally.values()), percentile)))
    candidates = [place for place, seen in tally.items() if seen >= cutoff_count]

    # Fuzzy match each candidate against the complete mention list.
    hits = [
        process.extractOne(place, place_mentions, scorer=fuzz.partial_ratio, score_cutoff=score_cutoff)
        for place in candidates
    ]
    scored = [(hit[0], hit[1]) for hit in hits if hit]
    scored.sort(key=lambda item: (item[1], tally[item[0]]), reverse=True)

    # Collect distinct names in ranking order until top_n is reached.
    picked = []
    for place, _score in scored:
        if place not in picked:
            picked.append(place)
            if len(picked) == top_n:
                break

    return picked if picked else ["Unknown"]


def extractTimeline(doc):
    """
    Extracts the most relevant date associated with post-war problems.

    A DATE entity is considered when its text contains four consecutive digits
    and a taxonomy problem keyword occurs within five tokens on either side.
    The text is parsed with dateutil; parsed dates from 1990 onwards become
    candidates, scored by how many digits the original text contains.

    Returns the ISO date (YYYY-MM-DD) of the highest-scoring candidate (the
    first one on ties), or "Unknown" when there is no candidate.

    Components missing from the text are taken from a fixed default
    (0001-01-01) rather than from today's date, so the result does not depend
    on the day the notebook is run: "March 2014" always yields 2014-03-01.
    A text without a parsable year ends up in year 1 and is removed by the
    1990 cutoff instead of silently receiving the current year.
    """
    problem_keywords = [
        kw.lower()
        for keywords in taxonomy["Post-War Problems"].values()
        for kw in keywords
    ]
    # Without an explicit default, dateutil fills gaps from "now", which makes
    # results vary by run date and fails for e.g. "February 2014" on the 31st.
    default_date = datetime.datetime(1, 1, 1)
    date_candidates = []

    for ent in doc.ents:
        if ent.label_ != "DATE":
            continue

        date_text = ent.text.strip()

        # Only consider if the date includes a year.
        if not re.search(r"\d{4}", date_text):
            continue

        # Check if the surrounding tokens mention any relevant problem keyword.
        context_window = doc[max(ent.start - 5, 0): ent.end + 5].text.lower()
        if not any(kw in context_window for kw in problem_keywords):
            continue

        try:
            parsed_date = date_parser.parse(date_text, fuzzy=True, default=default_date)
        except Exception:
            # Deliberately best-effort: unparsable date texts are skipped.
            continue

        # Filter out dates before 1990 (and texts without a parsable year).
        if parsed_date.year >= 1990:
            specificity_score = len(re.findall(r"\d", date_text))
            date_candidates.append((parsed_date.strftime("%Y-%m-%d"), specificity_score))

    if not date_candidates:
        return "Unknown"

    # More digits = more specific date; max() keeps the first on ties.
    return max(date_candidates, key=lambda candidate: candidate[1])[0]

def extractConflictType(doc, upper_cutoff=95, lower_cutoff=70, step=5, window_size=3):
    """
    Matches document content to predefined conflict types using fuzzy matching.

    The document's sentences are lower-cased and joined into consecutive,
    non-overlapping windows of ``window_size`` sentences. Every window is
    matched against ``conflict_types`` with ``fuzz.partial_ratio``. Matching
    starts at ``upper_cutoff`` and the cutoff is lowered by ``step`` until at
    least one window matches or the cutoff falls below ``lower_cutoff``.

    Returns the conflict type matched by the most windows, or "Unknown" if no
    window matched at any cutoff.

    The sentence list and window texts are built once instead of on every
    window of every pass. A non-positive ``step`` results in a single pass at
    ``upper_cutoff`` instead of an endless loop.
    """
    current_cutoff = upper_cutoff
    if current_cutoff < lower_cutoff:
        return "Unknown"

    # Loop-invariant: materialise sentences and window texts a single time.
    sentences = [sent.text.lower() for sent in doc.sents]
    window_texts = [
        " ".join(sentences[i:i + window_size])
        for i in range(0, len(sentences), window_size)
    ]

    type_scores = []
    while current_cutoff >= lower_cutoff and not type_scores:
        for window_text in window_texts:
            result = process.extractOne(
                window_text, conflict_types, scorer=fuzz.partial_ratio, score_cutoff=current_cutoff)
            if result:
                type_scores.append(result[0])

        # A cutoff that never decreases cannot produce new matches.
        if step <= 0:
            break

        # If no match is found, lower the cutoff value.
        current_cutoff -= step

    if not type_scores:
        return "Unknown"

    # Return the most frequent conflict type found.
    return Counter(type_scores).most_common(1)[0][0]

def extractProblems(doc, taxonomy, regions, actors, timeline, conflict_type, score_cutoff=85):
    """
    Collect one record for every sentence that mentions a post-war problem.

    Each lower-cased sentence is fuzzy-matched (``fuzz.partial_ratio``, minimum
    ``score_cutoff``) against the keyword list of every category in
    ``taxonomy["Post-War Problems"]``, in dictionary order. The first category
    that matches yields a record and ends the search for that sentence.

    Every record is a dict with the keys "regions", "actors", "problem" (the
    matched keyword), "description" (the original sentence text), "timeline"
    and "conflict_type"; the context values are passed through unchanged.
    """
    records = []
    for sent in doc.sents:
        lowered = sent.text.lower()
        for keywords in taxonomy["Post-War Problems"].values():
            hit = process.extractOne(
                lowered, keywords, scorer=fuzz.partial_ratio, score_cutoff=score_cutoff)
            if not hit:
                continue

            # The matched choice is normally a string; otherwise use its first item.
            keyword = hit[0] if isinstance(hit[0], str) else hit[0][0]
            if not keyword:
                continue

            records.append({
                "regions": regions,
                "actors": actors,
                "problem": keyword,
                "description": sent.text,
                "timeline": timeline,
                "conflict_type": conflict_type
            })
            break  # One record per sentence: continue with the next sentence.

    return records

def extract_entities_and_context(doc, taxonomy: dict) -> List[Dict]:
    """
    Run all extractors on a SpaCy doc and return the problem records.

    Actors, regions, a timeline and the conflict type are determined once for
    the whole document and attached to every problem record produced by
    ``extractProblems``.
    """
    actors = extractActors(doc, percentile=90)
    regions = extractRegion(doc, percentile=90, score_cutoff=85, top_n=5)

    # Timeline: text of the first DATE entity that contains four consecutive
    # digits. (extractTimeline(doc) exists as an alternative but is not used.)
    timeline = "Unknown"
    for ent in doc.ents:
        if ent.label_ == "DATE" and re.search(r"\d{4}", ent.text):
            timeline = ent.text
            break

    conflict_type = extractConflictType(
        doc, upper_cutoff=95, lower_cutoff=70, step=5, window_size=3)

    return extractProblems(
        doc, taxonomy, regions, actors, timeline, conflict_type, score_cutoff=90)

def _load_json_records(path):
    """Return all JSON values stored in ``path`` as one flat list ([] if absent or empty)."""
    if not os.path.exists(path):
        return []
    with open(path, 'r', encoding='utf-8') as fh:
        raw = fh.read()

    decoder = json.JSONDecoder()
    records = []
    pos, end = 0, len(raw)
    while True:
        # raw_decode() does not accept leading whitespace, so skip it manually.
        while pos < end and raw[pos].isspace():
            pos += 1
        if pos >= end:
            return records
        value, pos = decoder.raw_decode(raw, pos)
        # A stored array is merged in; a bare object (older files that hold
        # back-to-back objects) is appended as a single record.
        if isinstance(value, list):
            records.extend(value)
        else:
            records.append(value)


def saveFiles(data, output_file, file_name, save_csv=False, save_json=False):
    """
    Saves extracted data to CSV and/or JSON and prints a confirmation message.

    data:        list of dicts, one per extracted entry.
    output_file: path of the file that is written.
    file_name:   name shown in the confirmation message only.
    save_csv:    append the entries as CSV rows. A header row is written only
                 when the target is missing or empty, so files prepared with a
                 header keep exactly one header line.
    save_json:   add the entries to the JSON array stored in ``output_file``.
                 The file always holds one valid JSON array; entries already in
                 the file (including back-to-back objects written by earlier
                 versions) are preserved. If the existing content is not JSON
                 at all, the entries are appended object by object instead of
                 failing.
    """
    if save_csv:
        df = pd.DataFrame(data)
        target_is_empty = not os.path.exists(output_file) or os.path.getsize(output_file) == 0
        df.to_csv(output_file, mode='a', header=target_is_empty and not df.empty,
                  index=False, encoding='utf-8')
        print(f"✅ Saved to {file_name}")
    if save_json:
        try:
            records = _load_json_records(output_file)
        except json.JSONDecodeError:
            records = None  # Existing content is not JSON; fall back to appending.

        if records is None:
            with open(output_file, 'a', encoding='utf-8') as json_file:
                for entry in data:
                    json.dump(entry, json_file, indent=4)
                    json_file.write('\n')
        else:
            records.extend(data)
            # Rewrite the whole array so the file stays parseable by json.load().
            with open(output_file, 'w', encoding='utf-8') as json_file:
                json.dump(records, json_file, indent=4)
                json_file.write('\n')
        print(f"✅ Saved to {file_name}")

def analyze_article(url: str, name: str, project_path: str):
    """
    End-to-end pipeline for a single article.

    1. Validate the URL and download the article text.
    2. Process the text with the SpaCy pipeline.
    3. Extract the structured conflict-related entries and tag them with the URL.
    4. Write the entries to timestamped CSV and JSON files in ``project_path``,
       named ``{name}_problems_{timestamp}``.

    Progress and failures are reported on stdout; nothing is returned.
    """
    if not validators.url(url):
        print(f"❌ Invalid URL: {url}")
        return

    text = extract_text_from_url(url)
    if not text.strip():
        print(f"❌ Failed to extract from URL: {url}")
        return

    print("Starting text processing.")
    extracted = extract_entities_and_context(nlp(text), taxonomy)

    # Record the source article on every entry.
    for entry in extracted:
        entry["link"] = url

    if not extracted:
        print("No new entries found.")
        return

    print("------Found new entries------")

    # Both outputs share one timestamped stem.
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    stem = f"{name}_problems_{timestamp}"

    # (extension, answer, saveFiles flags, message when skipped). The answers
    # are fixed to "y" in place of an interactive y/n prompt.
    outputs = (
        ("csv", "y", {"save_csv": True}, "❌ Skipping saving to CSV."),
        ("json", "y", {"save_json": True}, "❌ Skipping saving to JSON."),
    )
    for extension, answer, flags, skip_message in outputs:
        out_name = f"{stem}.{extension}"
        out_path = os.path.join(project_path, out_name)
        if answer.lower().strip() == "y":
            ensure_file_exists(out_path)
            saveFiles(extracted, out_path, out_name, **flags)
        else:
            print(skip_message)

**Run code.**

In [None]:
if __name__ == "__main__":

    # Label for this problem compilation; used as the prefix of the output file names.
    name = "GazaIsrael"

    # Sub-folder of the project directory in which the output files are stored.
    folder = "problems1"

    # Article to analyze. Uncomment one of the alternatives to analyze it instead.
    #url = "https://en.wikipedia.org/wiki/South_Sudanese_Civil_War"
    #url = "https://en.wikipedia.org/wiki/Russo-Ukrainian_War"
    url = "https://www.bbc.com/news/articles/cx2vz02e7g8o"

    # Build the output directory path (full_project_path comes from the
    # Drive-mount cell) and run the pipeline.
    full_path = os.path.join(full_project_path, folder)
    analyze_article(url, name, full_path)

Starting text processing.
------Found new entries------
✅ Created file: /content/drive/MyDrive/Colab Notebooks/Crafting Tech/problems1/GazaIsrael_problems_20250504_150108.csv
✅ Saved to GazaIsrael_problems_20250504_150108.csv
✅ Created file: /content/drive/MyDrive/Colab Notebooks/Crafting Tech/problems1/GazaIsrael_problems_20250504_150108.json
✅ Saved to GazaIsrael_problems_20250504_150108.json
