<a href="https://colab.research.google.com/github/MensureLab/DandaraData/blob/main/DandaraData_Rev3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.1/3.1 MB 22.3 MB/s eta 0:00:00
Installing collected packages: rapidfuzz
Successfully installed rapidfuzz-3.13.0


In [4]:
#LABORATÓRIO: MensureLab - UFPA.
#AUTORES: @RamiroKord; @Josafha-pereira (GitHub)

from rapidfuzz import fuzz, process, utils
from rapidfuzz.distance import Levenshtein
import pandas as pd
import re
import unicodedata
from typing import List, Dict, Optional, Set, Tuple
from collections import defaultdict
import sys
from contextlib import redirect_stdout
import io
import csv


# =============================================================================
# UTILITY FUNCTIONS
# =============================================================================

def load_communities_csv(file_path: str, *, sep: str = ';', encoding: str = 'latin-1') -> pd.DataFrame:
    """Load the communities table from a delimited text file.

    Args:
        file_path: Location of the file; passed straight to ``pandas.read_csv``.
        sep: Field delimiter. Defaults to ``';'``, the value this loader
            has always used.
        encoding: Text encoding of the file. Defaults to ``'latin-1'``, the
            value this loader has always used.

    Returns:
        The parsed table as a DataFrame.
    """
    return pd.read_csv(file_path, sep=sep, encoding=encoding)


def load_articles_csv(file_path: str, *, sep: str = ';', encoding: str = 'utf-8') -> pd.DataFrame:
    """Load the articles table from a delimited text file.

    Args:
        file_path: Location of the file; passed straight to ``pandas.read_csv``.
        sep: Field delimiter. Defaults to ``';'``, the value this loader
            has always used.
        encoding: Text encoding of the file. Defaults to ``'utf-8'``, the
            value this loader has always used.

    Returns:
        The parsed table as a DataFrame.
    """
    return pd.read_csv(file_path, sep=sep, encoding=encoding)


def get_state_name(uf: str) -> str:
    """Translate a two-letter UF abbreviation into its lowercase state name.

    Surrounding whitespace and letter case of ``uf`` are ignored. An
    abbreviation that is not in the table yields an empty string.
    """
    names_by_code = {
        "ac": "acre",
        "al": "alagoas",
        "am": "amazonas",
        "ap": "amapá",
        "ba": "bahia",
        "ce": "ceará",
        "df": "distrito federal",
        "es": "espírito santo",
        "go": "goiás",
        "ma": "maranhão",
        "mg": "minas gerais",
        "ms": "mato grosso do sul",
        "mt": "mato grosso",
        "pa": "pará",
        "pb": "paraíba",
        "pe": "pernambuco",
        "pi": "piauí",
        "pr": "paraná",
        "rj": "rio de janeiro",
        "rn": "rio grande do norte",
        "ro": "rondônia",
        "rr": "roraima",
        "rs": "rio grande do sul",
        "sc": "santa catarina",
        "se": "sergipe",
        "sp": "são paulo",
        "to": "tocantins",
    }
    code = uf.strip().lower()
    if code not in names_by_code:
        return ""
    return names_by_code[code]


def clean_text(text: str) -> str:
    """Return ``text`` lowercased, trimmed, and with whitespace runs collapsed."""
    # Any run of whitespace (spaces, tabs, newlines) becomes one space.
    single_spaced = re.sub(r'\s+', ' ', text)
    trimmed = single_spaced.strip()
    return trimmed.lower()


def remove_diacritics(text: str) -> str:
    """Strip accents and other combining marks from ``text``.

    The text is first decomposed (NFD) so that each accented letter becomes a
    base letter followed by combining marks; the marks are then dropped.
    """
    decomposed = unicodedata.normalize('NFD', text)
    base_chars = []
    for char in decomposed:
        # combining() returns 0 for characters that are not combining marks.
        if unicodedata.combining(char) == 0:
            base_chars.append(char)
    return ''.join(base_chars)


def normalize_apostrophes(text: str) -> List[str]:
    """List the spellings of ``text`` under different apostrophe conventions.

    When ``text`` contains none of ``'``, backtick or acute accent, the result
    is ``[text]``. Otherwise five variants are returned, in this order: every
    mark replaced by ``'``, by a backtick, by an acute accent, by a space
    (with whitespace then collapsed and trimmed), and removed altogether.
    """
    if all(mark not in text for mark in "'`´"):
        return [text]

    marks = re.compile(r'[\'`´]')

    straight = marks.sub("'", text)
    backtick = marks.sub("`", text)
    acute = marks.sub("´", text)
    # Only the space variant can create double spaces, so only it is tidied.
    spaced = re.sub(r'\s+', ' ', marks.sub(" ", text)).strip()
    removed = marks.sub("", text)

    return [straight, backtick, acute, spaced, removed]


def normalize_hyphens(text: str) -> Tuple[str, str]:
    """Return ``(hyphenated, spaced)`` spellings of ``text``.

    The first element has every whitespace run replaced by a single hyphen.
    The second has every hyphen replaced by a space, whitespace collapsed and
    the ends trimmed.
    """
    with_hyphens = re.sub(r'\s+', '-', text)
    with_spaces = re.sub(r'\s+', ' ', re.sub(r'-', ' ', text))
    return with_hyphens, with_spaces.strip()


def split_communities(text: str) -> List[str]:
    """Break a community field into the individual names it lists.

    The text is normalised with ``clean_text`` and split on a standalone
    "e" and on any of ``( ) , : /``; empty pieces are discarded. When the
    text has exactly one standalone "e", no comma, and splits into exactly
    two pieces, the full normalised text is appended as a third entry.
    """
    normalized = clean_text(text)
    delimiter = re.compile(r'\b[eE]\b|[(),:/]')

    stripped_pieces = (piece.strip() for piece in delimiter.split(normalized))
    pieces = [piece for piece in stripped_pieces if piece]

    standalone_e_total = len(re.findall(r'\b[eE]\b', normalized))

    # "A e B" may be two communities or one community whose name contains
    # "e", so both readings are kept.
    is_plain_pair = (standalone_e_total == 1
                     and "," not in normalized
                     and len(pieces) == 2)
    if is_plain_pair:
        return pieces + [normalized]

    return pieces


def split_municipalities(text: str) -> List[str]:
    """Split a municipality field on ``|`` or ``/`` and trim each piece.

    Empty pieces are kept, so the result always has at least one element.
    """
    pieces = re.split(r'[|/]', text)
    return list(map(str.strip, pieces))


def clean_article_text(text: str) -> str:
    """Normalise one article field for searching.

    Every character that is neither a word character nor whitespace is
    replaced by a space, the text is lowercased, whitespace runs are collapsed
    and the ends trimmed. Non-string values are converted with ``str`` first.

    Missing values (``None`` or a float NaN) become an empty string instead of
    the literal words "none" / "nan", which would otherwise end up in the
    searchable text.

    Args:
        text: The raw field value.

    Returns:
        The cleaned, lowercase text; ``""`` for a missing value.
    """
    # NaN is the only float that is not equal to itself, so this detects it
    # without needing an extra import.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    cleaned_text = re.sub(r'[^\w\s]', ' ', str(text)).lower()
    return re.sub(r'\s+', ' ', cleaned_text).strip()


# =============================================================================
# DATA STRUCTURES
# =============================================================================

class Community:
    """One quilombola community entry: a name paired with one municipality."""

    def __init__(self, id: int, name: str, municipality: str, uf: str, state: str, region: str,
                 original_municipality_str: str = None):
        """Store the given values unchanged as attributes of the same name."""
        # What the entry is.
        self.id, self.name = id, name
        # Where it is, from the narrowest to the broadest area.
        self.municipality, self.uf = municipality, uf
        self.state, self.region = state, region
        # The municipality field as it was before being split, if supplied.
        self.original_municipality_str = original_municipality_str

    def __str__(self) -> str:
        """Render as ``Name, Municipality (UF)`` with title and upper casing."""
        return f"{self.name.title()}, {self.municipality.title()} ({self.uf.upper()})"


class Article:
    """Represents an academic article with search capabilities.

    Field values are stored exactly as passed in. The concatenated text, its
    diacritic-free form and the two term indexes are built on first use and
    cached on the instance.
    """

    def __init__(self, id: int, title: str, title_alt1: str, title_alt2: str,
                 keywords: str, keywords_alt: str, abstract: str, abstract_alt1: str, abstract_alt2: str,
                 institution: str):
        self.id = id
        self.title = title
        self.title_alt1 = title_alt1
        self.title_alt2 = title_alt2
        self.keywords = keywords
        self.keywords_alt = keywords_alt
        self.abstract = abstract
        self.abstract_alt1 = abstract_alt1
        self.abstract_alt2 = abstract_alt2
        self.institution = institution
        # Lazily populated caches.
        self._full_text = None
        self._normalized_full_text = None
        self._terms_dict = None
        self._normalized_terms_dict = None

    def get_full_text(self) -> str:
        """Return titles, keywords and abstracts joined by single spaces.

        The institution is not part of the searchable text.
        """
        if self._full_text is None:
            self._full_text = " ".join([
                self.title, self.title_alt1, self.title_alt2,
                self.keywords, self.keywords_alt, self.abstract,
                self.abstract_alt1, self.abstract_alt2
            ])
        return self._full_text

    def get_normalized_full_text(self) -> str:
        """Return the full text with diacritics removed."""
        if self._normalized_full_text is None:
            self._normalized_full_text = remove_diacritics(self.get_full_text())
        return self._normalized_full_text

    def _build_terms_dict(self, text: str) -> Dict[str, bool]:
        """Index ``text`` for exact-key lookup.

        Keys are every lowercase word, plus every run of word characters,
        whitespace and hyphens that contains a space or a hyphen, together
        with that run's all-hyphen and all-space spellings.
        """
        terms_dict = defaultdict(bool)
        lowered = text.lower()

        # Single words
        for word in re.findall(r'\b\w+\b', lowered):
            terms_dict[word] = True

        # Multi-word phrases. The character class below cannot capture an
        # apostrophe, so a phrase never needs apostrophe variants here; those
        # are generated on the query side instead.
        for phrase in re.findall(r'\b[\w\s-]+\b', lowered):
            phrase = phrase.strip()
            if ' ' in phrase or '-' in phrase:
                terms_dict[phrase] = True

                hyphenated, non_hyphenated = normalize_hyphens(phrase)
                terms_dict[hyphenated] = True
                terms_dict[non_hyphenated] = True

        return terms_dict

    def get_terms_dict(self) -> Dict[str, bool]:
        """Get terms dictionary for normal text."""
        if self._terms_dict is None:
            self._terms_dict = self._build_terms_dict(self.get_full_text())
        return self._terms_dict

    def get_normalized_terms_dict(self) -> Dict[str, bool]:
        """Get terms dictionary for normalized text."""
        if self._normalized_terms_dict is None:
            self._normalized_terms_dict = self._build_terms_dict(self.get_normalized_full_text())
        return self._normalized_terms_dict

    @staticmethod
    def _spelling_variants(term: str) -> List[str]:
        """List the non-empty hyphen/space and apostrophe spellings of ``term``.

        The term itself comes first; duplicates are dropped.
        """
        # normalize_apostrophes returns [term] when there is no apostrophe.
        apostrophe_forms = dict.fromkeys([term, *normalize_apostrophes(term)])

        variants: List[str] = []
        for form in apostrophe_forms:
            for candidate in (form, *normalize_hyphens(form)):
                # An empty candidate would be a substring of every text.
                if candidate and candidate not in variants:
                    variants.append(candidate)
        return variants

    def _find_term(self, term: str, terms_dict: Dict[str, bool], full_text: str) -> bool:
        """Shared lookup used by both public ``has_term*`` methods.

        A term without spaces, hyphens or apostrophes must be an exact key of
        ``terms_dict``. Any other term also matches when one of its spelling
        variants is a key or occurs as a substring of ``full_text``.
        """
        if terms_dict.get(term, False):
            return True

        is_complex = any(char in term for char in "- '`´")
        if not is_complex:
            return False

        variants = self._spelling_variants(term)
        if any(terms_dict.get(variant, False) for variant in variants):
            return True

        # Fallback to substring search. Apostrophe variants are included so
        # that a name like "olho d'água" can match text whose punctuation
        # was replaced by spaces ("olho d água") or dropped ("olho dágua").
        return any(variant in full_text for variant in variants)

    def has_term(self, term: str) -> bool:
        """Check if ``term`` exists in the article, respecting diacritics."""
        return self._find_term(term, self.get_terms_dict(), self.get_full_text())

    def has_term_diacritical_insensitive(self, term: str) -> bool:
        """Check if ``term`` exists in the article, ignoring diacritics."""
        return self._find_term(
            remove_diacritics(term),
            self.get_normalized_terms_dict(),
            self.get_normalized_full_text(),
        )


class RegionalMatch:
    """A mention of quilombos located by state sub-region rather than by name."""

    def __init__(self, internal_region: str, state_name: str, uf: str, country_region: str,
                 community_term: str, all_regions_term: str = None):
        """Store the given values unchanged as attributes of the same name."""
        # Where: sub-region inside the state, the state, and its macro-region.
        self.internal_region, self.state_name = internal_region, state_name
        self.uf, self.country_region = uf, country_region
        # How it was phrased in the text.
        self.community_term = community_term
        self.all_regions_term = all_regions_term

    def get_community_description(self) -> str:
        """Build the upper-case community label used in the CSV output."""
        if self.internal_region != "todas as regiões":
            return f"COMUNIDADES DE {self.internal_region.upper()} DE {self.state_name.upper()}"
        return f"COMUNIDADES DE TODAS AS REGIÕES DE {self.state_name.upper()}"

    def get_report_text(self, article: Article) -> str:
        """Build the multi-line report sentence for this match in ``article``.

        Only the opening clause depends on whether the match covers the whole
        state; the rest of the sentence is shared. The article is cited as
        ``article.id + 2``.
        """
        if self.internal_region == "todas as regiões":
            opening = f"Quilombos de todas as regiões de {self.state_name.title()}, "
        else:
            opening = f"Quilombos do {self.internal_region} de {self.state_name.title()}, "

        return (opening +
                f"na região {self.country_region.title()},\n"
                f"são mencionados no artigo de ID {article.id + 2} da tabela,\n"
                f"estudados pela instituição {article.institution.title()}.\n")


# =============================================================================
# CORE MATCHING LOGIC
# =============================================================================

class QuilombolaAnalyzer:
    """Main analyzer class with all matching logic consolidated.

    Holds the lookup tables used to match community names and regional
    mentions against article text, plus the loading, matching and reporting
    methods that use them.
    """

    # Community names that trigger the stricter check in
    # _check_ambiguous_match instead of _check_regular_match.
    AMBIGUOUS_NAMES = {
        "frança", "ovo", "um", "kalunga", "base", "peixes", "forte",
        "alto", "piauí", "floresta", "américa", "brasileira",
        "araçá", "jatobá", "jurema", "aroeira", "pilões", "piloes",
        "matá", "mocambo", "solidão", "crioulo", "corte", "palmeiras"
    }

    # Words looked for in the (up to four) words that precede an occurrence of
    # an ambiguous name; finding one counts that occurrence as a mention.
    DISAMBIGUATORS = {
        "comunidade", "comunidade quilombola", "comunidade remanescente",
        "quilombo", "comunidade kilombola", "kilombo", "crq", "crqs",
        "território quilombola", "território kilombola", "povoado"
    }

    # Community names that are never matched (see _check_community_match).
    # NOTE(review): names produced by split_communities are stripped, so the
    # entry with a trailing space looks unreachable for them; confirm.
    EXCLUDED_TERMS = {"quilombo", "quilombolas", "quilombo "}
    # State whose mention is checked with the accent-sensitive has_term().
    # Presumably because accent-stripped "para" is a common Portuguese
    # word; confirm with the authors.
    PARA_STATE = "pará"

    # Regional matcher constants. The five collections below are joined into
    # regex alternations by _compile_regional_patterns.
    # Plural nouns that open a regional mention.
    COMMUNITY_TERMS = {
        "quilombos", "comunidades", "quilombolas", "territórios", "povoados",
        "assentamentos", "kilombos", "kilombolas"
    }

    # Linking words allowed between the parts of a regional mention.
    PREPOSITIONS = {"de", "da", "do", "em", "na", "no", "dos", "das"}

    # Words naming a sub-area of a state.
    INTERNAL_REGIONS = {
        "norte", "sul", "leste", "oeste", "nordeste", "sudeste", "sudoeste",
        "noroeste", "centro", "interior", "sertão", "litoral", "serra",
        "vale", "região", "regiões", "recôncavo"
    }

    # Phrases matched in place of a single sub-area; find_regional_matches
    # records all of them as "todas as regiões".
    ALL_REGIONS_TERMS = {
        "todas as regiões", "toda a região", "todas regiões", "toda região",
        "diversas regiões", "várias regiões", "múltiplas regiões"
    }

    # Lowercase state names (same values as the keys of STATE_TO_UF).
    STATE_NAMES = {
        "acre", "alagoas", "amazonas", "amapá", "bahia", "ceará",
        "distrito federal", "espírito santo", "goiás", "maranhão",
        "minas gerais", "mato grosso do sul", "mato grosso", "pará",
        "paraíba", "pernambuco", "piauí", "paraná", "rio de janeiro",
        "rio grande do norte", "rondônia", "roraima", "rio grande do sul",
        "santa catarina", "sergipe", "são paulo", "tocantins"
    }

    # Demonym -> (state name, UF code).
    # NOTE(review): clean_article_text replaces punctuation with spaces, so
    # the hyphenated keys cannot occur in text cleaned that way; confirm
    # whether space-separated spellings should be added.
    GENTILIC_TO_STATE = {
        "acreano": ("acre", "ac"), "acriano": ("acre", "ac"),
        "alagoano": ("alagoas", "al"), "amazonense": ("amazonas", "am"),
        "amapaense": ("amapá", "ap"), "baiano": ("bahia", "ba"),
        "baiense": ("bahia", "ba"), "cearense": ("ceará", "ce"),
        "brasiliense": ("distrito federal", "df"), "capixaba": ("espírito santo", "es"),
        "espírito santense": ("espírito santo", "es"), "goiano": ("goiás", "go"),
        "maranhense": ("maranhão", "ma"), "mineiro": ("minas gerais", "mg"),
        "sul-mato-grossense": ("mato grosso do sul", "ms"),
        "mato-grossense": ("mato grosso", "mt"), "paraense": ("pará", "pa"),
        "parauara": ("pará", "pa"), "paraoara": ("pará", "pa"),
        "paraibano": ("paraíba", "pb"), "pernambucano": ("pernambuco", "pe"),
        "piauiense": ("piauí", "pi"), "paranaense": ("paraná", "pr"),
        "fluminense": ("rio de janeiro", "rj"), "potiguar": ("rio grande do norte", "rn"),
        "norte-rio-grandense": ("rio grande do norte", "rn"),
        "rio-grandense-do-norte": ("rio grande do norte", "rn"),
        "rondoniense": ("rondônia", "ro"), "rondoniano": ("rondônia", "ro"),
        "roraimense": ("roraima", "rr"), "gaúcho": ("rio grande do sul", "rs"),
        "sul-rio-grandense": ("rio grande do sul", "rs"),
        "rio-grandense-do-sul": ("rio grande do sul", "rs"),
        "catarinense": ("santa catarina", "sc"), "barriga verde": ("santa catarina", "sc"),
        "sergipano": ("sergipe", "se"), "paulista": ("são paulo", "sp"),
        "tocantinense": ("tocantins", "to")
    }

    # State name -> UF code.
    STATE_TO_UF = {
        "acre": "ac", "alagoas": "al", "amazonas": "am", "amapá": "ap",
        "bahia": "ba", "ceará": "ce", "distrito federal": "df", "espírito santo": "es",
        "goiás": "go", "maranhão": "ma", "minas gerais": "mg", "mato grosso do sul": "ms",
        "mato grosso": "mt", "pará": "pa", "paraíba": "pb", "pernambuco": "pe",
        "piauí": "pi", "paraná": "pr", "rio de janeiro": "rj", "rio grande do norte": "rn",
        "rondônia": "ro", "roraima": "rr", "rio grande do sul": "rs", "santa catarina": "sc",
        "sergipe": "se", "são paulo": "sp", "tocantins": "to"
    }

    # UF code -> macro-region of the country.
    UF_TO_REGION = {
        "ac": "norte", "al": "nordeste", "am": "norte", "ap": "norte",
        "ba": "nordeste", "ce": "nordeste", "df": "centro-oeste", "es": "sudeste",
        "go": "centro-oeste", "ma": "nordeste", "mg": "sudeste", "ms": "centro-oeste",
        "mt": "centro-oeste", "pa": "norte", "pb": "nordeste", "pe": "nordeste",
        "pi": "nordeste", "pr": "sul", "rj": "sudeste", "rn": "nordeste",
        "ro": "norte", "rr": "norte", "rs": "sul", "sc": "sul",
        "se": "nordeste", "sp": "sudeste", "to": "norte"
    }

    def __init__(self, communities_file: str, articles_file: str,
                 output_txt_file: str = "resultados.txt",
                 output_csv_file: str = "resultados_detalhados.csv",
                 articles_copy_file: str = "artigos_final.csv"):
        """Record file locations, prepare result buffers and load both tables.

        Each input file is read twice: once to build the ``Community`` /
        ``Article`` objects and once more to keep a DataFrame of the file as
        loaded.
        """
        # Where to read from and where to write to.
        self.communities_file, self.articles_file = communities_file, articles_file
        self.output_txt_file = output_txt_file
        self.output_csv_file = output_csv_file
        self.articles_copy_file = articles_copy_file

        # Result buffers, filled in by later processing.
        self.article_community_matches = []  # (article_id, community_id, matched_community_name)
        self.regional_matches = []  # (article_id, RegionalMatch)
        self.csv_results = []  # (community, article) pairs for CSV export

        # The regional regexes depend only on class constants.
        self._compile_regional_patterns()

        # Parsed objects first, then the DataFrames.
        self.communities = self._load_communities()
        self.articles = self._load_articles()
        self.communities_df = load_communities_csv(communities_file)
        self.articles_df = load_articles_csv(articles_file)

    def _load_communities(self) -> List[Community]:
        """Read the communities file and expand it into ``Community`` objects.

        Columns are addressed by position: 1 = region, 2 = UF, 3 = municipality
        field, 5 = community-name field. A row yields one object for every
        (name, municipality) combination obtained by splitting those two
        fields; all of them share the row's index label as ``id``.
        """
        table = load_communities_csv(self.communities_file)

        table['uf_clean'] = table.iloc[:, 2].apply(lambda x: x.strip().lower())
        table['state'] = table['uf_clean'].apply(get_state_name)

        loaded = []
        for row_label, record in table.iterrows():
            names = split_communities(clean_text(record.iloc[5]))
            raw_municipalities = record.iloc[3].strip().lower()
            municipalities = split_municipalities(raw_municipalities)

            uf, state = record['uf_clean'], record['state']
            region = record.iloc[1].strip().lower()

            loaded.extend(
                Community(
                    id=row_label, name=name, municipality=municipality,
                    uf=uf, state=state, region=region,
                    original_municipality_str=raw_municipalities
                )
                for name in names
                for municipality in municipalities
            )

        return loaded

    def _load_articles(self) -> List[Article]:
        """Read the articles file and build one ``Article`` per row.

        Each ``Article`` field is taken from a fixed column position, cleaned
        with ``clean_article_text``; the row's index label becomes the id.
        """
        table = load_articles_csv(self.articles_file)

        # Article field -> positional column in the file.
        column_of_field = {
            'title': 21,
            'title_alt1': 22,
            'title_alt2': 23,
            'keywords': 28,
            'keywords_alt': 29,
            'abstract': 32,
            'abstract_alt1': 33,
            'abstract_alt2': 34,
            'institution': 17,
        }

        # Add one cleaned copy of each source column.
        for position in column_of_field.values():
            table[f'clean_col_{position}'] = table.iloc[:, position].apply(clean_article_text)

        return [
            Article(
                id=row_label,
                **{field: record[f'clean_col_{position}']
                   for field, position in column_of_field.items()}
            )
            for row_label, record in table.iterrows()
        ]

    def _compile_regional_patterns(self):
        """Compile regex patterns for regional matching."""
        community_terms = "|".join(self.COMMUNITY_TERMS)
        prepositions = "|".join(self.PREPOSITIONS)
        internal_regions = "|".join(self.INTERNAL_REGIONS)
        state_names = "|".join(self.STATE_NAMES)
        gentilics = "|".join(self.GENTILIC_TO_STATE.keys())
        all_regions_terms = "|".join(self.ALL_REGIONS_TERMS)

        adjective_pattern = r'(?:\s+(?!(?:' + prepositions + r'|' + internal_regions + r'|' + state_names + r')\b)\w+){0,2}'

        # Four regional patterns
        self.regional_patterns = [
            re.compile(rf'\b({community_terms}){adjective_pattern}\s+({prepositions})\s+({internal_regions})\s+({prepositions})\s+({state_names})\b', re.IGNORECASE),
            re.compile(rf'\b({community_terms}){adjective_pattern}\s+({prepositions})\s+({internal_regions})\s+({gentilics})\b', re.IGNORECASE),
            re.compile(rf'\b({community_terms}){adjective_pattern}\s+({prepositions})\s+({all_regions_terms})\s+({prepositions})\s+({state_names})\b', re.IGNORECASE),
            re.compile(rf'\b({community_terms}){adjective_pattern}\s+({prepositions})\s+({all_regions_terms})\s+({gentilics})\b', re.IGNORECASE)
        ]

    # =============================================================================
    # HIERARCHY MANAGEMENT
    # =============================================================================

    def find_hierarchy_for_community(self, target_community: Community) -> List[Community]:
        """Find hierarchical relationships for a community."""
        # Step 1: Same municipality
        municipality_hierarchy = self._find_hierarchy_in_scope(
            target_community,
            lambda c: (c.municipality.lower() == target_community.municipality.lower() and
                      c.uf.lower() == target_community.uf.lower())
        )

        if len(municipality_hierarchy) > 1:
            return sorted(municipality_hierarchy, key=lambda c: len(c.name))

        # Step 2: Same state
        state_hierarchy = self._find_hierarchy_in_scope(
            target_community,
            lambda c: c.uf.lower() == target_community.uf.lower()
        )

        if len(state_hierarchy) > 1:
            return sorted(state_hierarchy, key=lambda c: len(c.name))

        return [target_community]

    def _find_hierarchy_in_scope(self, target_community: Community, scope_filter) -> List[Community]:
        """Find hierarchy within a specific scope."""
        scope_communities = [c for c in self.communities if scope_filter(c)]
        hierarchical_communities = []

        for community in scope_communities:
            if self._has_word_boundary_relationship(target_community, community):
                hierarchical_communities.append(community)

        return hierarchical_communities

    def _has_word_boundary_relationship(self, target_community: Community, candidate_community: Community) -> bool:
        """Check if two communities have a hierarchical relationship."""
        if (target_community.id == candidate_community.id and
            target_community.name == candidate_community.name):
            return True

        target_variants = self._get_normalized_variants(target_community.name)
        candidate_variants = self._get_normalized_variants(candidate_community.name)

        for target_variant in target_variants:
            for candidate_variant in candidate_variants:
                if (self._is_word_boundary_match(target_variant, candidate_variant) or
                    self._is_word_boundary_match(candidate_variant, target_variant)):
                    return True

        return False

    def _get_normalized_variants(self, community_name: str) -> Set[str]:
        """Collect the distinct non-empty spellings of a community name.

        The set holds the cleaned name, its diacritic-free form, and the
        apostrophe variants of both.
        """
        base = clean_text(community_name)
        plain = remove_diacritics(base)

        forms = {base, plain}
        forms |= set(normalize_apostrophes(base))
        forms |= set(normalize_apostrophes(plain))

        # An empty spelling carries no information.
        return forms - {''}

    def _is_word_boundary_match(self, shorter_name: str, longer_name: str) -> bool:
        """Check if shorter name appears as complete words in longer name."""
        if shorter_name == longer_name or len(shorter_name) >= len(longer_name):
            return False

        escaped_shorter = re.escape(shorter_name)
        pattern = r'\b' + escaped_shorter + r'\b'
        return bool(re.search(pattern, longer_name, re.IGNORECASE))

    # =============================================================================
    # COMMUNITY MATCHING
    # =============================================================================

    def find_best_match_in_hierarchy(self, hierarchy: List[Community], article: Article) -> Optional[Community]:
        """Find the best (longest) matching community in a hierarchy."""
        # Sort by name length (longest first) for priority
        sorted_hierarchy = sorted(hierarchy, key=lambda c: len(c.name), reverse=True)

        for community in sorted_hierarchy:
            if self._check_community_match(community, article):
                return community

        return None

    def _check_community_match(self, community: Community, article: Article) -> bool:
        """Check if a community matches an article."""
        if community.name in self.EXCLUDED_TERMS:
            return False

        # Check community name (required)
        if not article.has_term_diacritical_insensitive(community.name):
            return False

        # Apply matching logic based on ambiguity
        if community.name in self.AMBIGUOUS_NAMES:
            return self._check_ambiguous_match(community, article)
        else:
            return self._check_regular_match(community, article)

    def _check_regular_match(self, community: Community, article: Article) -> bool:
        """Check regular community matching."""
        # Check municipality first (preferred)
        if article.has_term(community.municipality):
            return True

        # Check state as fallback
        if community.state.lower() == self.PARA_STATE:
            return article.has_term(community.state)
        else:
            return article.has_term_diacritical_insensitive(community.state)

    def _check_ambiguous_match(self, community: Community, article: Article) -> bool:
        """Check ambiguous community matching (requires disambiguators)."""
        text = article.get_full_text()
        mentions = 0

        # Find community name mentions with disambiguators
        community_pattern = r'\b' + re.escape(community.name) + r'\b'
        occurrences = [m.start() for m in re.finditer(community_pattern, text, re.IGNORECASE)]

        if occurrences:
            for pos in occurrences:
                text_before = text[:pos].strip()
                words_before = text_before.split()[-4:]
                context = " ".join(words_before)

                for disamb in self.DISAMBIGUATORS:
                    if re.search(r'\b' + re.escape(disamb) + r'\b', context):
                        mentions += 1
                        break

        # Check municipality mention
        if article.has_term(community.municipality):
            mentions += 1
        # Check state mention if municipality not mentioned
        elif community.state.lower() == self.PARA_STATE:
            if article.has_term(community.state):
                mentions += 1
        elif article.has_term_diacritical_insensitive(community.state):
            mentions += 1

        return mentions > 1

    # =============================================================================
    # REGIONAL MATCHING
    # =============================================================================

    def find_regional_matches(self, article: Article) -> List[RegionalMatch]:
        """Find regional quilombo mentions in an article."""
        matches = []
        text = article.get_full_text()
        seen_combinations = set()

        # Pattern 1: community + prep + region + prep + state
        for match in self.regional_patterns[0].finditer(text):
            community_term = match.group(1).lower()
            if not community_term.endswith('s'):
                continue

            internal_region = match.group(3).lower()
            state_name = match.group(5).lower()

            uf = self.STATE_TO_UF.get(state_name)
            if uf:
                country_region = self.UF_TO_REGION.get(uf)
                unique_key = (internal_region, state_name, uf, country_region)

                if unique_key not in seen_combinations:
                    seen_combinations.add(unique_key)
                    matches.append(RegionalMatch(
                        internal_region=internal_region,
                        state_name=state_name,
                        uf=uf,
                        country_region=country_region,
                        community_term=community_term
                    ))

        # Pattern 2: community + prep + region + gentilic
        for match in self.regional_patterns[1].finditer(text):
            community_term = match.group(1).lower()
            if not community_term.endswith('s'):
                continue

            internal_region = match.group(3).lower()
            gentilic = match.group(4).lower()

            state_info = self.GENTILIC_TO_STATE.get(gentilic)
            if state_info:
                state_name, uf = state_info
                country_region = self.UF_TO_REGION.get(uf)
                unique_key = (internal_region, state_name, uf, country_region)

                if unique_key not in seen_combinations:
                    seen_combinations.add(unique_key)
                    matches.append(RegionalMatch(
                        internal_region=internal_region,
                        state_name=state_name,
                        uf=uf,
                        country_region=country_region,
                        community_term=community_term
                    ))

        # Pattern 3: community + prep + "todas as regiões" + prep + state
        for match in self.regional_patterns[2].finditer(text):
            community_term = match.group(1).lower()
            if not community_term.endswith('s'):
                continue

            all_regions_term = match.group(3).lower()
            state_name = match.group(5).lower()

            uf = self.STATE_TO_UF.get(state_name)
            if uf:
                country_region = self.UF_TO_REGION.get(uf)
                internal_region = "todas as regiões"
                unique_key = (internal_region, state_name, uf, country_region)

                if unique_key not in seen_combinations:
                    seen_combinations.add(unique_key)
                    matches.append(RegionalMatch(
                        internal_region=internal_region,
                        state_name=state_name,
                        uf=uf,
                        country_region=country_region,
                        community_term=community_term,
                        all_regions_term=all_regions_term
                    ))

        # Pattern 4: community + prep + "todas as regiões" + gentilic
        for match in self.regional_patterns[3].finditer(text):
            community_term = match.group(1).lower()
            if not community_term.endswith('s'):
                continue

            all_regions_term = match.group(3).lower()
            gentilic = match.group(4).lower()

            state_info = self.GENTILIC_TO_STATE.get(gentilic)
            if state_info:
                state_name, uf = state_info
                country_region = self.UF_TO_REGION.get(uf)
                internal_region = "todas as regiões"
                unique_key = (internal_region, state_name, uf, country_region)

                if unique_key not in seen_combinations:
                    seen_combinations.add(unique_key)
                    matches.append(RegionalMatch(
                        internal_region=internal_region,
                        state_name=state_name,
                        uf=uf,
                        country_region=country_region,
                        community_term=community_term,
                        all_regions_term=all_regions_term
                    ))

        return matches

    # =============================================================================
    # REPORTING
    # =============================================================================

    def generate_community_report(self, community: Community, article: Article) -> str:
        """Build the text paragraph announcing that *article* mentions *community*.

        The location part of the sentence depends on whether the article
        mentions the community's own municipality:

        * If it does, or if the community has no "|"-separated original
          municipality string, the single municipality is shown as
          ``Municipality (UF)``.
        * Otherwise every municipality obtained from
          ``split_municipalities`` is listed, each tagged with the UF, and the
          last one is introduced by ``e/ou``.

        The sentence starts with "Nome ambíguo" when the community name is in
        ``self.AMBIGUOUS_NAMES`` and with "Comunidade" otherwise.

        Returns:
            The report paragraph, ending with a newline.
        """
        if (article.has_term(community.municipality)
                or not community.original_municipality_str
                or "|" not in community.original_municipality_str):
            # Regular case: one municipality, tagged with its UF.
            place = f"{community.municipality.title()} ({community.uf.upper()})"
        else:
            # Grouped case: the municipality itself was not found in the article,
            # so every municipality of the group is listed as a candidate.
            names = split_municipalities(community.original_municipality_str)
            tagged = [f"{name.title()} ({community.uf.upper()})" for name in names]
            if len(tagged) == 1:
                place = tagged[0]
            else:
                place = ", ".join(tagged[:-1]) + f", e/ou {tagged[-1]}"

        label = "Nome ambíguo" if community.name in self.AMBIGUOUS_NAMES else "Comunidade"

        # NOTE(review): the "+ 2" presumably turns a 0-based article index into
        # the spreadsheet row number (header row + 1-based rows) — confirm.
        return (f"{label} {community.name.title()}, do município {place}, "
                f"da região {community.region.upper()},\n"
                f"é mencionada no artigo de ID {article.id + 2} da tabela,\n"
                f"estudada pela instituição {article.institution.title()}.\n")

    # =============================================================================
    # EXPORT FUNCTIONS
    # =============================================================================

    def export_csv(self):
        """Write every (community, article) pair in ``self.csv_results`` to CSV.

        The file at ``self.output_csv_file`` is overwritten, encoded as UTF-8
        and delimited with ';'. The first row is the header; each following row
        holds the community name, municipality, UF and region followed by the
        article titles, keywords, abstracts and institution. The AUTORES and
        REVISTA columns are always written empty.

        Returns:
            The number of data rows written (``len(self.csv_results)``).
        """
        column_titles = (
            "COMUNIDADE", "MUNICÍPIO", "UF", "REGIÃO", "AUTORES",
            "TÍTULO DO ARTIGO (IDIOMA 1)", "TÍTULO DO ARTIGO (IDIOMA 2)",
            "TÍTULO DO ARTIGO (IDIOMA 3)", "REVISTA", "PALAVRAS-CHAVE (IDIOMA 1)",
            "PALAVRAS-CHAVE (IDIOMA 2)", "RESUMO (IDIOMA 1)", "RESUMO (IDIOMA 2)",
            "RESUMO (IDIOMA 3)", "INSTITUIÇÃO",
        )
        blank = ""  # placeholder for the columns this export does not fill

        with open(self.output_csv_file, 'w', newline='', encoding='utf-8') as handle:
            sheet = csv.writer(handle, delimiter=';')
            sheet.writerow(column_titles)
            # Rows are produced lazily, one per recorded match, in list order.
            sheet.writerows(
                (
                    comm.name.title(),
                    comm.municipality.title(),
                    comm.uf.upper(),
                    comm.region.upper(),
                    blank,  # AUTORES
                    art.title,
                    art.title_alt1,
                    art.title_alt2,
                    blank,  # REVISTA
                    art.keywords,
                    art.keywords_alt,
                    art.abstract,
                    art.abstract_alt1,
                    art.abstract_alt2,
                    art.institution.title(),
                )
                for comm, art in self.csv_results
            )

        return len(self.csv_results)

    def create_enhanced_articles_copy(self):
        """Save a copy of the articles table expanded with match information.

        One new row is produced per entry of ``self.article_community_matches``
        and per entry of ``self.regional_matches``. Each new row starts as a
        copy of the article's row (the article id is used as a positional index
        into ``self.articles_df``):

        * community match: the first 17 cells (fewer if the community row is
          shorter) are overwritten with the community's cells, then cell 5 is
          set to the upper-cased matched community name;
        * regional match: cells 0, 3 and 4 are blanked, cell 1 receives the
          upper-cased country region, cell 2 the upper-cased UF and cell 5 the
          text from ``get_community_description()``.

        Articles that appear in neither list are kept unchanged. The output
        file (``self.articles_copy_file``, ';'-separated UTF-8) lists the
        untouched articles first, followed by the generated rows.

        Returns:
            Tuple ``(community match count, regional match count,
            untouched article count)``.
        """
        snapshot = self.articles_df.copy()
        generated_rows = []
        touched_positions = set()

        # Rows derived from specific-community matches.
        for art_pos, comm_pos, matched_name in self.article_community_matches:
            touched_positions.add(art_pos)
            row = self.articles_df.iloc[art_pos].copy()
            community_cells = self.communities_df.iloc[comm_pos]

            width = min(17, len(community_cells))
            for pos in range(width):
                row.iloc[pos] = community_cells.iloc[pos]

            # The matched spelling replaces whatever name the community row had.
            row.iloc[5] = matched_name.upper()
            generated_rows.append(row)

        # Rows derived from regional mentions.
        for art_pos, mention in self.regional_matches:
            touched_positions.add(art_pos)
            row = self.articles_df.iloc[art_pos].copy()

            leading_cells = (
                "",                                   # Empty cell
                mention.country_region.upper(),
                mention.uf.upper(),
                "",                                   # Municipality
                "",                                   # Other field
                mention.get_community_description(),
            )
            for pos, value in enumerate(leading_cells):
                row.iloc[pos] = value

            generated_rows.append(row)

        generated_df = pd.DataFrame(generated_rows, columns=snapshot.columns)

        # Articles that produced no row above are carried over untouched.
        untouched_positions = [
            pos for pos in range(len(snapshot)) if pos not in touched_positions
        ]
        untouched_df = snapshot.iloc[untouched_positions]

        combined = pd.concat([untouched_df, generated_df], ignore_index=True)
        combined.to_csv(self.articles_copy_file, sep=';', encoding='utf-8', index=False)

        return (
            len(self.article_community_matches),
            len(self.regional_matches),
            len(untouched_df),
        )

    def add_year_column(self):
        """Append an 'ANO DA PORTARIA' column with the last four characters of column 13.

        The file at ``self.articles_copy_file`` (';'-separated, UTF-8) is read,
        the new column is added (or overwritten if it already exists) and the
        file is written back in place. Values in column 13 (0-based position)
        shorter than four characters are copied whole; blank cells yield a
        blank year.

        Returns:
            The number of data rows in the file.

        Raises:
            IndexError: if the file has fewer than 14 columns.
        """
        # Read every cell as text and keep blanks as empty strings. With the
        # default type inference a blank cell becomes NaN, which used to be
        # written into the new column as the literal text "nan", and numeric
        # columns containing blanks were rewritten as floats (e.g. "7" -> "7.0")
        # by the read/write round trip.
        df = pd.read_csv(
            self.articles_copy_file,
            sep=';',
            encoding='utf-8',
            dtype=str,
            keep_default_na=False,
        )

        # Slicing keeps the whole value when it has fewer than four characters.
        df['ANO DA PORTARIA'] = df.iloc[:, 13].str[-4:]
        df.to_csv(self.articles_copy_file, sep=';', encoding='utf-8', index=False)

        return len(df)

    # =============================================================================
    # MAIN ANALYSIS
    # =============================================================================

    def analyze(self) -> int:
        """Run both matching rounds, write the text report and trigger all exports.

        Round one walks ``self.communities``; for each community not yet
        handled it obtains its hierarchy, marks every member as handled and
        tests the hierarchy against every article, recording the best match
        (if any) in ``self.article_community_matches`` and ``self.csv_results``.

        Round two runs ``find_regional_matches`` only on articles that produced
        no round-one match and records each result in ``self.regional_matches``.

        Everything printed during the two rounds and the summary is captured,
        written to ``self.output_txt_file`` and then echoed to the console.
        Afterwards the CSV export, the enhanced articles copy and the year
        column step are run, in that order.

        NOTE(review): the summary wording says "artigos", but both counters are
        incremented once per recorded match, so an article matched by several
        hierarchies is counted more than once.

        Returns:
            Number of round-one matches plus number of regional matches.
        """
        def identity(item):
            # Tuple identifying a community in the "already handled" set.
            return (item.id, item.name, item.municipality, item.uf)

        specific_total = 0
        captured = io.StringIO()

        with redirect_stdout(captured):
            handled_communities = set()
            handled_pairs = set()
            round_one_article_ids = set()

            print("=== PRIMEIRA RODADA: COMUNIDADES ESPECÍFICAS ===\n")

            for community in self.communities:
                if identity(community) in handled_communities:
                    continue

                hierarchy = self.find_hierarchy_for_community(community)

                # Every member is covered by this pass, so none of them should
                # start a hierarchy pass of its own later in the loop.
                for member in hierarchy:
                    handled_communities.add(identity(member))

                # Order-independent textual key for the whole hierarchy.
                ordered_members = sorted(hierarchy, key=lambda c: (c.id, c.name))
                hierarchy_key = "_".join(
                    [f"{c.id}_{c.name}_{c.municipality}_{c.uf}" for c in ordered_members]
                )

                for article in self.articles:
                    pair_key = (hierarchy_key, article.id)
                    if pair_key in handled_pairs:
                        continue

                    winner = self.find_best_match_in_hierarchy(hierarchy, article)
                    if not winner:
                        continue

                    print(self.generate_community_report(winner, article))

                    self.article_community_matches.append((article.id, winner.id, winner.name))
                    self.csv_results.append((winner, article))

                    handled_pairs.add(pair_key)
                    round_one_article_ids.add(article.id)
                    specific_total += 1

            print("=== SEGUNDA RODADA: MENÇÕES REGIONAIS ===\n")

            # Regional mentions are only looked for in articles without a
            # specific-community match.
            regional_total = 0
            for article in self.articles:
                if article.id in round_one_article_ids:
                    continue

                for mention in self.find_regional_matches(article):
                    print(mention.get_report_text(article))

                    self.regional_matches.append((article.id, mention))
                    regional_total += 1

            print("=== RESUMO DOS RESULTADOS ===\n")
            print(f"Foram identificados {specific_total} artigos que têm "
                  f"comunidades quilombolas específicas\n"
                  f"certificadas pela Fundação Cultural Palmares como objeto de estudo.")

            print(f"Foram identificados {regional_total} artigos adicionais que mencionam "
                  f"quilombos/comunidades de forma regional.")

            grand_total = specific_total + regional_total
            print(f"Total de artigos com menções relevantes: {grand_total}")

        report_text = captured.getvalue()

        # Persist the captured report, then echo it to the real stdout.
        with open(self.output_txt_file, 'w', encoding='utf-8') as report_file:
            report_file.write(report_text)
        print(report_text)

        exported_rows = self.export_csv()
        print(f"CSV com {exported_rows} registros detalhados exportado para: {self.output_csv_file}")

        specific_rows, regional_rows, untouched_rows = self.create_enhanced_articles_copy()
        print(f"Nova cópia dos artigos salva em: {self.articles_copy_file}")
        print(f"  - {specific_rows} linhas para artigos com comunidades específicas")
        print(f"  - {regional_rows} linhas para artigos com menções regionais")
        print(f"  - {untouched_rows} linhas para artigos sem comunidades")

        # The row count returned by this step is not used here.
        self.add_year_column()
        print(f"Processamento concluído. Últimos quatro caracteres da coluna 13 adicionados\n"
              f"como nova coluna 'ANO DA PORTARIA' em: {self.articles_copy_file}")

        return grand_total


# =============================================================================
# MAIN FUNCTION
# =============================================================================

def main():
    """Resolve the input/output file locations, run the analysis and print a summary.

    When ``google.colab`` can be imported, Google Drive is mounted and all
    files are read from / written to the project folder on the drive. If an
    ImportError is raised instead, the same file names are used relative to the
    current working directory.
    """
    try:
        from google.colab import drive
        drive.mount('/content/drive/')

        # Project folder on the mounted drive (Colab).
        base_dir = '/content/drive/My Drive/estudo_quilombos_organizado/'
    except ImportError:
        # Not running on Colab: plain file names in the working directory.
        base_dir = ''

    communities_file = base_dir + 'crqs_atual.csv'
    articles_file = base_dir + 'artigos_atual.csv'
    output_txt_file = base_dir + 'resultados.txt'
    output_csv_file = base_dir + 'resultados_detalhados.csv'
    articles_copy_file = base_dir + 'artigos_final.csv'

    analyzer = QuilombolaAnalyzer(
        communities_file=communities_file,
        articles_file=articles_file,
        output_txt_file=output_txt_file,
        output_csv_file=output_csv_file,
        articles_copy_file=articles_copy_file,
    )
    total_matches = analyzer.analyze()

    summary_lines = (
        f"\nAnálise concluída com {total_matches} correspondências encontradas.",
        "Resultados salvos em:",
        f"  - Texto: {output_txt_file}",
        f"  - CSV detalhado: {output_csv_file}",
        f"  - Artigos com comunidades: {articles_copy_file}",
    )
    for line in summary_lines:
        print(line)


# Run the full analysis only when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
=== PRIMEIRA RODADA: COMUNIDADES ESPECÍFICAS ===

Comunidade Rincão Do Couro, do município Piratini (RS), da região SUL,
é mencionada no artigo de ID 303 da tabela,
estudada pela instituição Universidade Federal Do Rio Grande.

Comunidade Contente, do município Paulistana (PI), da região NORDESTE,
é mencionada no artigo de ID 79 da tabela,
estudada pela instituição Universidade Federal Do Piauí Universidade De Iowa.

Comunidade Barro Vermelho, do município Paulistana (PI), da região NORDESTE,
é mencionada no artigo de ID 79 da tabela,
estudada pela instituição Universidade Federal Do Piauí Universidade De Iowa.

Comunidade Almeidas, do município Silvânia (GO), da região CENTRO-OESTE,
é mencionada no artigo de ID 434 da tabela,
estudada pela instituição Universidade Estadual De Goiás.

Nome ambíguo Kalunga, do município Cavalcante (GO), Monte Alegre De Goiás