In [25]:
import re
import os

# Pronouns grouped by grammatical number. The dictionary keys ("singular",
# "plural") are the type labels attached to pronouns found in text.
expanded_pronouns = {
    "singular": ["মই", "তেওঁ", "সি", "তিনিও", "তাই", "তেওঁক", "তেওঁৰ", "তেওঁলৈ", "তেওঁত", "আপুনি", "আপোনাৰ", "আপুনিও"],
    "plural": ["তেওঁলোক", "তেওঁসকল", "সিহঁত", "তেওঁলোকে", "তিনিওঁ", "আমি", "আমাক", "আমাৰ", "আপোনালোকে"]
}
# Flat list of every pronoun form (dictionary order: singular, then plural).
# A comprehension avoids the repeated list copying of sum(lists, []).
all_pronouns = [form for forms in expanded_pronouns.values() for form in forms]

# Honorifics (compared against the start of a token)
expanded_honorifics = ["ড০", "শ্ৰী", "শ্ৰীমতী", "মিঃ", "চাৰ", "মিছ", "শ্ৰীমান", "মহাশয়"]

# Fallback lists, used when the matching text file is missing or empty.
# NOTE: a few original entries were spelled with the Bengali letter "র"
# (U+09B0) instead of the Assamese "ৰ" (U+09F0). The original spellings are
# kept for backward compatibility and the Assamese spellings are appended so
# that text written in standard Assamese orthography is matched as well.
fallback_names = [
    "ৰবি", "পিয়ুষ", "ৰিজু", "ত্ৰিনয়ন", "প্ৰতিশা", "ৰোহিত", "হৃষীকেশ", "বিজয়", "সঞ্জয়", "সুবর্ণা", "সুব্রত", "সৌম্য", "প্ৰিয়া", "সৌমিতা", "পাৰ্থ", "প্ৰণৱ",
    "সুবৰ্ণা", "সুব্ৰত",
]
fallback_locations = ["গুৱাহাটী", "গাঁও", "অসম", "শিলচৰ", "নগাঁও", "ডিব্ৰুগড়", "জোৰহাট", "কোকৰাঝাড়", "বোকাখাট", "বঙাইগাঁও", "কামৰূপ", "কাছাৰ"]
fallback_organizations = [
    "গুৱাহাটী বিশ্ববিদ্যালয়", "অসম চৰকাৰ", "অসম বিশ্ববিদ্যালয়", "আই আই টি গুৱাহাটী", "ডিব্ৰুগড় বিশ্ববিদ্যালয়", "শিলচৰ কলেজ", "কোকৰাঝাড় বিশ্ববিদ্যালয়", "পুথিভঁৰাল", "অসম সাহিত্য সভা", "অসমীয়া সাহিত্য সংসদ", "অসমীয়া সাহিত্য পরিষদ", "অসমীয়া সাহিত্য সোসাইটি",
    "অসমীয়া সাহিত্য পৰিষদ",
]

# Optional gender map. Names or pronouns that are absent are treated as
# "unknown" by the lookups that use these tables.
# Both spellings of "সুবৰ্ণা" are listed so the entry matches the name list.
name_gender_map = {
    "ৰবি": "male", "পিয়ুষ": "male", "প্ৰিয়া": "female", "সুবৰ্ণা": "female", "সুবর্ণা": "female", "সৌমিতা": "female", "প্ৰতিশা": "female"
}
pronoun_gender_map = {
    "তাই": "female", "তেওঁ": "unknown", "সি": "male", "তিনিও": "unknown", "তেওঁক": "unknown", "তেওঁৰ": "unknown"
}

def load_list_from_file(filename, fallback_list):
    """Load a one-entry-per-line word list, or return the fallback.

    Args:
        filename: Path of a UTF-8 text file with one entry per line.
        fallback_list: List returned (as-is, not copied) when the path is not
            a regular file or the file yields no non-blank lines.

    Returns:
        A list of the file's non-blank lines with surrounding whitespace
        removed, or ``fallback_list``.
    """
    # isfile (rather than exists) keeps a directory with this name from
    # reaching open() and raising.
    if not os.path.isfile(filename):
        return fallback_list
    # 'utf-8-sig' drops a leading byte-order mark if present; with plain
    # 'utf-8' the BOM stays glued to the first entry (str.strip() does not
    # remove it), so that entry would never match any token.
    with open(filename, encoding='utf-8-sig') as f:
        loaded = [entry for entry in (line.strip() for line in f) if entry]
    return loaded or fallback_list

# Resolve each gazetteer from its optional text file in the working
# directory; load_list_from_file hands back the built-in fallback list when
# the file is absent or contains no entries.
assamese_names, assamese_locations, assamese_organizations = (
    load_list_from_file(path, default)
    for path, default in (
        ('names.txt', fallback_names),
        ('locations.txt', fallback_locations),
        ('organizations.txt', fallback_organizations),
    )
)

def split_sentences(text):
    """Split text into sentences after each run of "।", "!" or "?".

    The terminator stays attached to its sentence and any whitespace that
    follows it is consumed. A run of terminators such as "?!" is kept
    together instead of being split into a separate "!" sentence.

    Args:
        text: The input text.

    Returns:
        A list of non-empty sentence strings; an empty list for empty or
        whitespace-only text.
    """
    # The lookahead stops the split from firing between adjacent terminators.
    pieces = re.split(r'(?<=[।!?])(?![।!?])\s*', text.strip())
    # The pattern also matches (with zero width) right after a final
    # terminator, which leaves an empty trailing piece; drop such pieces.
    return [piece for piece in pieces if piece]

def extract_named_entities(sentence):
    """Tag organisations, honorifics, names, locations and group nouns.

    The sentence is split on whitespace and punctuation clinging to either
    end of a token is removed before any comparison. At each token position
    the checks run in this order, and the first that succeeds wins:

    1. a multi-word match against ``assamese_organizations`` -> "ORG"
       (the match consumes all of the organisation's tokens);
    2. the token starts with an entry of ``expanded_honorifics`` -> "HON";
    3. the token is in ``assamese_names`` -> "PER";
    4. the token is in ``assamese_locations`` -> "LOC";
    5. the token ends in "সকল" or "লোক" and is not a pronoun -> "GROUP_PER".

    Args:
        sentence: A single sentence string.

    Returns:
        A list of ``(text, index, label)`` tuples, where ``index`` is the
        position of the (first) token within the whitespace-split sentence.
        For "ORG" the text is the organisation entry as listed; otherwise it
        is the token with edge punctuation removed.
    """
    # Whitespace tokenisation leaves sentence/clause punctuation glued to a
    # token ("ৰবি।"), which would defeat every exact or suffix test below.
    edge_punctuation = "।!?,;:\"'()[]"
    words = [token.strip(edge_punctuation) for token in sentence.split()]

    # Loop-invariant lookups, computed once per sentence.
    org_candidates = [(org, org.split()) for org in assamese_organizations]
    honorific_prefixes = tuple(expanded_honorifics)
    # Pronouns such as "তেওঁলোক" / "তেওঁসকল" carry the group suffixes but are
    # not entities; tagging them would let a pronoun become an antecedent.
    pronoun_forms = set(all_pronouns)

    entities = []
    i = 0
    while i < len(words):
        word = words[i]

        # Multi-word ORG detection. Entries that split to zero tokens are
        # skipped: they would "match" without advancing i and loop forever.
        org_hit = next(
            (
                (org, parts)
                for org, parts in org_candidates
                if parts and words[i:i + len(parts)] == parts
            ),
            None,
        )
        if org_hit is not None:
            org, parts = org_hit
            entities.append((org, i, "ORG"))
            i += len(parts)
            continue

        if word.startswith(honorific_prefixes):
            entities.append((word, i, "HON"))
        elif word in assamese_names:
            entities.append((word, i, "PER"))
        elif word in assamese_locations:
            entities.append((word, i, "LOC"))
        # Detect group person entities by suffix (e.g., "ছাত্ৰসকল")
        elif word.endswith(("সকল", "লোক")) and word not in pronoun_forms:
            entities.append((word, i, "GROUP_PER"))
        i += 1
    return entities

def extract_pronouns(sentence):
    """Find the pronouns of a sentence together with their number type.

    Args:
        sentence: A single sentence string.

    Returns:
        A list of ``(pronoun, index, type)`` tuples. ``index`` is the token's
        position in the whitespace-split sentence and ``type`` is the key of
        ``expanded_pronouns`` whose list contains the pronoun. The pronoun is
        reported without punctuation that was attached to the token.
    """
    # Whitespace tokenisation leaves punctuation glued to a token ("তেওঁক।"),
    # so it must be removed before the exact-match lookup.
    edge_punctuation = "।!?,;:\"'()[]"
    pronouns = []
    for i, token in enumerate(sentence.split()):
        word = token.strip(edge_punctuation)
        for typ, pron_list in expanded_pronouns.items():
            if word in pron_list:
                pronouns.append((word, i, typ))
    return pronouns

def is_plural_person_entity(entity_word, entity_type):
    """Report whether an entity stands for a group of people.

    Only the label is consulted at present: an entity counts as plural exactly
    when its type is "GROUP_PER". ``entity_word`` is accepted but unused, so
    word-based heuristics can be added later without touching callers.
    """
    return entity_type == "GROUP_PER"

def rule_based_coref(text):
    """Link pronouns to earlier entities using number and gender rules.

    Positions are token indices over the whole text, obtained by adding each
    sentence's running token offset to the in-sentence index.

    Rules applied to each pronoun, in order:
      * singular pronouns may refer to "PER" or "HON" entities, plural
        pronouns to entities accepted by ``is_plural_person_entity``;
      * only entities located before the pronoun are considered;
      * candidates are tried from the nearest backwards, skipping one only
        when both genders are known and they differ;
      * the first surviving candidate becomes the antecedent.

    Returns:
        A list of clusters. Each cluster is a list whose first item is the
        antecedent ``(text, position, label)`` followed by the linked
        ``(pronoun, position, type)`` tuples in order of discovery.
    """
    sentences = split_sentences(text)

    # Token offset at which each sentence starts.
    offsets = []
    running_total = 0
    for sentence in sentences:
        offsets.append(running_total)
        running_total += len(sentence.split())

    # Every entity mention, re-indexed to document-level positions.
    mentions = [
        (surface, offset + local_index, label)
        for sentence, offset in zip(sentences, offsets)
        for surface, local_index, label in extract_named_entities(sentence)
    ]

    # Which mentions a pronoun of a given number type may refer to.
    eligibility = {
        "singular": lambda mention: mention[2] in ("PER", "HON"),
        "plural": lambda mention: is_plural_person_entity(mention[0], mention[2]),
    }

    clusters = []
    cluster_of = {}
    for sentence, offset in zip(sentences, offsets):
        for pronoun, local_index, number in extract_pronouns(sentence):
            position = offset + local_index
            accepts = eligibility.get(number)
            if accepts is None:
                continue
            candidates = [
                mention for mention in mentions
                if mention[1] < position and accepts(mention)
            ]
            if not candidates:
                continue

            # Gender constraint (optional): nearest compatible candidate wins.
            pronoun_gender = pronoun_gender_map.get(pronoun, "unknown")
            antecedent = None
            for candidate in reversed(candidates):
                candidate_gender = name_gender_map.get(candidate[0], "unknown")
                compatible = (
                    pronoun_gender == "unknown"
                    or candidate_gender == "unknown"
                    or pronoun_gender == candidate_gender
                )
                if compatible:
                    antecedent = candidate
                    break
            if antecedent is None:
                continue

            # Cluster linking: open a cluster on first use of an antecedent.
            cluster = cluster_of.get(antecedent)
            if cluster is None:
                cluster = [antecedent]
                clusters.append(cluster)
                cluster_of[antecedent] = cluster
            cluster.append((pronoun, position, number))

    return clusters

if __name__ == "__main__":
    # Demo: resolve a short sample text and print each cluster's members.
    text = "ছাত্ৰসকল পাঠশালাত গৈছিল। তেওঁলোক ভাল মানুহ। শান্তি ঘৰলৈ আগতিল। তেওঁ মেলা চাবলৈ বক্তৃতা দিছিল।"
    clusters = rule_based_coref(text)

    print("Coreference Clusters:")
    for number, members in enumerate(clusters, start=1):
        print(f"\nCluster {number}:")
        for surface, position, label in members:
            print(f" - {surface} (position {position}, type {label})")


Coreference Clusters:

Cluster 1:
 - ছাত্ৰসকল (position 0, type GROUP_PER)
 - তেওঁলোক (position 3, type plural)

Cluster 2:
 - শান্তি (position 6, type PER)
 - তেওঁ (position 9, type singular)
