In [None]:
!pip install -U spacy[transformers]
!python -m spacy download en_core_web_trf

In [None]:
import spacy
# Activate the GPU for spaCy before any pipeline is loaded, then report the
# status returned by prefer_gpu().
spacy.require_gpu()
gpu_active = spacy.prefer_gpu()
print("Using GPU:", gpu_active)

In [None]:
# Mount Google Drive at /content/drive; the input and output paths used by
# the cells below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#secondary part
# Re-run NER over the acknowledgment text already stored in a JSON file:
# every entry gets a fresh "ner" mapping (ORG / PERSON entities found in its
# "AckSection"), any "entity_roles" key is dropped, and the result is written
# to a new JSON file.
import json

nlp = spacy.load("en_core_web_trf")

input_json_path = "/content/drive/MyDrive/ack_clen_newline_final.json"
output_json_path = "/content/drive/MyDrive/new_ner_output.json"

with open(input_json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

for entry in data.values():
    # "AckSection" may be absent or JSON null (loaded as None); both are
    # treated as "no acknowledgment text" instead of crashing on .strip().
    ack_text = (entry.get("AckSection") or "").strip()

    orgs = []
    persons = []
    if ack_text:
        doc = nlp(ack_text)
        # dict.fromkeys de-duplicates while keeping first-appearance order,
        # so the written lists are identical from one run to the next.
        orgs = list(dict.fromkeys(
            ent.text.strip() for ent in doc.ents if ent.label_ == "ORG"
        ))
        persons = list(dict.fromkeys(
            ent.text.strip() for ent in doc.ents if ent.label_ == "PERSON"
        ))

    entry["ner"] = {
        "orgs": orgs,
        "person": persons
    }

    # Remove the "entity_roles" field if the entry carries one.
    entry.pop("entity_roles", None)

with open(output_json_path, "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

print("done")


In [None]:
# Drive folder passed to process_folder(), which processes its ".txt" files.
folder_path = "/content/drive/MyDrive/texts"

In [None]:
#original ackextract
import re
import json
import os

# spaCy pipeline shared by the helpers below (sentence splitting via
# doc.sents and entity recognition via doc.ents).
nlp = spacy.load("en_core_web_trf")

# Regexes tested with re.match against the stripped, lower-cased text of a
# sentence (see is_acknowledgment_start). "acknowledg(e)?ments?" covers the
# spellings acknowledgment(s) / acknowledgement(s).
ACK_START_PATTERNS = [
    # heading immediately followed by a period and nothing else
    r'^acknowledg(e)?ments?\.$',
    # heading alone, optionally followed by "." or ":"
    r'^acknowledg(e)?ments?[\.:]?\s*$',
    # run-in heading: heading (optional "." / ":"), whitespace, then more text
    r'^acknowledg(e)?ments?[\.:]?\s+',
    # heading optionally wrapped in asterisks, e.g. "**acknowledgments**"
    r'^\**acknowledg(e)?ments?\**[\.:]?\s*$',
    # "section <N>" prefix, e.g. "section 7: acknowledgments"
    r'^section\s*\d+[:\.\)]?\s*acknowledg(e)?ments?$',
    # numbered heading, e.g. "7. acknowledgments"
    r'^\d+\.\s*acknowledg(e)?ments?$',
    # heading followed by a line break
    # NOTE(review): extract_acknowledgments replaces "\n" with spaces before
    # sentences are tested, so this pattern looks unreachable from there - confirm.
    r'^acknowledg(e)?ments?\s*\n',

]
# Section titles that end an acknowledgment passage; compared by exact
# equality with the stripped, lower-cased sentence text (see is_section_end).
SECTION_STOP_WORDS = [
    "references", "bibliography", "appendix", "abstract"
]

def is_acknowledgment_start(sent):
    """Return True if the sentence text opens an acknowledgment section.

    ``sent.text`` is stripped and lower-cased, then tested with ``re.match``
    against every entry of ``ACK_START_PATTERNS``; one hit is enough.
    """
    candidate = sent.text.strip().lower()
    for pattern in ACK_START_PATTERNS:
        if re.match(pattern, candidate):
            return True
    return False

def is_section_end(sent):
    """Return True if the sentence text is exactly one of SECTION_STOP_WORDS.

    The comparison is made on the stripped, lower-cased ``sent.text``.
    """
    normalized = sent.text.strip().lower()
    return normalized in SECTION_STOP_WORDS

def extract_acknowledgments(text):
    """Locate the acknowledgment passage in *text* and tag its entities.

    Newlines are replaced with spaces, the text is run through ``nlp`` and
    its sentences are scanned in order:

    * a sentence that satisfies ``is_acknowledgment_start`` or whose
      lower-cased text begins with "acknowledg" is always collected;
    * once collecting, scanning stops at the first other sentence that is a
      section stop word, starts with a number followed by a period, or has
      fewer than three whitespace-separated words;
    * every other sentence seen while collecting is kept.

    The kept sentences are joined with spaces and passed through ``nlp``
    again to pick up ORG and PERSON entities.

    Returns:
        A tuple ``(ack_text, orgs, persons)``. ``orgs`` and ``persons`` are
        de-duplicated lists in order of first appearance. When no sentence
        was collected the result is ``(None, [], [])``.
    """
    doc = nlp(text.replace('\n', ' '))
    ack_sentences = []
    collecting = False

    for sent in doc.sents:
        stripped = sent.text.strip()

        if is_acknowledgment_start(sent) or stripped.lower().startswith("acknowledg"):
            collecting = True
            ack_sentences.append(sent)
            continue

        if not collecting:
            continue

        # Any of these marks the end of the acknowledgment passage.
        if (
            is_section_end(sent)
            or re.match(r'^[1-9]\d*\.', stripped)
            or len(stripped.split()) < 3
        ):
            break
        ack_sentences.append(sent)

    if not ack_sentences:
        return None, [], []

    ack_text = ' '.join(sent.text for sent in ack_sentences)
    ack_doc = nlp(ack_text)

    # dict.fromkeys de-duplicates while keeping first-appearance order; a
    # plain set() would make the list order differ between interpreter runs.
    orgs = list(dict.fromkeys(
        ent.text.strip() for ent in ack_doc.ents if ent.label_ == "ORG"
    ))
    persons = list(dict.fromkeys(
        ent.text.strip() for ent in ack_doc.ents if ent.label_ == "PERSON"
    ))

    return ack_text.strip(), orgs, persons

def extract_ack_from_tail(text):
    """Return the text between the last acknowledgment heading and the references.

    Runs of spaces/tabs are collapsed first. The references boundary is the
    first paragraph (preceded by a blank line) that starts with "reference",
    "references" or "bibliography", case-insensitively. Within the text
    before that boundary, the LAST paragraph starting with an
    "acknowledg(e)ment(s)" heading marks the beginning of the section.

    Returns:
        The section (heading included) with all whitespace runs collapsed to
        single spaces, or ``None`` when either heading is missing.
    """
    text = re.sub(r'[ \t]+', ' ', text)

    ref_match = re.search(
        r'\n\n(?:references?|bibliography)\b', text, flags=re.IGNORECASE
    )
    if ref_match is None:
        return None
    ref_start = ref_match.start()

    # The pattern deliberately has no trailing ".*": with DOTALL the first
    # match would swallow the rest of the text and hide every later heading,
    # so the loop could never reach the last one.
    ack_start = None
    for match in re.finditer(
        r'\n\nacknowledg(e)?ments?\b', text[:ref_start], flags=re.IGNORECASE
    ):
        ack_start = match.start()

    if ack_start is None:
        return None

    return ' '.join(text[ack_start:ref_start].split())

def extract_ner_org_affiliations(text_block):
    """Return the distinct ORG entity strings that ``nlp`` finds in *text_block*.

    Each entity text is stripped; duplicates are removed and the list keeps
    the order of first appearance, so repeated runs give the same list.
    """
    doc = nlp(text_block)
    # dict.fromkeys: order-preserving de-duplication (set order is not stable).
    return list(dict.fromkeys(
        ent.text.strip() for ent in doc.ents if ent.label_ == "ORG"
    ))

def extract_ner_prs_affiliations(text_block):
    """Return the distinct PERSON entity strings that ``nlp`` finds in *text_block*.

    Each entity text is stripped; duplicates are removed and the list keeps
    the order of first appearance, so repeated runs give the same list.
    """
    doc = nlp(text_block)
    # dict.fromkeys: order-preserving de-duplication (set order is not stable).
    return list(dict.fromkeys(
        ent.text.strip() for ent in doc.ents if ent.label_ == "PERSON"
    ))

def process_file(filepath):
    """Extract the acknowledgment section and its entities from one text file.

    The file is read as UTF-8 and handed to ``extract_acknowledgments``. If
    that finds nothing, ``extract_ack_from_tail`` is tried and, when it
    succeeds, ORG / PERSON entities are taken from its result.

    Returns:
        ``{"ner": {"orgs": [...], "person": [...]}, "AckSection": text}``
        where ``text`` is ``None`` (and both lists are empty) when neither
        strategy found a section.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        raw_text = f.read()

    ack_section, orgs, persons = extract_acknowledgments(raw_text)

    if ack_section is None:
        ack_section = extract_ack_from_tail(raw_text)
        orgs = []
        persons = []
        if ack_section is not None:
            # A single pipeline pass serves both labels; running the model
            # once per label would repeat the same inference.
            doc = nlp(ack_section)
            orgs = list(dict.fromkeys(
                ent.text.strip() for ent in doc.ents if ent.label_ == "ORG"
            ))
            persons = list(dict.fromkeys(
                ent.text.strip() for ent in doc.ents if ent.label_ == "PERSON"
            ))

    return {
        "ner": {
            "orgs": orgs,
            "person": persons
        },
        "AckSection": ack_section
    }


def process_folder(folder_path):
    """Run ``process_file`` on every ``.txt`` entry of *folder_path*.

    Returns:
        A dict mapping each file name (not the full path) to the result of
        ``process_file``; entries are inserted in sorted file-name order.
    """
    results = {}
    # os.listdir returns names in arbitrary order; sorting keeps the key
    # order of the result (and of any JSON dumped from it) reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue
        results[filename] = process_file(os.path.join(folder_path, filename))
    return results


# Process every text file in the configured folder ...
output = process_folder(folder_path)

# ... and store the per-file results as pretty-printed UTF-8 JSON on Drive.
with open("/content/drive/MyDrive/newAck.json", "w", encoding="utf-8") as out_file:
    json.dump(output, out_file, indent=2, ensure_ascii=False)
