In [88]:
# Transform our 50 JSON files into a single text string
import json
import os

# Folder that contains the JSON files to compile
folder_path = "C:/Users/user/Downloads/pfe_project/assignementdataset/"

# Compiled text of every successfully processed file, in processing order
all_compiled_texts = []


def _author_display_name(author):
    """Return the author's first, middle and last name joined by single spaces.

    Missing or empty name parts are skipped, so an author without a middle
    name yields "First Last" rather than "First  Last".
    """
    parts = []
    for key in ('first', 'middle', 'last'):
        value = author.get(key, '')
        # NOTE(review): a name part may be stored as a list (e.g. several middle
        # names) — handled defensively here; confirm against the actual files.
        if isinstance(value, (list, tuple)):
            parts.extend(str(item).strip() for item in value if item)
        elif value:
            parts.append(str(value).strip())
    return " ".join(part for part in parts if part)


# os.listdir() returns entries in arbitrary order; sort them so the combined
# text (which later cells slice by character position) is reproducible.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.json'):
        continue

    try:
        # Construct full file path
        file_path = os.path.join(folder_path, filename)

        # Load the JSON file
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # Extract title
        title = data.get("title", "")

        # Extract authors' names.
        # `authors` is overwritten for every file, so after the loop it holds
        # the authors of the last file processed.
        authors = [_author_display_name(author) for author in data.get("authors", [])]

        # Extract text from pdf_parse (abstract and body_text)
        pdf_parse = data.get("pdf_parse", {})
        abstract = " ".join(item.get("text", "") for item in pdf_parse.get("abstract", []))
        body_text = " ".join(item.get("text", "") for item in pdf_parse.get("body_text", []))

        # Compile text for this file
        compiled_text = "\n".join([
            f"File: {filename}",
            f"Title: {title}",
            f"Authors: {', '.join(authors)}",
            f"Abstract: {abstract}",
            f"Body Text: {body_text}",
            "\n" + "="*50 + "\n"  # Separator between files
        ])

        # Add to list of all compiled texts
        all_compiled_texts.append(compiled_text)

        print(f"Successfully processed: {filename}")

    except Exception as e:
        # Best effort: report the failing file and continue with the next one
        print(f"Error processing {filename}: {str(e)}")

# Combine all texts into one string
final_compiled_text = "\n".join(all_compiled_texts)
final_compiled_text

Successfully processed: -em-Fertility-and-Sterility--em--top-videos-from-2.grobid.tei.json
Successfully processed: A-&#x201c;first&#x201d;-on-the-horizon--the-expans.grobid.tei.json
Successfully processed: A-behind-the-scenes-look-at-retroperitoneal-ectopi.grobid.tei.json
Successfully processed: A-call-to-action-to-reproductive-endocrinologists-.grobid.tei.json
Successfully processed: A-case-report-of-retroperitoneal-ectopic-pregnancy.grobid.tei.json
Successfully processed: A-combination-of-two-novel-ligation-techniques-for.grobid.tei.json
Successfully processed: A-diagnosis-of-diminished-ovarian-reserve-does-not.grobid.tei.json
Successfully processed: A-framework-approach-for-hysteroscopic-uterine-sep.grobid.tei.json
Successfully processed: A-higher-number-of-oocytes-retrieved-is-associated.grobid.tei.json
Successfully processed: A-nail-in-the-coffin--the-antim&#xfc;llerian-hormo.grobid.tei.json
Successfully processed: A-novel-approach-using-vaginal-natural-orifice-tra.grobid.tei.json



In [91]:
# Clean the text: remove HTML tags and keep only alphabetic words and dates
import re

def clean_text(text):
    """Strip HTML tags and keep only alphabetic words and slash-separated dates.

    Parameters
    ----------
    text : str
        Raw text, possibly containing HTML tags.

    Returns
    -------
    str
        The kept words and dates (e.g. ``12/03/2020``), lower-cased, separated
        by single spaces, in the order in which they occur in ``text``.
    """
    # Remove HTML tags.
    # NOTE(review): the lazy pattern also removes ordinary text that sits between
    # a literal '<' and a later '>' on the same line (e.g. "p < 0.05 ... > 3").
    text = re.sub(r'<.*?>', '', text)

    # One left-to-right scan over both alternatives keeps every date at the
    # position where it occurs; collecting words and dates separately and
    # concatenating the two lists would push all dates to the end.
    token_pattern = r'\b\d{1,2}/\d{1,2}/\d{4}\b|\b[A-Za-z]+\b'
    cleaned_elements = re.findall(token_pattern, text)

    # Join the elements and convert to lowercase
    return ' '.join(cleaned_elements).lower()


# Clean the combined corpus; `text` is the input that the NER cells below slice and annotate
text = clean_text(final_compiled_text)


In [92]:
# Medical entities: diseases and chemicals
import spacy
import en_ner_bc5cdr_md
from spacy import displacy
import pandas as pd

# Load the biomedical NER model; it is used as a module-level pipeline by tokenize_and_label_bio
nlp_bio = en_ner_bc5cdr_md.load()

def tokenize_and_label_bio(text, visualize=True):
    """Run the biomedical pipeline on ``text`` and collect per-token annotations.

    Parameters
    ----------
    text : str
        Text to process with the module-level ``nlp_bio`` pipeline.
    visualize : bool, optional
        When True (the default, which matches the previous behaviour) the
        detected entities are rendered inline with displaCy.

    Returns
    -------
    tuple of list of str
        ``(tokens, ner_labels, pos_tags, dep_labels, lemmas)`` — five parallel
        lists with one entry per token. NER labels follow the BIO scheme:
        ``B-<TYPE>``, ``I-<TYPE>`` or ``O``.
    """
    # Process with biomedical NER
    doc_bio = nlp_bio(text)

    tokens = []
    ner_labels = []
    pos_tags = []
    lemmas = []
    dep_labels = []

    for token in doc_bio:
        tokens.append(token.text)

        if token.ent_type_:
            # Use spaCy's own begin/inside marker. Inferring "B" from a change of
            # entity type relative to the previous token would fuse two adjacent
            # entities of the same type into one span.
            ner_label = f"{token.ent_iob_}-{token.ent_type_}"
        else:
            ner_label = 'O'
        ner_labels.append(ner_label)

        # POS tags, dependency labels and lemmas come from the same (biomedical) doc
        pos_tags.append(token.pos_)
        dep_labels.append(token.dep_)
        lemmas.append(token.lemma_)

    # Visualize entities
    if visualize:
        displacy.render(doc_bio, style="ent", jupyter=True)

    return tokens, ner_labels, pos_tags, dep_labels, lemmas

# Example usage
# Only the first 100000 characters of the cleaned text are annotated; the cut is
# character-based, so it can fall in the middle of a word.
compiled_text = text[:100000]
tokens, ner_labels, pos_tags, dep_labels, lemmas = tokenize_and_label_bio(compiled_text)




In [93]:
# Gather the biomedical annotations into one table with one row per token
bio_columns = ['Token', 'NER', 'POS', 'DEP', 'Lemma']
bio_values = [tokens, ner_labels, pos_tags, dep_labels, lemmas]
df_1 = pd.DataFrame(dict(zip(bio_columns, bio_values)))





In [94]:
# Shape of the biomedical table: (number of tokens, number of annotation columns)
df_1.shape

(15777, 5)

In [115]:
# Distinct BIO labels produced by the biomedical pass
print(set(ner_labels))

{'O', 'I-DISEASE', 'B-CHEMICAL', 'I-CHEMICAL', 'B-DISEASE'}


In [98]:
# General entities: DATE, PERSON, LOC, ORG and GPE

# Load the general-purpose model; it is used as a module-level pipeline by tokenize_and_label_general
nlp_gen = spacy.load("en_core_web_sm")

def tokenize_and_label_general(text, visualize=True):
    """Run the general-purpose pipeline on ``text`` and collect per-token annotations.

    Only PERSON, GPE, LOC, ORG and DATE entities are kept; tokens belonging to
    any other entity type are labelled ``O``. A token whose text starts with a
    ``d/d/yyyy``-style date and that was not tagged by the model is labelled
    ``B-DATE``.

    Parameters
    ----------
    text : str
        Text to process with the module-level ``nlp_gen`` pipeline.
    visualize : bool, optional
        When True (the default, which matches the previous behaviour) the
        detected entities are rendered inline with displaCy.

    Returns
    -------
    tuple of list of str
        ``(tokens, ner_labels, pos_tags, dep_labels, lemmas)`` — five parallel
        lists with one entry per token. NER labels follow the BIO scheme.
    """
    kept_entity_types = ('PERSON', 'GPE', 'LOC', 'ORG', 'DATE')

    doc_gen = nlp_gen(text)

    tokens = []
    ner_labels = []
    pos_tags = []
    dep_labels = []
    lemmas = []

    for token in doc_gen:
        tokens.append(token.text)

        if token.ent_type_ in kept_entity_types:
            # Use spaCy's own begin/inside marker. Inferring "B" from a change of
            # entity type relative to the previous token would fuse two adjacent
            # entities of the same type into one span.
            ner_label = f"{token.ent_iob_}-{token.ent_type_}"
        elif re.match(r'\d{1,2}/\d{1,2}/\d{4}', token.text):
            # Fallback for numeric dates the model did not tag
            ner_label = 'B-DATE'
        else:
            ner_label = 'O'

        ner_labels.append(ner_label)
        pos_tags.append(token.pos_)
        dep_labels.append(token.dep_)
        lemmas.append(token.lemma_)

    # Visualize entities
    if visualize:
        displacy.render(doc_gen, style="ent", jupyter=True)

    return tokens, ner_labels, pos_tags, dep_labels, lemmas

# Example usage
# Same 100000-character slice as the biomedical pass, so both passes see identical input
compiled_text = text[:100000]
tokens_2, ner_labels_2, pos_tags_2, dep_labels_2, lemmas_2 = tokenize_and_label_general(compiled_text)






In [99]:
# Distinct BIO labels produced by the general-purpose pass
set(ner_labels_2)

{'B-DATE',
 'B-GPE',
 'B-LOC',
 'B-ORG',
 'B-PERSON',
 'I-DATE',
 'I-GPE',
 'I-LOC',
 'I-ORG',
 'I-PERSON',
 'O'}

In [100]:
# Gather the general-purpose annotations into one table with one row per token
general_columns = ['Token', 'NER', 'POS', 'DEP', 'Lemma']
general_values = [tokens_2, ner_labels_2, pos_tags_2, dep_labels_2, lemmas_2]
df_2 = pd.DataFrame(dict(zip(general_columns, general_values)))

In [102]:
# Frequency of each biomedical BIO label
df_1['NER'].value_counts()

NER
O             15367
B-DISEASE       156
I-DISEASE       129
B-CHEMICAL       70
I-CHEMICAL       55
Name: count, dtype: int64

In [103]:
# Frequency of each general-purpose BIO label
df_2['NER'].value_counts()

NER
O           14959
I-ORG         263
I-PERSON      180
B-ORG         130
B-PERSON      108
B-GPE          50
I-DATE         39
B-DATE         32
I-GPE          14
B-LOC           1
I-LOC           1
Name: count, dtype: int64

In [104]:
# Combine the labels of the two DataFrames to build one dataset
ner_bio = df_1['NER'].tolist()
ner_gen = df_2['NER'].tolist()

# zip() silently stops at the shorter input, which would misalign labels and
# tokens; fail loudly if the two passes did not yield the same number of tokens.
if len(ner_bio) != len(ner_gen):
    raise ValueError(
        f"Cannot merge NER labels: biomedical pass has {len(ner_bio)} tokens, "
        f"general pass has {len(ner_gen)}"
    )

# The general-model label wins whenever it is set; otherwise the biomedical
# label (which may itself be 'O') is used.
# NOTE(review): the choice is made per token, so when entities from the two
# models overlap only partially, a biomedical span can be cut and start with
# an I- label.
ner_bio_gen = [
    bio_label if gen_label == 'O' else gen_label
    for bio_label, gen_label in zip(ner_bio, ner_gen)
]

# Store all entities together in one column
df_2['NER'] = ner_bio_gen



15777

In [106]:
# Frequency of each label after merging biomedical and general-purpose annotations
df_2['NER'].value_counts()

NER
O             14583
I-ORG           263
I-PERSON        180
B-DISEASE       151
B-ORG           130
I-DISEASE       109
B-PERSON        108
B-CHEMICAL       67
B-GPE            50
I-CHEMICAL       49
I-DATE           39
B-DATE           32
I-GPE            14
B-LOC             1
I-LOC             1
Name: count, dtype: int64

In [109]:
# Try to fix wrong PERSON annotations using the author names read from the JSON files.
# NOTE(review): `authors` is reassigned for every file in the loading loop, so here
# it only holds the authors of the last file that was processed.

# Normalise the names the same way as the corpus so they can match the tokens
stripped_list = [clean_text(s) for s in authors]

# One list of words per author; names that are empty after cleaning are dropped
split_names = [name.split() for name in stripped_list if name]

# Flat list of all name words (kept for inspection)
split_names_ = [word for name_words in split_names for word in name_words]

# The first word of a name begins the entity and every following word continues it.
# Labelling by position inside each name (instead of by the parity of the flat
# list) keeps a single-word name from shifting the labels of all later names.
dictionary = {}
for name_words in split_names:
    for position, word in enumerate(name_words):
        dictionary[word] = 'B-PERSON' if position == 0 else 'I-PERSON'

# Relabel every token that matches an author word; all other tokens keep their label.
# NOTE(review): the lookup ignores context, so any token equal to an author word
# is relabelled wherever it occurs.
author_labels = df_2['Token'].map(dictionary)
df_2['NER'] = df_2['NER'].where(author_labels.isna(), author_labels)



In [112]:
# Save the dataset as CSV in the current working directory.
# The row index is written too (to_csv default), as an extra unnamed first column.
df_2.to_csv('hassen_dataset_global.csv')

In [116]:
# Display the final annotated dataset
df_2

Unnamed: 0,Token,NER,POS,DEP,Lemma
0,file,O,VERB,advcl,file
1,em,O,PRON,dative,em
2,fertility,O,NOUN,dobj,fertility
3,and,O,CCONJ,cc,and
4,sterility,O,NOUN,conj,sterility
...,...,...,...,...,...
15772,hysteroscopic,O,NOUN,amod,hysteroscopic
15773,approach,O,NOUN,pobj,approach
15774,and,O,CCONJ,cc,and
15775,operator,O,NOUN,compound,operator
