In [58]:
import pandas as pd

# Load the combined pipeline output (one row per publication record). The
# scoring cell below reads its DOI, BART_MNLI_Score, author-organisation,
# MeSH flag and GPT-extracted columns from this frame.
df = pd.read_excel("data/final_output/final_output2.xlsx")

In [59]:
import ast

def score_results(data, mnli_threshold=0.7):
    """Build a per-publication evidence table with an additive ``Count`` score.

    Nine independent yes/no signals are derived from ``data`` and summed into
    ``Count`` (so ``Count`` ranges from 0 to 9):

    1. ``BART_MNLI_Score >= mnli_threshold``
    2. first author affiliated with one of the target organisations
    3. last author affiliated with one of the target organisations
    4. ``Animals_Used`` flag
    5. ``In_Vivo`` flag
    6. ``animal_testing == 'yes'``
    7. ``in_vivo == 'yes'``
    8. ``location`` mentions "radboud" or "nijmegen" (case-insensitive)
    9. ``approving_organization`` mentions "radboud", "nijmegen" or
       "netherlands" (case-insensitive)

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``DOI``, ``BART_MNLI_Score``,
        ``First_Author_Organization``, ``Last_Author_Organization``,
        ``Animals_Used``, ``In_Vivo``, ``animal_testing``, ``in_vivo``,
        ``location``, ``approving_organization`` and ``species``.
        The two author-organisation columns are expected to hold stringified
        Python lists; missing values are treated as "no organisation".
    mnli_threshold : float, default 0.7
        Minimum ``BART_MNLI_Score`` that counts as a positive signal.

    Returns
    -------
    pandas.DataFrame
        Same index as ``data`` with the columns ``DOI``, ``Count``, ``Score``,
        ``First_author``, ``Last_author``, ``Animals_Used_MesH``,
        ``In_Vivo_MesH``, ``Animals_Used_GPT``, ``In_Vivo_GPT``,
        ``Location_Radboud``, ``Location``, ``Apr_org_netherlands``,
        ``Approving_organization`` and ``Species`` (in that order).

    Raises
    ------
    KeyError
        If a required column is missing.
    ValueError
        If ``Animals_Used`` / ``In_Vivo`` contain values that cannot be cast
        to ``int`` (for example NaN).
    """
    target_orgs = {
        'Radboud University Nijmegen',
        'Radboud University Medical Center',
        'Radboud Institute for Molecular Life Sciences'
    }

    def to_list_safe(x):
        """Convert a (stringified) list into a Python list without raising."""
        # Already a container: nothing to parse. This must come before
        # pd.isna, which returns an array for list-like input.
        if isinstance(x, (list, tuple, set, frozenset)):
            return list(x)
        if pd.isna(x):
            return []
        try:
            parsed = ast.literal_eval(x)
        except (ValueError, SyntaxError):
            # Not a Python literal (e.g. a bare organisation name): keep the
            # raw value as a single entry instead of aborting the whole run.
            return [x]
        if isinstance(parsed, (list, tuple, set, frozenset, dict)):
            return list(parsed)
        # A scalar literal such as "'Some University'": wrap it so the
        # membership test does not iterate over its characters.
        return [parsed]

    def has_target_org(raw):
        """True when at least one listed organisation is a target organisation."""
        # The isinstance guard keeps unhashable entries from raising in the
        # set-membership test.
        return any(
            isinstance(org, str) and org in target_orgs
            for org in to_list_safe(raw)
        )

    # Compute every signal exactly once; each one feeds both Count and its
    # own output column.
    mnli_hit = data['BART_MNLI_Score'] >= mnli_threshold
    first_author = data['First_Author_Organization'].apply(has_target_org).astype(bool)
    last_author = data['Last_Author_Organization'].apply(has_target_org).astype(bool)
    animals_mesh = data['Animals_Used']
    in_vivo_mesh = data['In_Vivo']
    animals_gpt = data['animal_testing'] == 'yes'
    in_vivo_gpt = data['in_vivo'] == 'yes'
    # na=False: a missing location / approving organisation counts as "no match".
    location_hit = data['location'].str.contains(
        'radboud|nijmegen', case=False, na=False
    )
    approval_hit = data['approving_organization'].str.contains(
        'radboud|nijmegen|netherlands', case=False, na=False
    )

    output = data[['DOI']].copy()

    # Additive score over the nine signals.
    output['Count'] = 0
    for flag in (
        mnli_hit,
        first_author,
        last_author,
        animals_mesh,
        in_vivo_mesh,
        animals_gpt,
        in_vivo_gpt,
        location_hit,
        approval_hit,
    ):
        output['Count'] += flag.astype(int)

    # Individual signals and the raw text they were derived from.
    output['Score'] = data['BART_MNLI_Score']
    output['First_author'] = first_author
    output['Last_author'] = last_author
    output['Animals_Used_MesH'] = animals_mesh
    output['In_Vivo_MesH'] = in_vivo_mesh
    output['Animals_Used_GPT'] = animals_gpt
    output['In_Vivo_GPT'] = in_vivo_gpt
    output['Location_Radboud'] = location_hit
    output['Location'] = data['location']
    output['Apr_org_netherlands'] = approval_hit
    output['Approving_organization'] = data['approving_organization']
    output['Species'] = data['species']

    return output
    
# Build the per-publication evidence table (individual flags plus the
# additive Count) from the raw pipeline output loaded above.
output = score_results(df)

In [60]:
import pandas as pd
import re

def evaluate_row(data):
    """Classify one scored row with an ordered ladder of hand-written rules.

    The rules are evaluated top to bottom and the first one that matches
    decides the outcome, so their ORDER IS SIGNIFICANT; do not reorder them.

    Parameters
    ----------
    data : pandas.Series (or any mapping)
        One row of the scoring table. All of these keys are read:
        ``Count``, ``Score``, ``First_author``, ``Last_author``,
        ``Animals_Used_MesH``, ``In_Vivo_MesH``, ``Animals_Used_GPT``,
        ``In_Vivo_GPT``, ``Location_Radboud``, ``Location``,
        ``Apr_org_netherlands`` and ``Approving_organization``.

    Returns
    -------
    bool or int
        ``True`` / ``False`` for the classification, or the sentinel ``99``
        when no rule matched. The sentinel is not expected for rows whose
        flags are consistent with their ``Location`` /
        ``Approving_organization`` text.

    Notes
    -----
    ``Location`` and ``Approving_organization`` are compared against the
    placeholder strings ``'No location mentioned'`` and
    ``'No approval mentioned'``. A missing (NaN) value is *not* equal to the
    placeholder, so it is handled like an explicit mention that did not match.
    """
    count = data['Count']
    score = data['Score']
    first_author = data['First_author']
    last_author = data['Last_author']
    animals_mesh = data['Animals_Used_MesH']
    in_vivo_mesh = data['In_Vivo_MesH']
    location_hit = data['Location_Radboud']
    approval_hit = data['Apr_org_netherlands']
    approving_org = data['Approving_organization']
    no_location = data['Location'] == 'No location mentioned'
    no_approval = approving_org == 'No approval mentioned'

    # Every signal agrees.
    if count == 9:
        return True

    # Hard exclusions: no evidence of animal use at all.
    if not animals_mesh and score < 0.7:
        return False
    if not data['In_Vivo_GPT'] or not data['Animals_Used_GPT']:
        return False

    # Only the location signal is missing, and simply because none was mentioned.
    if count == 8 and no_location:
        return True

    # An approving organisation is named, but it is not a Dutch/Radboud one.
    if not approval_hit and not no_approval:
        return False
    # Nothing ties the work to the institute.
    if not first_author and not last_author and no_location and no_approval:
        return False
    # A location is named, but it is not Radboud/Nijmegen.
    if not location_hit and not no_location:
        return False

    # From here on: approval is either matching or absent, and location is
    # either matching or absent.
    if count == 7 and not animals_mesh and not in_vivo_mesh:
        return True
    if first_author and last_author and approval_hit and no_location:
        return True
    if count == 8 and score < 0.7:
        return True
    if location_hit and approval_hit:
        return True
    if not first_author and not last_author and not location_hit:
        return False
    # Approval explicitly names Radboud/Nijmegen (stricter than the
    # "netherlands" match included in Apr_org_netherlands). The isinstance
    # guard keeps a missing value from raising TypeError in re.search.
    if isinstance(approving_org, str) and re.search(
        r'radboud|nijmegen', approving_org, re.IGNORECASE
    ):
        return True
    if first_author and last_author and location_hit and no_approval:
        return True
    if first_author and last_author and no_location and no_approval:
        return True
    if first_author and not last_author and no_location and no_approval:
        return True
    if not first_author and last_author and no_location and no_approval:
        return False
    if first_author and location_hit:
        return True
    if not first_author and not last_author:
        return False
    if not first_author and not location_hit:
        return False
    if location_hit:
        return True
    if first_author and approval_hit:
        return True

    # No rule matched: keep the historical sentinel so such rows stay visible
    # in value_counts() instead of being silently classified.
    return 99

# Classify every publication: axis=1 hands each row to evaluate_row as a
# Series, and the returned value (True / False / 99) is stored in 'result'.
output['result'] = output.apply(evaluate_row, axis=1)


In [61]:
# Sanity check on the classification: only True/False should appear here;
# a 99 entry would mean a row fell through every rule in evaluate_row.
output.result.value_counts()

result
False    41355
True       882
Name: count, dtype: int64

In [67]:
import numpy as np

def modify_for_tableau(
    data: pd.DataFrame,
    species_mapping_path="species_mapping.xlsx",
    publicaties_path='data/publicaties.xlsx',
) -> pd.DataFrame:
    """Reshape the classified publications into the long format used for Tableau.

    Steps:

    1. add an ``Auteur`` label derived from ``First_author`` / ``Last_author``;
    2. split the semicolon-separated ``Species`` text and emit one row per
       species, but only for rows whose ``result`` is ``True`` (other rows
       keep a single row with a missing ``Species``);
    3. standardise species names through the mapping workbook, keeping the
       original name when the mapping has no entry;
    4. left-join ``Faculteit``, ``Onderzoeksinstituut`` and ``Jaar uitgave``
       from the publications workbook on DOI.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``DOI``, ``First_author``, ``Last_author`` (boolean),
        ``Species`` (text) and ``result``.
    species_mapping_path : path-like, default "species_mapping.xlsx"
        Workbook with the columns ``Species`` and ``Standardized Name``.
    publicaties_path : path-like, default 'data/publicaties.xlsx'
        Workbook with the columns ``DOI nummer``, ``Faculteit``,
        ``Onderzoeksinstituut`` and ``Jaar uitgave``.

    Returns
    -------
    pandas.DataFrame
        The reshaped frame with a fresh 0..n-1 index.

    Raises
    ------
    FileNotFoundError
        If one of the workbooks does not exist.
    KeyError
        If a required column is missing from ``data`` or a workbook.
    """
    # Read both lookups up front so a missing or unreadable file fails before
    # any transformation work is done.
    species_mapping = pd.read_excel(species_mapping_path)
    publicaties = pd.read_excel(publicaties_path)

    # Work on a copy: the caller's frame must stay usable if this function
    # raises part-way through or the cell is re-run.
    data = data.copy()

    # Auteur column (vectorized conditions); the first matching condition wins.
    conditions = [
        data["First_author"] & data["Last_author"],
        data["First_author"],
        data["Last_author"],
    ]
    choices = [
        "Eerste en laatste auteur",
        "Eerste auteur",
        "Laatste auteur",
    ]
    data["Auteur"] = np.select(conditions, choices, default="Geen van beide")

    # Species is only reported for positive rows. Blank it for all other rows
    # BEFORE exploding, otherwise those rows are multiplied once per species
    # and then blanked, leaving exact duplicate rows.
    is_positive = data["result"] == True
    data["Species"] = data["Species"].where(is_positive)

    # Split on semicolon (surrounding whitespace ignored), one row per species.
    data["Species"] = data["Species"].str.split(r"\s*;\s*")
    data = data.explode("Species", ignore_index=True)

    # Standardise names; species missing from the mapping keep their original
    # spelling.
    mapping_dict = species_mapping.set_index("Species")["Standardized Name"].to_dict()
    data["Species"] = data["Species"].map(mapping_dict).fillna(data["Species"])

    # Normalise the missing-value marker for non-positive rows.
    data.loc[data["result"] != True, "Species"] = pd.NA

    # Left join the publication metadata on DOI.
    # NOTE(review): repeated 'DOI nummer' values in the publications workbook
    # would multiply rows here - confirm the key is unique.
    data = data.merge(
        publicaties[["DOI nummer", "Faculteit", "Onderzoeksinstituut", "Jaar uitgave"]],
        how="left",
        left_on="DOI",
        right_on="DOI nummer"
    )

    # The right-hand key duplicates DOI after the join.
    data = data.drop(columns=["DOI nummer"])

    return data


# NOTE: this rebinds `output` to the reshaped frame, so the cell is not
# idempotent - re-run the scoring and classification cells before running it
# again, otherwise already-transformed data is fed back in.
output = modify_for_tableau(output)

In [46]:
# Export how often each species occurs among the positively classified rows.
# NOTE(review): the target name differs only in capitalisation from the
# "species_mapping.xlsx" read by modify_for_tableau; on a case-insensitive
# filesystem both names refer to the same file - confirm this is intended.
(
    output.loc[output["result"] == True, "Species"]
    .value_counts()
    .to_excel('Species_mapping.xlsx')
)

In [70]:
# Load the exported classification workbook for inspection.
# NOTE: this reuses the name `df`, replacing the raw pipeline output that was
# loaded at the top of the notebook.
df = pd.read_excel("data/final_output/Animal_classification.xlsx")

In [71]:
# Peek at the first row to check the column layout of the export.
df.head(1)

Unnamed: 0,DOI,Score,Count,First_author,Last_author,Animals_Used_MesH,In_Vivo_MesH,Animals_Used_GPT,In_Vivo_GPT,Location_Radboud,Location,Apr_org_netherlands,Approving_organization,Species,Evaluation,Auteur,Faculteit,Onderzoeksinstituut,Jaar uitgave
0,10.1007/S13760-023-02443-3,0.350489,2,True,True,False,False,False,False,False,,False,,,False,Eerste en laatste auteur,FMW,DCMN,2024


In [75]:
# Total number of rows in the export (a DOI can occupy more than one row).
len(df)

42384

In [74]:
# Count distinct DOIs (nunique ignores missing values). Compare this with
# len(df): a smaller number here means some DOIs appear on several rows.
unique_count = df['DOI'].nunique()
print("Number of unique DOIs:", unique_count)

Number of unique DOIs: 34193
