In [1]:
import pandas as pd
from langchain_core.documents import Document

In [2]:
# Extracting and parsing sources (stored as a list of Document objects in strings)
sources = pd.read_excel("../results/sources.xlsx")
dimensions = {
"uses_uses_sources",
"uses_data_limits_sources",
"uses_ml_approach_sources",
"uses_represents_people_sources",
"uses_biases_sources",
"uses_privacy_sources",
"uses_sensitivity_sources",
"uses_maintenance_policies_sources",
"collection_explanation_sources",
"collection_team_sources",
"collection_labour_sources",
"collection_team_demographic_sources",
"collection_target_demographics_sources",
"collection_speakers_demographics_sources",
"collection_sources_sources",
"collection_infrastructure_sources",
"annotation_explanation_sources",
"annotation_team_demographi_sources",
"annotation_infrastructure_sources",
"annotation_validation_methods_sources",
}
source_sections = []
# Loading raw data
rawData = pd.read_excel("../results/FullStudyAnalysis.xlsx", sheet_name="Raw Data")
# Joining sources to raw data
mergedData = rawData.merge(sources, on="doi")

# Creating the dimensions subsets from our overall raw data and by journals

In [3]:
# Subset by jorunal
mergedData_SD = mergedData.query("journal_x == 'SData'")
mergedData_Dbrief = mergedData.query("journal_x == 'DBrief'")
# Is using an ML approach
subsetML = mergedData.query("uses_ml_approach_bool == 'Yes'")
subsetML_SD = subsetML.query("journal_x == 'SData'")
subsetML_Dbrief = subsetML.query("journal_x == 'DBrief'")
# Use data limits
subsetLimits = mergedData.query("uses_data_limits.str.contains('Yes,*')")
subsetLimits_SD = subsetLimits.query("journal_x == 'SData'")
subsetLimits_Dbrief = subsetLimits.query("journal_x == 'DBrief'")
# Social Concerns subset
strng = "<class 'str'>"
subsetPeople = mergedData.query("uses_biases_bool != @strng")
subsetSocial = mergedData.query("uses_biases_bool == 'Yes'")
subsetSocial_SD = subsetSocial.query("journal_x == 'SData'")
subsetSocial_Dbrief = subsetSocial.query("journal_x == 'DBrief'")
# Collection team demographics
subsetCollectionTeam = mergedData.query("collection_team_demographic.str.contains('Yes,*')")
subsetCollectionTeam_SD = subsetCollectionTeam.query("journal_x == 'SData'")
subsetCollectionTeam_Dbrief = subsetCollectionTeam.query("journal_x == 'DBrief'")
# Collection target demographics
subsetCollectionTargetDemographics = mergedData.query("collection_target_demographics.str.contains('Yes,*',na=False)")
subsetCollectionTargetDemographics_SD = subsetCollectionTargetDemographics.query("journal_x == 'SData'")
subsetCollectionTargetDemographics_Dbrief = subsetCollectionTargetDemographics.query("journal_x == 'DBrief'")
# Collection speakers demographics
subsetCollectionSpeakersDemographics = mergedData.query("collection_speakers_demographics.str.contains('Yes,*',na=False)")
subsetCollectionSpeakersDemographics_SD = subsetCollectionSpeakersDemographics.query("journal_x == 'SData'")
subsetCollectionSpeakersDemographics_Dbrief = subsetCollectionSpeakersDemographics.query("journal_x == 'DBrief'")
# Annotation profile
subsetAnntationTeam = mergedData.query("annotation_team_demograaphic.str.contains('Yes,*',na=False)")
subsetAnntationTeam_SD = subsetAnntationTeam.query("journal_x == 'SData'")
subsetAnntationTeam_Dbrief = subsetAnntationTeam.query("journal_x == 'DBrief'")
# Annotation infrastructure
subsetAnntationInfrastructure = mergedData.query("annotation_infrastructure.str.contains('Yes,*',na=False)")
subsetAnntationInfrastructure_SD = subsetAnntationInfrastructure.query("journal_x == 'SData'")
subsetAnntationInfrastructure_Dbrief = subsetAnntationInfrastructure.query("journal_x == 'DBrief'")
# Annotation validation
subsetAnntationValidation = mergedData.query("annotation_validation_methods.str.contains('Yes,*',na=False)")
subsetAnntationValidation_SD = subsetAnntationValidation.query("journal_x == 'SData'")
subsetAnntationValidation_Dbrief = subsetAnntationValidation.query("journal_x == 'DBrief'")

# Extracting the sections of each chunk

In [4]:
def extract_sections(dataPapersList, dimension):
    sections = []
    for idx, papers in dataPapersList.iterrows():
        if  isinstance(papers[dimension],str):
            documents = eval(papers[dimension])
            for document in documents:
                    if dimension == "uses_uses_sources":
                        sections.insert(0,document[0].metadata['source'])
                    else:
                        sections.insert(0,document.metadata['source'])
    return sections

## Section extraction; overall and by SD and Dbrief

In [28]:
# Uses
sectionsUses = extract_sections(mergedData,"uses_uses_sources")
sectionsUses_SD = extract_sections(mergedData_SD,"uses_uses_sources")
sectionsUses_DBrief = extract_sections(mergedData_Dbrief,"uses_uses_sources")
# MLapproach
sectionsMLapproach = extract_sections(subsetML,"uses_ml_approach_sources")
sectionsMLapproach_SD = extract_sections(subsetML_SD,"uses_ml_approach_sources")
sectionsMLapproach_DBrief = extract_sections(subsetML_Dbrief,"uses_ml_approach_sources")
# Limits
sectionsLimits = extract_sections(subsetLimits,"uses_data_limits_sources")
sectionsLimits_SD = extract_sections(subsetLimits_SD,"uses_data_limits_sources")
sectionsLimits_Dbrief = extract_sections(subsetLimits_Dbrief,"uses_data_limits_sources")
# Social
sectionsSocial = extract_sections(subsetSocial,"uses_biases_sources")
sectionsSocial_SD = extract_sections(subsetSocial_SD,"uses_biases_sources")
sectionsSocial_DBrief = extract_sections(subsetSocial_Dbrief,"uses_biases_sources")
# Collection
sectionsCollection = extract_sections(mergedData,"collection_explanation_sources")
sectionsCollection_SD = extract_sections(mergedData_SD,"collection_explanation_sources")
sectionsCollection_Dbrief = extract_sections(mergedData_Dbrief,"collection_explanation_sources")
# Col Team
sectionsColTeam = extract_sections(subsetCollectionTeam,"collection_team_demographic_sources")
sectionsColTeam_SD = extract_sections(subsetCollectionTeam_SD,"collection_team_demographic_sources")
sectionsColTeam_Dbrief = extract_sections(subsetCollectionTeam_Dbrief,"collection_team_demographic_sources")
# Col target
sectionsColTarget = extract_sections(subsetCollectionTargetDemographics,"collection_target_demographics_sources")
sectionsColTarget_SD = extract_sections(subsetCollectionTargetDemographics_SD,"collection_target_demographics_sources")
sectionsColTarget_Dbrief = extract_sections(subsetCollectionTargetDemographics_Dbrief,"collection_target_demographics_sources")
# Col Sources
sectionsCollectionSources = extract_sections(mergedData,"collection_sources_sources")
sectionsCollectionSources_SD = extract_sections(mergedData_SD,"collection_sources_sources")
sectionsCollectionSources_Dbrief = extract_sections(mergedData_Dbrief,"collection_sources_sources")
# Annotation
sectionsAnnotation = extract_sections(mergedData,"annotation_explanation_sources")
sectionsAnnotation_SD = extract_sections(mergedData_SD,"annotation_explanation_sources")
sectionsAnnotation_Dbrief = extract_sections(mergedData_Dbrief,"annotation_explanation_sources")

# Annotation team
sectionsAnnTeam = extract_sections(subsetAnntationTeam,"annotation_team_demographi_sources")
sectionsAnnTeam_SD = extract_sections(subsetAnntationTeam_SD,"annotation_team_demographi_sources")
sectionsAnnTeam_Dbrief = extract_sections(subsetAnntationTeam_Dbrief,"annotation_team_demographi_sources")

# Annotation infrastructure
sectionsAnnInfr = extract_sections(subsetAnntationInfrastructure,"annotation_infrastructure_sources")
sectionsAnnInfr_SD = extract_sections(subsetAnntationInfrastructure_SD,"annotation_infrastructure_sources")
sectionsAnnInfr_Dbrief = extract_sections(subsetAnntationInfrastructure_Dbrief,"annotation_infrastructure_sources")

# Annotation Validation
sectionsAnnVal = extract_sections(subsetAnntationValidation,"annotation_validation_methods_sources")
sectionsAnnVal_SD = extract_sections(subsetAnntationValidation_SD,"annotation_validation_methods_sources")
sectionsAnnVal_Dbrief = extract_sections(subsetAnntationValidation_Dbrief,"annotation_validation_methods_sources")

# Classify by sections

In [25]:
import transformers

classifier = transformers.pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
sdataRequiredSections = [
"Title",
"Abstract",
"Background & Summary",
"Methods",
"Data Records",
"Technical Validation",
"Usage Notes",
"Code Availability",
"References",
"Author Contributions",
"Competing Interests"
]



In [26]:
def classifyLabels(sections):
    finalSections = []
    for idx , section in enumerate(sections):
        if section:
            resultClass  = classifier(section, sdataRequiredSections)
            if resultClass['scores'][0] > 0.4:
                print("changed")
                sections[idx] = resultClass['labels'][0]
            # save the result
    return finalSections

In [35]:
counts = pd.DataFrame(sectionsUses_SD).value_counts()
counts.to_excel("sectionCounts.xlsx", sheet_name="Uses")

# Uses
countsUses = pd.DataFrame(extract_sections(mergedData,"uses_uses_sources")).value_counts()
countsUses_SD = pd.DataFrame(extract_sections(mergedData_SD,"uses_uses_sources")).value_counts()
countsUses_DBrief = pd.DataFrame(extract_sections(mergedData_Dbrief,"uses_uses_sources")).value_counts()

# MLapproach
countsMLapproach = pd.DataFrame(extract_sections(subsetML,"uses_ml_approach_sources")).value_counts()
countsMLapproach_SD = pd.DataFrame(extract_sections(subsetML_SD,"uses_ml_approach_sources")).value_counts().to_excel("mlapproaches.xlsx")
countsMLapproach_Dbrief = pd.DataFrame(extract_sections(subsetML_Dbrief,"uses_ml_approach_sources")).value_counts()

# Limits
countsLimits = pd.DataFrame(extract_sections(subsetLimits,"uses_data_limits_sources")).value_counts()
countsLimits_SD = pd.DataFrame(extract_sections(subsetLimits_SD,"uses_data_limits_sources")).value_counts().to_excel("limits.xlsx")
countsLimits_Dbrief = pd.DataFrame(extract_sections(subsetLimits_Dbrief,"uses_data_limits_sources")).value_counts()

# Social
countsSocial = pd.DataFrame(extract_sections(subsetSocial,"uses_biases_sources")).value_counts()
countsSocial_SD = pd.DataFrame(extract_sections(subsetSocial_SD,"uses_biases_sources")).value_counts().to_excel("bias.xlsx")
countsSocial_Dbrief = pd.DataFrame(extract_sections(subsetSocial_Dbrief,"uses_biases_sources")).value_counts()

# Collection
countsCollection = pd.DataFrame(extract_sections(mergedData,"collection_explanation_sources")).value_counts()
countsCollection_SD = pd.DataFrame(extract_sections(mergedData_SD,"collection_explanation_sources")).value_counts().to_excel("collection.xlsx")
countsCollection_Dbrief = pd.DataFrame(extract_sections(mergedData_Dbrief,"collection_explanation_sources")).value_counts()
# Col Team
countsColTeam = pd.DataFrame(extract_sections(subsetCollectionTeam,"collection_team_demographic_sources")).value_counts()
countsColTeam_SD = pd.DataFrame(extract_sections(subsetCollectionTeam_SD,"collection_team_demographic_sources")).value_counts().to_excel("colteam.xlsx")
countsColTeam_Dbrief = pd.DataFrame(extract_sections(subsetCollectionTeam_Dbrief,"collection_team_demographic_sources")).value_counts()
# Col target
countsColTarget = pd.DataFrame(extract_sections(subsetCollectionTargetDemographics,"collection_target_demographics_sources")).value_counts()
countsColTarget_SD = pd.DataFrame(extract_sections(subsetCollectionTargetDemographics_SD,"collection_target_demographics_sources")).value_counts().to_excel("target.xlsx")
countsColTarget_Dbrief = pd.DataFrame(extract_sections(subsetCollectionTargetDemographics_Dbrief,"collection_target_demographics_sources")).value_counts()
# Col speakers
countsSpeakersTarget = pd.DataFrame(extract_sections(subsetCollectionSpeakersDemographics,"collection_speakers_demographics_sources")).value_counts()
countsSpeakersTarget_SD = pd.DataFrame(extract_sections(subsetCollectionSpeakersDemographics_SD,"collection_speakers_demographics_sources")).value_counts().to_excel("speakers.xlsx")
countsSpeakersTarget_Dbrief = pd.DataFrame(extract_sections(subsetCollectionSpeakersDemographics_Dbrief,"collection_speakers_demographics_sources")).value_counts()
# Col Sources
countsCollectionSources = pd.DataFrame(extract_sections(mergedData,"collection_sources_sources")).value_counts()
countsCollectionSources_SD = pd.DataFrame(extract_sections(mergedData_SD,"collection_sources_sources")).value_counts().to_excel("sources.xlsx")
countsCollectionSources_Dbrief = pd.DataFrame(extract_sections(mergedData_Dbrief,"collection_sources_sources")).value_counts()
# Annotation
countsAnnotation = pd.DataFrame(extract_sections(mergedData,"annotation_explanation_sources")).value_counts()
countsAnnotation_SD = pd.DataFrame(extract_sections(mergedData_SD,"annotation_explanation_sources")).value_counts().to_excel("annotation.xlsx")
countsAnnotation_Dbrief = pd.DataFrame(extract_sections(mergedData_Dbrief,"annotation_explanation_sources")).value_counts()

# Annotation team
countsAnnTeam = pd.DataFrame(extract_sections(subsetAnntationTeam,"annotation_team_demographi_sources")).value_counts()
countsAnnTeam_SD = pd.DataFrame(extract_sections(subsetAnntationTeam_SD,"annotation_team_demographi_sources")).value_counts().to_excel("anoteam.xlsx")
countsAnnTeam_Dbrief = pd.DataFrame(extract_sections(subsetAnntationTeam_Dbrief,"annotation_team_demographi_sources")).value_counts()

# Annotation infrastructure
countsAnnInfr =  pd.DataFrame(extract_sections(subsetAnntationInfrastructure,"annotation_infrastructure_sources")).value_counts()
countssectionsAnnInfr_SD =  pd.DataFrame(extract_sections(subsetAnntationInfrastructure_SD,"annotation_infrastructure_sources")).value_counts().to_excel("anoinfra.xlsx")
countsAnnInfr_Dbrief =  pd.DataFrame(extract_sections(subsetAnntationInfrastructure_Dbrief,"annotation_infrastructure_sources")).value_counts()

# Annotation Validation
countsAnnVal = pd.DataFrame(extract_sections(subsetAnntationValidation,"annotation_validation_methods_sources")).value_counts()
countsAnnVal_SD = pd.DataFrame(extract_sections(subsetAnntationValidation_SD,"annotation_validation_methods_sources")).value_counts().to_excel("anoVali.xlsx")
countsAnnVal_Dbrief = pd.DataFrame(extract_sections(subsetAnntationValidation_Dbrief,"annotation_validation_methods_sources")).value_counts()

In [27]:
sectionMLApproach_SD_clean = classifyLabels(sectionsMLapproach_SD)
#sectionsMLapproach
#sectionsMLapproach_SD
#sectionsMLapproach_DBrief

changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed


KeyboardInterrupt: 

In [None]:
countsSDUsesClassified = pd.DataFrame(classifyLabels(extract_sections(mergedData,"uses_uses_sources"))).value_counts()
countsSDMLapproachClassified = pd.DataFrame(classifyLabels(extract_sections(subsetML,"uses_ml_approach_sources"))).value_counts()
countsSDLimitsClassified = pd.DataFrame(classifyLabels(extract_sections(subsetLimits,"uses_data_limits_sources"))).value_counts()
countsSDSocialClassified = pd.DataFrame(classifyLabels(extract_sections(subsetSocial,"uses_biases_sources"))).value_counts()
countsSDCollectionClassified = pd.DataFrame(classifyLabels(extract_sections(mergedData,"collection_explanation_sources"))).value_counts()
countsSDColTeamClassified = pd.DataFrame(classifyLabels(extract_sections(subsetCollectionTeam,"collection_team_demographic_sources"))).value_counts()
countsSDColTargetClassified = pd.DataFrame(classifyLabels(extract_sections(subsetCollectionTargetDemographics,"collection_target_demographics_sources"))).value_counts()
countsSDCollectionSourcesClassified = pd.DataFrame(classifyLabels(extract_sections(mergedData,"collection_sources_sources"))).value_counts()
countsSDAnnotationClassified = pd.DataFrame(classifyLabels(extract_sections(mergedData,"annotation_explanation_sources"))).value_counts()
countsSDAnnTeamClassified = pd.DataFrame(classifyLabels(extract_sections(subsetAnntationTeam,"annotation_team_demographi_sources"))).value_counts()
countsSDAnnInfrClassified = pd.DataFrame(classifyLabels(extract_sections(subsetAnntationInfrastructure,"annotation_infrastructure_sources"))).value_counts()
countsSDAnnValClassified = pd.DataFrame(classifyLabels(extract_sections(subsetAnntationValidation,"annotation_validation_methods_sources"))).value_counts()

OLD

In [21]:


def extractSources(dataPapers):
    for idx, source in dataPapers.iterrows():
        row = {"doi":source['doi']}
        if  isinstance(source[dimension],str):
            documents = eval(source[dimension])
            for document in documents:
                    if dimension == "uses_uses_sources":
                        sections.insert(0,document[0].metadata['source'])
                    else:
                        sections.insert(0,document.metadata['source'])
            row[dimension+"_sections"] = sections
        

In [29]:
pdSection['uses_uses_sources_sections'].value_counts()

uses_uses_sources_sections
[Data Records, Data Records, Data Records, Data Records]                                                                                                                                              6
[Methods, Methods, Methods, Methods]                                                                                                                                                                  5
[Methods, Methods, Methods, Code availability]                                                                                                                                                        5
[Usage Notes, Usage Notes, Usage Notes, Usage Notes]                                                                                                                                                  5
[Code availability, Methods, Data Records, Usage Notes]                                                                                                                      

In [34]:
sources_uses = []
for index, paper in pdSection.iterrows():
    for section in paper['uses_uses_sources_sections']:
        sources_uses.append(section)
pdSourcesUses = pd.DataFrame(sources_uses)
pdSourcesUses.value_counts()

In [37]:
counts = pd.DataFrame(pdSourcesUses.value_counts())

In [38]:
from keybert import KeyBERT
kw_model = KeyBERT()
keywords = kw_model.extract_keywords(doc)

In [None]:
source0 = unserialized_sources[0]