In [1]:
import pandas as pd
from langchain_core.documents import Document

In [2]:
# Extracting and parsing sources: each *_sources cell of the spreadsheet holds a
# list of Document objects serialised as a string.
sources = pd.read_excel("../results/sources.xlsx")
dimensions = {
    "uses_uses_sources",
    "uses_data_limits_sources",
    "uses_ml_approach_sources",
    "uses_represents_people_sources",
    "uses_biases_sources",
    "uses_privacy_sources",
    "uses_sensitivity_sources",
    "uses_maintenance_policies_sources",
    "collection_explanation_sources",
    "collection_team_sources",
    "collection_labour_sources",
    "collection_team_demographic_sources",
    "collection_target_demographics_sources",
    "collection_speakers_demographics_sources",
    "collection_sources_sources",
    "collection_infrastructure_sources",
    "annotation_explanation_sources",
    "annotation_team_demographi_sources",
    "annotation_infrastructure_sources",
    "annotation_validation_methods_sources",
}
source_sections = []

for idx, source in sources.iterrows():
    row = {"doi": source['doi']}
    # Iterate in sorted order: a set has no stable iteration order, which would
    # make the column order of pdSection differ from one kernel to the next.
    for dimension in sorted(dimensions):
        # Cells without sources are not strings (e.g. NaN) and are skipped, so
        # the row simply has no "<dimension>_sections" key for them.
        if isinstance(source[dimension], str):
            # NOTE(security): eval executes whatever text the spreadsheet cell
            # contains and needs `Document` in scope; only use on trusted files.
            documents = eval(source[dimension])
            if dimension == "uses_uses_sources":
                # Entries of this dimension are indexable, with the Document at
                # position 0 (presumably (Document, score) pairs - TODO confirm).
                sections = [document[0].metadata['source'] for document in documents]
            else:
                sections = [document.metadata['source'] for document in documents]
            # Keep the historical ordering: sections are stored last-document-first.
            sections.reverse()
            row[dimension+"_sections"] = sections
    source_sections.append(row)

# Build the frame once, after every row has been collected (it used to be
# rebuilt on each iteration and was never defined for an empty spreadsheet).
pdSection = pd.DataFrame(source_sections)

In [3]:
# Read the "Raw Data" sheet of the full-study workbook.
rawData = pd.read_excel(
    "../results/FullStudyAnalysis.xlsx",
    sheet_name="Raw Data",
)
# Attach the per-paper source columns to the raw data, matching rows on their DOI.
mergedData = pd.merge(rawData, sources, on="doi")

# Creating the dimensions subsets from our raw data.

In [6]:
# Each subset keeps the merged papers that answered "Yes" for one dimension.
# The pattern 'Yes,*' is a regex ("Yes" followed by zero or more commas), so it
# matches any value containing "Yes". na=False treats missing answers as
# "no match" instead of producing a non-boolean mask that makes query() fail.

# Is using an ML approach
subsetML = mergedData.query("uses_ml_approach_bool == 'Yes'")
# Use data limits
subsetLimits = mergedData.query("uses_data_limits.str.contains('Yes,*',na=False)")
# Social Concerns subset
# NOTE(review): this compares against the literal text "<class 'str'>" -
# presumably a placeholder written by an earlier processing step; confirm.
strng = "<class 'str'>"
subsetPeople = mergedData.query("uses_biases_bool != @strng")
subsetSocial = mergedData.query("uses_biases_bool == 'Yes'")
# Collection team demographics
subsetCollectionTeam = mergedData.query("collection_team_demographic.str.contains('Yes,*',na=False)")
# Collection target demographics
subsetCollectionTargetDemographics = mergedData.query("collection_target_demographics.str.contains('Yes,*',na=False)")
# Collection speakers demographics
subsetCollectionSpeakersDemographics = mergedData.query("collection_speakers_demographics.str.contains('Yes,*',na=False)")
# Annotation profile
# NOTE(review): the column is spelled "demograaphic" here - verify that this is
# the actual header in the "Raw Data" sheet.
subsetAnntationTeam = mergedData.query("annotation_team_demograaphic.str.contains('Yes,*',na=False)")
# Annotation infrastructure
subsetAnntationInfrastructure = mergedData.query("annotation_infrastructure.str.contains('Yes,*',na=False)")
# Annotation validation
subsetAnntationValidation = mergedData.query("annotation_validation_methods.str.contains('Yes,*',na=False)")

# Counting the sections

In [41]:
def extract_sections(dataPapersList, dimension):
    """Collect the section name of every source document stored in one column.

    Parameters
    ----------
    dataPapersList : pandas.DataFrame
        One row per paper; must contain the column named by ``dimension``.
    dimension : str
        Name of a ``*_sources`` column whose string cells hold a serialised
        list of documents. Non-string cells (e.g. NaN) are skipped.

    Returns
    -------
    list
        The ``metadata['source']`` value of every document, in reverse order
        of traversal (the last document of the last paper comes first).

    Raises
    ------
    KeyError
        If ``dimension`` is not a column of ``dataPapersList``.
    """
    # Entries of "uses_uses_sources" are indexable, with the document at
    # position 0 (presumably (Document, score) pairs - TODO confirm).
    is_wrapped = dimension == "uses_uses_sources"
    sections = []
    for cell in dataPapersList[dimension]:
        if not isinstance(cell, str):
            continue
        # NOTE(security): eval executes whatever text the cell contains and
        # needs the document class in scope; only use on trusted spreadsheets.
        for document in eval(cell):
            if is_wrapped:
                document = document[0]
            sections.append(document.metadata['source'])
    # Append + one reverse gives the same order as repeated insert(0, ...)
    # without shifting the whole list on every document.
    sections.reverse()
    return sections

In [50]:
def _count_sections(papers, sources_column):
    """Tally how often each section name occurs in `sources_column` of `papers`."""
    section_names = extract_sections(papers, sources_column)
    return pd.DataFrame(section_names).value_counts()


# Uses
countsUses = _count_sections(mergedData, "uses_uses_sources")
countsMLapproach = _count_sections(subsetML, "uses_ml_approach_sources")
countsLimits = _count_sections(subsetLimits, "uses_data_limits_sources")
countsSocial = _count_sections(subsetSocial, "uses_biases_sources")
# Collection
countsCollection = _count_sections(mergedData, "collection_explanation_sources")
countsColTeam = _count_sections(subsetCollectionTeam, "collection_team_demographic_sources")
countsColTarget = _count_sections(subsetCollectionTargetDemographics, "collection_target_demographics_sources")
countsCollectionSources = _count_sections(mergedData, "collection_sources_sources")
# Annotation
countsAnnotation = _count_sections(mergedData, "annotation_explanation_sources")
countsAnnTeam = _count_sections(subsetAnntationTeam, "annotation_team_demographi_sources")
countsAnnInfr = _count_sections(subsetAnntationInfrastructure, "annotation_infrastructure_sources")
countsAnnVal = _count_sections(subsetAnntationValidation, "annotation_validation_methods_sources")

# SData: Matching required sections

In [None]:
import transformers

# Zero-shot classifier; it is called with a section title and the label list below.
classifier = transformers.pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
)

# Candidate labels offered to the classifier.
sdataRequiredSections = [
    "Title", "Abstract", "Background & Summary", "Methods",
    "Data Records", "Technical Validation", "Usage Notes (optional)",
    "Code Availability", "References", "Author Contributions",
    "Competing Interests", "Other",
]

# Subset by journal
subsetSData = mergedData[mergedData["journal_x"] == "SData"]
subsetDBrief = mergedData[mergedData["journal_x"] == "DBrief"]

In [None]:
def classifyLabels(sections):
    """Return, for each entry of `sections`, the first label the classifier reports.

    Each section is classified on its own against `sdataRequiredSections`, and
    element 0 of the result's 'labels' list is kept. The output has one label
    per input section, in the same order.
    """
    return [
        classifier(section, sdataRequiredSections)['labels'][0]
        for section in sections
    ]

In [None]:
def _count_classified_sections(papers, sources_column):
    """Classify every section found in `sources_column` and tally the resulting labels."""
    labels = classifyLabels(extract_sections(papers, sources_column))
    return pd.DataFrame(labels).value_counts()


# Uses
countsSDUsesClassified = _count_classified_sections(mergedData, "uses_uses_sources")
countsSDMLapproachClassified = _count_classified_sections(subsetML, "uses_ml_approach_sources")
countsSDLimitsClassified = _count_classified_sections(subsetLimits, "uses_data_limits_sources")
countsSDSocialClassified = _count_classified_sections(subsetSocial, "uses_biases_sources")
# Collection
countsSDCollectionClassified = _count_classified_sections(mergedData, "collection_explanation_sources")
countsSDColTeamClassified = _count_classified_sections(subsetCollectionTeam, "collection_team_demographic_sources")
countsSDColTargetClassified = _count_classified_sections(subsetCollectionTargetDemographics, "collection_target_demographics_sources")
countsSDCollectionSourcesClassified = _count_classified_sections(mergedData, "collection_sources_sources")
# Annotation
countsSDAnnotationClassified = _count_classified_sections(mergedData, "annotation_explanation_sources")
countsSDAnnTeamClassified = _count_classified_sections(subsetAnntationTeam, "annotation_team_demographi_sources")
countsSDAnnInfrClassified = _count_classified_sections(subsetAnntationInfrastructure, "annotation_infrastructure_sources")
countsSDAnnValClassified = _count_classified_sections(subsetAnntationValidation, "annotation_validation_methods_sources")

OLD

In [21]:


def extractSources(dataPapers, dimension="uses_uses_sources"):
    """Build one record per paper with the section names of its source documents.

    Parameters
    ----------
    dataPapers : pandas.DataFrame
        One row per paper; must contain a ``doi`` column and the column named
        by ``dimension``.
    dimension : str, optional
        Name of the ``*_sources`` column to parse. It used to be read from a
        variable left over in the kernel; it is now an explicit argument.

    Returns
    -------
    list of dict
        ``{"doi": ...}`` for every paper, plus ``"<dimension>_sections"`` (the
        section names, last document first) when the cell is a string.
    """
    rows = []
    for idx, source in dataPapers.iterrows():
        row = {"doi": source['doi']}
        if isinstance(source[dimension], str):
            # NOTE(security): eval executes whatever text the cell contains and
            # needs the document class in scope; only use on trusted data.
            documents = eval(source[dimension])
            if dimension == "uses_uses_sources":
                # Entries are indexable, with the document at position 0
                # (presumably (Document, score) pairs - TODO confirm).
                documents = [document[0] for document in documents]
            # A fresh list per paper, reversed to match insert(0, ...) ordering.
            sections = [document.metadata['source'] for document in documents]
            sections.reverse()
            row[dimension+"_sections"] = sections
        rows.append(row)
    return rows
        

In [29]:
# How often each per-paper list of section names occurs for the "uses" dimension.
pdSection['uses_uses_sources_sections'].value_counts()

uses_uses_sources_sections
[Data Records, Data Records, Data Records, Data Records]                                                                                                                                              6
[Methods, Methods, Methods, Methods]                                                                                                                                                                  5
[Methods, Methods, Methods, Code availability]                                                                                                                                                        5
[Usage Notes, Usage Notes, Usage Notes, Usage Notes]                                                                                                                                                  5
[Code availability, Methods, Data Records, Usage Notes]                                                                                                                      

In [34]:
# Flatten the per-paper section lists into one entry per cited section.
# Papers whose "uses" sources were not parsed hold NaN in this column (a float,
# not a list); they are dropped first because iterating over NaN raises TypeError.
sources_uses = [
    section
    for paper_sections in pdSection['uses_uses_sources_sections'].dropna()
    for section in paper_sections
]
pdSourcesUses = pd.DataFrame(sources_uses)
pdSourcesUses.value_counts()

In [37]:
# Section frequencies for the "uses" dimension, kept as a DataFrame.
counts = pdSourcesUses.value_counts().to_frame()

In [38]:
# Keyword extraction with a default-constructed KeyBERT model.
# NOTE(review): `doc` is not defined anywhere in the visible notebook - this
# cell presumably relied on a variable from a deleted cell; confirm before re-running.
from keybert import KeyBERT
kw_model = KeyBERT()
keywords = kw_model.extract_keywords(doc)

In [None]:
# First element of `unserialized_sources`.
# NOTE(review): `unserialized_sources` is not defined anywhere in the visible
# notebook - presumably left over from an earlier session; confirm or remove.
source0 = unserialized_sources[0]