In [1]:
import json
import os
import random
import re
import time

import pandas as pd
import requests
from semanticscholar import SemanticScholar

In [2]:
# Search terms for the null class, grouped by discipline.
# Keep the `subject_` prefix on every list name: the scraping cell further down
# gathers its queries by looking up globals whose name starts with that prefix.
subject_mathematics = [
    "number theory",
    "vectorfields",
    "stochastic",
    "numeric",
    "geometry",
]

subject_biology = [
    "genome",
    "crispr",
    "cell therapy",
    "mrna",
    "synthetic biology",
    "proteomics",
]

subject_chemistry = [
    "spectroscopy",
    "organometallic",
    "photochemistry",
    "chromatography",
    "thermodynamics",
]

subject_astrophysics = [
    "astro",
    "nebular",
    "exoplanets",
    "cosmology",
    "dark matter",
    "stellar evolution",
]

subject_technology = [
    "complexity theory",
    "automata theory",
    "graph theory",
    "scientific visualization",
    "compiler design",
    "software testing",
    "edge computing",
]

subject_physics = [
    "nanotechnology",
    "quantum mechanics",
    "particle physics",
    "theoretical physics",
]

subject_social = [
    "psychoses",
    "mind",
    "cognitive anthropology",
    "experimental psychology",
    "phenomenology",
    "epistemology",
    "philosophy of language",
]

In [3]:
# TODO: automate keyword discovery with topic modelling on the SDG dataset.

# Hand-picked keywords per Sustainable Development Goal ("SDG1" .. "SDG17").
# A keyword may appear under more than one goal (e.g. "malnutrition"), but each
# goal lists a given keyword only once.
sdg_keywords = {
    "SDG1": ["poverty", "extreme poverty", "social protection", "economic rights", "minimum wage", "social safety nets", "income"],
    "SDG2": ["zero hunger", "food security", "malnutrition", "agricultural sustainability", "farming", "food deserts"],
    "SDG3": ["health", "well-being", "public health", "healthcare equity", "preventative care", "vaccination", "epidemics", "malnutrition"],
    "SDG4": ["education", "educational equity", "early childhood learning", "adult education", "digital literacy", "educational funding", "child labour"],
    "SDG5": ["gender equality", "women's rights", "female empowerment", "women empowerment", "domestic violence", "gender parity", "reproductive rights"],
    "SDG6": ["clean water", "water management", "sanitation", "hygiene", "water pollution", "drinking water access", "wastewater treatment"],
    "SDG7": ["sustainable energy", "renewable sources", "solar power", "wind energy", "energy policy", "energy conservation", "grid modernization", "energy transition"],
    "SDG8": ["economic development", "employment equality", "labor rights", "job creation", "sustainable economy", "workplace safety", "labour rights", "labour unions"],
    "SDG9": ["industrialization", "infrastructure development", "innovation research", "technology development", "sustainable industries", "manufacturing practices"],
    "SDG10": ["inequality reduction", "income disparities", "social injustice", "equitable growth", "discrimination", "minority rights"],
    "SDG11": ["urban sustainability", "smart cities", "sustainable communities", "urban planning", "public transport", "green spaces"],
    "SDG12": ["sustainable consumption", "waste management", "recycling policies", "environmental impact", "consumer behavior", "green products", "organic"],
    "SDG13": ["climate mitigation", "adaptation strategies", "emission reduction", "climate policies", "renewable energies", "environmental activism", "pollution"],
    "SDG14": ["ocean conservation", "marine life", "aquatic ecosystems", "fisheries sustainability", "coral reefs", "ocean pollution", "ocean", "sea", "lake"],
    # Spelling matters here: these strings are matched against abstracts, so a
    # misspelled keyword never hits (or, like "dessert", hits the wrong concept).
    "SDG15": ["land ecosystems", "conservation strategies", "wildlife habitats", "mountain", "erosion", "eruption", "land degradation", "biodiversity preservation", "terrestrial", "earth", "desert", "forest"],
    "SDG16": ["peaceful societies", "justice systems", "institutional integrity", "human rights", "rule of law", "anti-corruption", "corruption", "justice", "democracy", "leadership"],
    "SDG17": ["global partnership", "sustainable development cooperation", "aid effectiveness", "technology transfer", "trade agreements", "financial support", "aid"],
}

# Flat, de-duplicated keyword list across all goals. Sorted so the order is the
# same on every run (a bare `list(set(...))` is not).
unique_sdg_keywords = sorted({keyword for keywords in sdg_keywords.values() for keyword in keywords})


In [4]:
# Gather every `subject_*` keyword list defined in this notebook and flatten
# them into one list of search topics (in definition order).
queries = [value for key, value in globals().items() if key.startswith("subject_")]
all_topics = [item for sublist in queries for item in sublist]

# One case-insensitive, word-bounded pattern covering every SDG keyword, so that
# multi-word phrases such as "public health" can be found in an abstract.
# Longer keywords come first so that "ocean pollution" is counted once as a
# phrase instead of being cut short by the shorter alternative "ocean".
sdg_keyword_pattern = re.compile(
    r"\b(?:"
    + "|".join(
        re.escape(keyword)
        for keyword in sorted(unique_sdg_keywords, key=lambda kw: (-len(kw), kw))
    )
    + r")\b",
    re.IGNORECASE,
)

# A single client is enough for all topics.
sch = SemanticScholar()

for topic in all_topics:

    # Skip topics that already have a dump in the working directory, so an
    # interrupted run can be resumed without re-querying the API.
    csv_path = os.path.join(os.getcwd(), f"{topic}.csv")
    if os.path.exists(csv_path):
        continue

    # Random pause of 0-9 seconds between searches.
    time.sleep(random.randrange(10))
    results = sch.search_paper(topic, fields=["title", "year", "paperId", "abstract", "fieldsOfStudy"], limit=100)
    print(topic)

    topic_rows = []
    for paper in results:

        # Papers without an abstract are dropped.
        if paper.abstract is None:
            continue

        # Several fields of study are stored as one ";"-separated string;
        # a missing value stays None.
        fields_of_study = paper.fieldsOfStudy
        field_of_study = None if fields_of_study is None else ";".join(fields_of_study)

        # Number of SDG keyword occurrences in the abstract. (Looping over the
        # abstract string directly would walk single characters, which can
        # never equal a keyword.)
        sdg_hits = len(sdg_keyword_pattern.findall(paper.abstract))

        topic_rows.append({
            "id": paper.paperId,
            "title": paper.title,
            "abstract": paper.abstract,
            "field": field_of_study,
            "sdg_relation": sdg_hits,
        })

    # The row index is written as well; the loading step further down drops the
    # resulting "Unnamed: 0" column after reading the files back.
    df_topic = pd.DataFrame(topic_rows)
    df_topic.to_csv(csv_path)

    print(f"csv file for {topic} dumped")
    

stellar evolution
csv file for stellar evolution dumped
complexity theory
csv file for complexity theory dumped
automata theory
csv file for automata theory dumped
graph theory
csv file for graph theory dumped
scientific visualization
csv file for scientific visualization dumped
compiler design
csv file for compiler design dumped
software testing
csv file for software testing dumped
edge computing
csv file for edge computing dumped
nanotechnology
csv file for nanotechnology dumped
quantum mechanics
csv file for quantum mechanics dumped
particle physics
csv file for particle physics dumped
theoretical physics
csv file for theoretical physics dumped
psychoses
csv file for psychoses dumped
mind
csv file for mind dumped
cognitive anthropology
csv file for cognitive anthropology dumped
experimental psychology
csv file for experimental psychology dumped
phenomenology
csv file for phenomenology dumped
epistemology
csv file for epistemology dumped
philosophy of language
csv file for philosophy

In [5]:

# Name of the merged file this notebook writes at the very end. It lives in the
# same directory as the per-topic dumps, so it must not be read back in as
# input when the notebook is run again (that would duplicate every row).
merged_output = "null_labels.csv"

# Read every other CSV in the working directory. The listing is sorted because
# `os.listdir()` returns entries in arbitrary order.
data = [
    pd.read_csv(file)
    for file in sorted(os.listdir())
    if file.endswith(".csv") and file != merged_output
]

# `ignore_index=True` gives the combined frame one continuous index instead of
# repeating each file's own row labels.
df_null = pd.concat(data, ignore_index=True)

In [7]:
# Remove the "Unnamed: 0" column that appears when a CSV saved with its row
# index is read back. Reassigning (rather than `inplace=True`) together with
# `errors="ignore"` keeps this cell safe to re-run after the column is gone.
df_null = df_null.drop(columns=["Unnamed: 0"], errors="ignore")

In [8]:
# Every row collected here belongs to the null class, so the label is 0 throughout.
df_null["sdg_label"] = 0

In [9]:
# Persist the labelled frame; the row index is left out of the file.
null_labels_path = "null_labels.csv"
df_null.to_csv(null_labels_path, index=False)