In [10]:
import pandas as pd
import requests
import json
import time
import random
import os
from semanticscholar import SemanticScholar

In [11]:
# Query paramaeters for Null class
subject_arts = ["baroque sculpture techniques", "surrealist cinema analysis", "postmodern architecture criticism"]

subject_economics = ["merger and acquisition trends", "theoretical econometrics", "game theory", "behavioral finance biases", "oligopoly"]

subject_vetsuisse = ["comparative anatomy of reptiles", "veterinary dental techniques", "genetics of feline diseases "]

subject_law = ["medieval legal systems", "space law", "intellectual property rights", "arbitration"]

subject_theology = ["mysticism in medieval Christianity", "comparative study of ancient religions", "philosophical theology"]

subject_economics_additional = [
    "history of board games and economic strategies", 
    "evolution of auction methods", 
    "cryptocurrencies and speculative bubbles",
    "cryptocurrencies",
    "auctions",
    "speculation"
]

subject_arts_additional = [
    "color theory in digital painting", 
    "evolution of concert piano design", 
    "history of animation techniques"
]

subject_vetsuisse_additional = [
    "pet grooming innovations", 
    "history of horse racing", 
    "wildlife photography and animal behavior",
    "animal behavior"
]

subject_law_additional = [
    "legal aspects of treasure hunting", 
    "copyright law in video games", 
    "history of legal costumes"
]

In [12]:
# To Do: Automate with topic modelling on sdg dataset

sdg_keywords = {
    "SDG1": ["poverty", "extreme poverty", "social protection", "economic rights", "minimum wage", "social safety nets", "income"],
    "SDG2": ["zero hunger", "food security", "malnutrition", "agricultural sustainability", "farming", "food deserts", "farming"],
    "SDG3": ["health", "well-being", "public health", "healthcare equity", "preventative care", "vaccination", "epidemics", "malnutrition"],
    "SDG4": ["education", "educational equity", "early childhood learning", "adult education", "digital literacy", "educational funding", "child labour"],
    "SDG5": ["gender equality", "women's rights", "female empowerment", "women empowerment", "domestic violence", "gender parity", "reproductive rights"],
    "SDG6": ["clean water", "water management", "sanitation", "hygiene", "water pollution", "drinking water access", "wastewater treatment"],
    "SDG7": ["sustainable energy", "renewable sources", "solar power", "wind energy", "energy policy", "energy conservation", "grid modernization", "energy transition"],
    "SDG8": ["economic development", "employment equality", "labor rights", "job creation", "sustainable economy", "workplace safety", "labour rights", "labour unions"],
    "SDG9": ["industrialization", "infrastructure development", "innovation research", "technology development", "sustainable industries", "manufacturing practices"],
    "SDG10": ["inequality reduction", "income disparities", "social injustice", "equitable growth", "discrimination", "minority rights"],
    "SDG11": ["urban sustainability", "smart cities", "sustainable communities", "urban planning", "public transport", "green spaces"],
    "SDG12": ["sustainable consumption", "waste management", "recycling policies", "environmental impact", "consumer behavior", "green products", "organic"],
    "SDG13": ["climate mitigation", "adaptation strategies", "emission reduction", "climate policies", "renewable energies", "environmental activism", "pollution", "environmental activism"],
    "SDG14": ["ocean conservation", "marine life", "aquatic ecosystems", "fisheries sustainability", "coral reefs", "ocean pollution", "ocean", "sea", "lake"],
    "SDG15": ["land ecosystems", "conservation strategies", "wildlife habitats", "mountain", "errosion", "erruption", "land degradation", "biodiversity preservation", "terrestrial", "earth", "dessert", "forest"],
    "SDG16": ["peaceful societies", "justice systems", "institutional integrity", "human rights", "rule of law", "anti-corruption", "corruption", "justice", "democracy", "leadership"],
    "SDG17": ["global partnership", "sustainable development cooperation", "aid effectiveness", "technology transfer", "trade agreements", "financial support", "aid"]
}

unique_sdg_keywords = list(set(keyword for keywords in sdg_keywords.values() for keyword in keywords))


In [13]:
queries = [value for key, value in globals().items() if key.startswith("subject_")]
all_topics = [item for sublist in queries for item in sublist]        

for topic in all_topics:
    
    test = [file for file in os.listdir() if file.endswith(f"{topic}.csv")]
    if len(test): 
        continue
    
    topic_rows = []
    sch = SemanticScholar()
    time.sleep(random.randrange(10))
    results = sch.search_paper(topic, fields=["title", "year", "paperId", "abstract", "fieldsOfStudy"], limit=100)
    print(topic)
    for paper in results:
        
        if not paper.abstract is None:
        
            paper_id = paper.paperId
            title = paper.title
            abstract = paper.abstract
            field_of_study = paper.fieldsOfStudy if paper.fieldsOfStudy is None else paper.fieldsOfStudy[0] if len(paper.fieldsOfStudy) == 1 else ";".join(paper.fieldsOfStudy)


            topic_rows.append({"id": paper_id, "title": title, "abstract": abstract, "field": field_of_study, "sdg_relation": 0})
    
    df_topic = pd.DataFrame(topic_rows)
    df_topic.to_csv(os.path.join(os.getcwd(), f"{topic}.csv"))
    
    print(f"csv file for {topic} dumped")
    

history of board games and economic strategies
csv file for history of board games and economic strategies dumped
evolution of auction methods
csv file for evolution of auction methods dumped
cryptocurrencies and speculative bubbles
csv file for cryptocurrencies and speculative bubbles dumped
cryptocurrencies
csv file for cryptocurrencies dumped
auctions
csv file for auctions dumped
speculation
csv file for speculation dumped
color theory in digital painting
csv file for color theory in digital painting dumped
evolution of concert piano design
csv file for evolution of concert piano design dumped
history of animation techniques
csv file for history of animation techniques dumped
pet grooming innovations
csv file for pet grooming innovations dumped
history of horse racing
csv file for history of horse racing dumped
wildlife photography and animal behavior
csv file for wildlife photography and animal behavior dumped
legal aspects of treasure hunting
csv file for legal aspects of treasure

In [15]:

data = []

for file in os.listdir():
    
    if file.endswith(".csv") and not "clean" in file and not "osdg" in file:
        print(file)
        temp = pd.read_csv(file)
        data.append(temp)
    

df_null = pd.concat(data)

oligopoly.csv
cryptocurrencies.csv
philosophical theology.csv
merger and acquisition trends.csv
game theory.csv
history of horse racing.csv
comparative anatomy of reptiles.csv
history of board games and economic strategies.csv
evolution of auction methods.csv
postmodern architecture criticism.csv
genetics of feline diseases .csv
behavioral finance biases.csv
arbitration.csv
intellectual property rights.csv
history of animation techniques.csv
surrealist cinema analysis.csv
comparative study of ancient religions.csv
speculation.csv
medieval legal systems.csv
evolution of concert piano design.csv
theoretical econometrics.csv
color theory in digital painting.csv
copyright law in video games.csv
auctions.csv
history of legal costumes.csv
null_labels.csv
cryptocurrencies and speculative bubbles.csv
veterinary dental techniques.csv
wildlife photography and animal behavior.csv
pet grooming innovations.csv
baroque sculpture techniques.csv
legal aspects of treasure hunting.csv
mysticism in medie

In [16]:
df_null.drop(columns=["Unnamed: 0"], axis=1, inplace=True)

In [17]:
df_null["sdg_label"] = [0]*len(df_null)

In [21]:
df_null.to_csv("null_labels.csv", index=False)

In [19]:
df_null.drop_duplicates(inplace=True)

In [20]:
df_null

Unnamed: 0,id,title,abstract,field,sdg_relation,sdg_label
0,39b6855dc2bbb55681735c77eee530fc7d51d58e,The Dynamics of Retail Oligopoly,This paper examines competition between retail...,Business,0,0
1,1f3be4fb83c211f507847cc100865bc065157b41,Multimarket Oligopoly: Strategic Substitutes a...,A firm's actions in one market can change comp...,Economics,0,0
2,35ec197aa5167954e940381f975e7d7819ae0c5e,Attention Oligopoly,We model digital platforms as attention broker...,,0,0
3,f1c5730444e271a6a9d85cf27c48f11b6fc1c7c7,Oligopoly Pricing: Old Ideas and New Tools,"The ""oligopoly problem""--the question of how p...",Economics,0,0
4,ecf273d208d27dabbaa33b823c860aaa2255ddae,The Oligopoly of Academic Publishers in the Di...,The consolidation of the scientific publishing...,Biology;Medicine,0,0
...,...,...,...,...,...,...
878,fbb79c5c61547cccb8598d10c86bce79009476cd,"[Thoracic imaging. ""Not with grand innovation,...",Les deux sujets majeurs abordes au cours de la...,Medicine,0,0
879,66646e6f9ab3c5db374dee9955e821560c637d05,A case study of the development of an IT learn...,Miiko Tan and Chen Ai Yen National Institute o...,Engineering,0,0
880,3e0149d0bab8113bab9fc0217c789b51cecb3fb1,On Old Practices and New Purposes in Education.,The dilemma of maintaining academic standards ...,Political Science,0,0
881,5351aebb0e523f73aab6ab95e6eddda81d20b8bd,It's Show Time,I followed the man around a large room. He was...,Engineering,0,0
