# Python

# This script can capture all target article IDs

In [None]:
# Retrieve fic ids from an AO3 search
# Will return in searched order
# Saves ids to a csv for later use e.g. to retrieve fic text

# Options:
# Only retrieve multichapter fics
# Modify search to include a list of tags
#      (e.g. you want all fics tagged either "romance" or "fluff")

from bs4 import BeautifulSoup
import re
import time
import requests
import csv
import sys
import datetime
import argparse
import os

# Module-level state shared by the functions below through `global` declarations.

# Set to True by get_ids() when a results page contains no work blurbs.
page_empty = False
# Search URL that add_tag_to_url() uses as the template for tag-specific URLs.
base_url = ""
# Results page currently being fetched; advanced by update_url_to_next_page().
url = ""
# How many ids the user asked for; get_args() stores -1 to mean "all".
num_requested_fic = 0
# How many ids have been written to the csv (incremented in write_ids_to_csv()).
num_recorded_fic = 0
# Output file name without the ".csv" extension.
csv_name = ""
# Starts as an empty string; get_args() replaces it with a bool.
multichap_only = ""
# Optional tags read from the tag csv by get_args().
tags = []

# keep track of all processed ids to avoid repeats:
# this is separate from the temporary batch of ids
# that are written to the csv and then forgotten
seen_ids = set()

# 
# Ask the user for:
# a url of a works listed page
# e.g. 
# https://archiveofourown.org/works?utf8=%E2%9C%93&work_search%5Bsort_column%5D=word_count&work_search%5Bother_tag_names%5D=&work_search%5Bquery%5D=&work_search%5Blanguage_id%5D=&work_search%5Bcomplete%5D=0&commit=Sort+and+Filter&tag_id=Harry+Potter+-+J*d*+K*d*+Rowling
# https://archiveofourown.org/tags/Harry%20Potter%20-%20J*d*%20K*d*%20Rowling/works?commit=Sort+and+Filter&page=2&utf8=%E2%9C%93&work_search%5Bcomplete%5D=0&work_search%5Blanguage_id%5D=&work_search%5Bother_tag_names%5D=&work_search%5Bquery%5D=&work_search%5Bsort_column%5D=word_count
# how many fics they want
# what to call the output csv
# 
# If you would like to add additional search terms (each fic must contain at least one of them, but not necessarily all),
# specify these in the tag csv, one per row. 

def get_args():
    """Parse the command line and store the settings in the module globals.

    Globals written: ``url`` and ``base_url`` (both set to the URL argument),
    ``csv_name``, ``num_requested_fic`` (-1 when every fic is wanted),
    ``multichap_only`` (bool) and ``tags`` (extended with the first column of
    each non-empty row of the tag csv).

    Returns:
        The ``--header`` value as a string; callers send it as the user-agent.

    Raises:
        ValueError: if ``--num_to_retrieve`` is neither 'a' nor an integer.
        OSError: if the tag csv cannot be opened.
    """
    global base_url
    global url
    global csv_name
    global num_requested_fic
    global multichap_only
    global tags

    parser = argparse.ArgumentParser(description='Scrape AO3 work IDs given a search URL')
    parser.add_argument(
        'url', metavar='URL',
        help='a single URL pointing to an AO3 search page')
    parser.add_argument(
        '--out_csv', default='work_ids',
        help='csv output file name')
    parser.add_argument(
        '--header', default='',
        help='user http header')
    parser.add_argument(
        '--num_to_retrieve', default='a', 
        help='how many fic ids you want')
    parser.add_argument(
        '--multichapter_only', default='', 
        help='only retrieve ids for multichapter fics')
    parser.add_argument(
        '--tag_csv', default='',
        help='provide an optional list of tags; the retrieved fics must have one or more such tags')

    args = parser.parse_args()
    url = args.url
    # Keep an untouched copy of the search URL: add_tag_to_url() rebuilds the
    # working url from base_url for every tag, so it must not stay empty.
    base_url = url
    csv_name = str(args.out_csv)

    # defaults to all ('a'), which is stored as -1
    requested = str(args.num_to_retrieve)
    num_requested_fic = -1 if requested == 'a' else int(requested)

    # any non-empty value switches the multichapter filter on
    multichap_only = str(args.multichapter_only) != ""

    tag_csv = str(args.tag_csv)
    if tag_csv:
        with open(tag_csv, "r", newline="") as tags_f:
            for row in csv.reader(tags_f):
                # blank lines parse to [] and would crash on row[0];
                # an empty first cell carries no tag either
                if row and row[0].strip():
                    tags.append(row[0])

    return str(args.header)

# 
# navigate to a works listed page,
# then extract all work ids
# 
def get_ids(header_info=''):
    """Fetch the page at the global ``url`` and return its unseen work ids.

    The request is repeated (after a 10 second pause) for as long as the
    server answers 429. Sets the global ``page_empty`` when the page lists no
    works, and adds every returned id to the global ``seen_ids``.

    Args:
        header_info: value sent as the ``user-agent`` request header.

    Returns:
        A list of id strings, in page order, that were not in ``seen_ids``.
        When ``multichap_only`` is set, works whose chapter field reads
        "1/1" (or that have no chapter field at all) are left out.
    """
    global page_empty
    global seen_ids

    # make the request. if we 429, try again later
    headers = {'user-agent': header_info}
    req = requests.get(url, headers=headers)
    while req.status_code == 429:
        # report before waiting so the pause is explained while it happens
        print("Request answered with Status-Code 429, retrying...")
        # >5 second delay between requests as per AO3's terms of service
        time.sleep(10)
        req = requests.get(url, headers=headers)

    soup = BeautifulSoup(req.text, "lxml")

    # some responsiveness in the "UI"
    sys.stdout.write('.')
    sys.stdout.flush()
    works = soup.select("li.work.blurb.group")
    # see if we've gone too far and run out of fic:
    if not works:
        page_empty = True

    # process list for new fic ids
    ids = []
    for work in works:
        if multichap_only:
            chaps = work.find('dd', class_="chapters")
            # find() returns None when the element is missing; such a blurb
            # cannot be confirmed as multichapter, so it is skipped
            if chaps is None or chaps.text == u"1/1":
                continue
        blurb_id = work.get('id')
        if not blurb_id:
            # no id attribute: nothing to record for this blurb
            continue
        # drop the first five characters of the element id (the prefix
        # before the numeric work id)
        work_id = blurb_id[5:]
        if work_id not in seen_ids:
            ids.append(work_id)
            seen_ids.add(work_id)
    return ids

# 
# update the url to move to the next page
# note that if you go too far, ao3 won't error, 
# but there will be no works listed
# 
def update_url_to_next_page():
    """Rewrite the global ``url`` so that it points at the following page.

    If the url already carries a ``page=`` marker, the number after its first
    occurrence is incremented in place. Otherwise the url is treated as page 1
    and ``page=2`` is appended, using ``&`` when a ``?`` is already present
    and ``?`` when it is not.
    """
    global url
    before, marker, after = url.partition("page=")

    # no page indicator at all, so we are on page 1
    if not marker:
        joiner = "&" if "?" in url else "?"
        url = url + joiner + "page=2"
        return

    # the page number runs up to the next "&" (or to the end of the url);
    # whatever follows is carried over untouched
    current, ampersand, remainder = after.partition("&")
    next_page = int(current) + 1
    url = before + marker + str(next_page) + ampersand + remainder


# modify the base_url to include the new tag, and save to global url
def add_tag_to_url(tag):
    """Build a tag-specific search url from ``base_url`` into the global ``url``.

    If ``base_url`` already contains the ``other_tag_names`` parameter, the
    tag (followed by an encoded comma, ``%2C``) is inserted in front of the
    existing value. Otherwise the parameter is appended to ``base_url``.

    Args:
        tag: the tag text to insert; it is used as given, without escaping.
    """
    global url
    key = "&work_search%5Bother_tag_names%5D="
    # str.find() returns -1 (which is truthy) when the key is absent, so the
    # result has to be compared explicitly instead of being used as a bool
    position = base_url.find(key)
    if position != -1:
        insert_at = position + len(key)
        url = base_url[:insert_at] + tag + "%2C" + base_url[insert_at:]
    else:
        # start the query string with "?" when base_url has none yet
        separator = "&" if "?" in base_url else "?"
        url = base_url + separator + key[1:] + tag


# 
# after every page, write the gathered ids
# to the csv, so a crash doesn't lose everything.
# include the url where it was found,
# so an interrupted search can be restarted
# 
def write_ids_to_csv(ids):
    """Append ``ids`` to ``<csv_name>.csv``, one ``id,url`` row per id.

    Writing stops as soon as not_finished() reports False, so no more than
    the requested number of rows is recorded. The global ``num_recorded_fic``
    is incremented once per row written.

    Args:
        ids: iterable of work ids gathered from the current page.
    """
    global num_recorded_fic
    target = csv_name + ".csv"
    with open(target, 'a', newline="") as handle:
        writer = csv.writer(handle, delimiter=',')
        for work_id in ids:
            if not not_finished():
                break
            # the page url is stored next to the id so that an interrupted
            # search can be restarted from where it was found
            writer.writerow([work_id, url])
            num_recorded_fic += 1

# 
# if you want everything, you're not done
# otherwise compare recorded against requested.
# recorded doesn't update until it's actually written to the csv.
# If you've gone too far and there are no more fic, end. 
# 
def not_finished():
    """Report whether more ids should still be collected.

    Returns:
        False once ``page_empty`` is set. Otherwise True when everything was
        requested (``num_requested_fic == -1``) or when fewer ids have been
        recorded than were requested; False in the remaining case.
    """
    wants_everything = num_requested_fic == -1
    return (not page_empty) and (wants_everything or num_recorded_fic < num_requested_fic)

# 
# include a text file with the starting url,
# and the number of requested fics
# 
def make_readme():
    """Write ``<csv_name>_readme.txt`` describing this run.

    The file holds three lines: the starting url, the number of requested
    fics and the current local timestamp. An existing file is overwritten.
    """
    readme_path = csv_name + "_readme.txt"
    with open(readme_path, "w") as text_file:
        summary = "\n".join([
            "url: " + url,
            "num_requested_fic: " + str(num_requested_fic),
            "retreived on: " + str(datetime.datetime.now()),
        ])
        text_file.write(summary)

# reset flags to run again
# note: do not reset seen_ids
def reset():
    """Clear the per-search flags so another search can run.

    ``page_empty`` goes back to False and ``num_recorded_fic`` to 0.
    ``seen_ids`` is deliberately left alone so ids are not repeated.
    """
    global page_empty, num_recorded_fic
    page_empty, num_recorded_fic = False, 0

def process_for_ids(header_info=''):
    """Walk the result pages until not_finished() reports False.

    Each round pauses, fetches the ids on the current page, appends them to
    the csv and then moves the url on to the next page.

    Args:
        header_info: passed through to get_ids() as the user-agent value.
    """
    while True:
        if not not_finished():
            break
        # 5 second delay between requests as per AO3's terms of service
        time.sleep(5)
        page_ids = get_ids(header_info)
        write_ids_to_csv(page_ids)
        update_url_to_next_page()

def load_existing_ids():
    """Seed ``seen_ids`` with the ids already stored in ``<csv_name>.csv``.

    The first column of every non-empty row is added to the global
    ``seen_ids``. When the csv does not exist yet, only a notice is printed.
    """
    global seen_ids

    existing_csv = csv_name + ".csv"
    if not os.path.exists(existing_csv):
        print("no existing file; creating new file...\n")
        return

    print("skipping existing IDs...\n")
    with open(existing_csv, 'r', newline="") as csvfile:
        for row in csv.reader(csvfile):
            # a blank line parses to an empty list; row[0] would raise
            if row:
                seen_ids.add(row[0])

def main():
    """Run the scrape: read arguments, write the readme, then collect ids.

    Ids already present in the output csv are loaded first so they are not
    recorded again. When tags were supplied, one search is run per tag (the
    per-search flags are reset before each); otherwise the url is searched
    once as given.
    """
    header_info = get_args()
    make_readme()

    print ("loading existing file ...\n")
    load_existing_ids()

    print("processing...\n")

    if tags:
        for t in tags:
            print ("Getting tag: ", t)
            reset()
            add_tag_to_url(t)
            process_for_ids(header_info)
    else:
        process_for_ids(header_info)

    print("That's all, folks.")


# Only start scraping when executed directly, not when the code is imported.
if __name__ == "__main__":
    main()


# Create an article kudos link by grabbing the id. For example, if the id is 10795731, the kudos link is https://archiveofourown.org/works/10795731?view_full_work=true#kudos

In [1]:
import requests
import time
import sys
from bs4 import BeautifulSoup

# Set to True by get_kudos_links() when a results page lists no works.
page_empty = False
# Not read or modified by get_kudos_links() in this cell.
seen_ids = set()

def get_kudos_links(url, header_info='', multichap_only=False):
    """Fetch one works listing page and build a kudos link for each work on it.

    The request is repeated after a 10 second pause for as long as the server
    answers 429. The global ``page_empty`` is set when no works are listed.

    Args:
        url: address of the works listing page to fetch.
        header_info: value sent as the ``user-agent`` request header.
        multichap_only: accepted but currently not used by this function.

    Returns:
        A list of strings of the form
        ``https://archiveofourown.org/works/<id>?view_full_work=true#kudos``,
        in page order.
    """
    global page_empty
    global seen_ids

    # Send request, if received 429, then retry
    request_headers = {'user-agent': header_info}
    response = requests.get(url, headers=request_headers)
    while response.status_code == 429:
        # According to AO3's terms of service, there should be at least 5 seconds between requests
        time.sleep(10)
        response = requests.get(url, headers=request_headers)
        print("Request answered with Status-Code 429, retrying...")

    page = BeautifulSoup(response.text, "lxml")

    # Some UI prompts
    sys.stdout.write('.')
    sys.stdout.flush()
    blurbs = page.select("li.work.blurb.group")
    # Check if there are no works
    if not blurbs:
        page_empty = True

    # The work id is the element id with its first five characters removed
    work_ids = (blurb.get('id')[5:] for blurb in blurbs)
    return [
        f"https://archiveofourown.org/works/{work_id}?view_full_work=true#kudos"
        for work_id in work_ids
    ]

# Call the function to get kudos links
# Call the function to get kudos links.
# The search URL below sorts by kudos_count and filters on the BTS tag_id.
# get_kudos_links() is called once, so only this single results page is read.
url = "https://archiveofourown.org/works?work_search%5Bsort_column%5D=kudos_count&work_search%5Bother_tag_names%5D=&work_search%5Bexcluded_tag_names%5D=&work_search%5Bcrossover%5D=&work_search%5Bcomplete%5D=&work_search%5Bwords_from%5D=&work_search%5Bwords_to%5D=&work_search%5Bdate_from%5D=&work_search%5Bdate_to%5D=&work_search%5Bquery%5D=&work_search%5Blanguage_id%5D=&commit=Sort+and+Filter&tag_id=방탄소년단+%7C+Bangtan+Boys+%7C+BTS"
kudos_links = get_kudos_links(url)

# Print the obtained kudos links, one per line
print("Kudos Links:")
for link in kudos_links:
    print(link)


.Kudos Links:
https://archiveofourown.org/works/10795731?view_full_work=true#kudos
https://archiveofourown.org/works/5293532?view_full_work=true#kudos
https://archiveofourown.org/works/11193654?view_full_work=true#kudos
https://archiveofourown.org/works/9500051?view_full_work=true#kudos
https://archiveofourown.org/works/8416441?view_full_work=true#kudos
https://archiveofourown.org/works/10282934?view_full_work=true#kudos
https://archiveofourown.org/works/17857460?view_full_work=true#kudos
https://archiveofourown.org/works/14707098?view_full_work=true#kudos
https://archiveofourown.org/works/3476741?view_full_work=true#kudos
https://archiveofourown.org/works/30222195?view_full_work=true#kudos
https://archiveofourown.org/works/22992613?view_full_work=true#kudos
https://archiveofourown.org/works/27754993?view_full_work=true#kudos
https://archiveofourown.org/works/12930393?view_full_work=true#kudos
https://archiveofourown.org/works/11286888?view_full_work=true#kudos
https://archiveofourown.

# Open each kudos URL to get the names of the users who left kudos, so they can be saved to a CSV file

In [None]:
#1. increase the waiting time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from getpass import getpass
import time

def login():
    """Ask on the console for a username and a password.

    The password is read with getpass. Nothing is validated and no login
    request is made here; the entered values are simply handed back.

    Returns:
        A ``(username, password)`` tuple of the entered strings.
    """
    return input("Enter your username: "), getpass("Enter your password: ")

def accept_tos(driver):
    """Tick the terms-of-service checkbox and press the accept button.

    Each element is waited for (up to 10 seconds) until it is clickable and
    is then clicked: first ``tos_agree``, then ``accept_tos``.

    Args:
        driver: the WebDriver showing the page with the prompt.

    Returns:
        True when both clicks succeeded; False (after printing the error)
        when anything went wrong along the way.
    """
    try:
        # checkbox first, then the button
        for element_id in ('tos_agree', 'accept_tos'):
            is_clickable = EC.element_to_be_clickable((By.ID, element_id))
            WebDriverWait(driver, 10).until(is_clickable).click()
    except Exception as e:
        print("Error in accepting terms of service:", e)
        return False
    return True

def get_kudos_usernames(driver, url):
    """Open a work's kudos section and collect the names linked in it.

    The page is loaded, an adult-content prompt is confirmed when present,
    the terms of service are accepted, and the "more" link in the kudos
    section is clicked until it is gone. The names are printed one per line
    and also returned.

    Args:
        driver: a selenium WebDriver instance.
        url: address of the work page to open.

    Returns:
        A list with the last path segment of every link in the kudos
        section; an empty list when the terms of service could not be
        accepted or the kudos section is missing.
    """
    driver.get(url)
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'work')))

    # Check if there is an adult content warning
    adult_content_link = driver.find_elements(By.XPATH, '//a[text()="Yes, Continue"]')
    if adult_content_link:
        adult_content_link[0].click()
        # Wait for the page to load after clicking "Yes, Continue"
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'work')))

    if not accept_tos(driver):
        print("Failed to accept terms of service.")
        return []

    click_count = 0
    while True:
        # find_elements() returns an empty list instead of raising, so the
        # normal end of the loop is no longer reported as an error
        more_links = driver.find_elements(By.CSS_SELECTOR, 'a#kudos_more_link')
        if not more_links:
            break
        kudos_more_link = more_links[0]
        try:
            kudos_more_link.click()
            click_count += 1
            print(f"Clicked {click_count} times")
            # Wait for the new data to load
            time.sleep(5)  # Add a delay to ensure new data is loaded (not recommended, but sometimes necessary)
            WebDriverWait(driver, 10).until(EC.staleness_of(kudos_more_link))
        except Exception as e:
            # best effort: keep whatever has been loaded so far
            print("Error:", e)
            break

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    kudos_div = soup.find(id='kudos')
    if kudos_div is None:
        print("Kudos div not found.")
        return []

    # href=True skips anchors without a target (a['href'] would raise);
    # the expander link itself is not a user and is left out
    usernames = [
        a['href'].split('/')[-1]
        for a in kudos_div.find_all('a', href=True)
        if a.get('id') != 'kudos_more_link'
    ]
    print("Usernames:")
    for username in usernames:
        print(username)
    return usernames

# Login
# NOTE(review): the credentials are collected here but are not passed to the
# driver anywhere in this cell, so the page below is opened without logging in.
username, password = login()

# Provided link
url = "https://archiveofourown.org/works/45794530?view_full_work=true#kudos"

# Create a Safari WebDriver instance
driver = webdriver.Safari()

# Open the link, and always end the browser session afterwards so that a
# failed run does not leave a driver session behind
try:
    get_kudos_usernames(driver, url)
finally:
    driver.quit()


# SPARQL

In [3]:
pip install SPARQLWrapper

Note: you may need to restart the kernel to use updated packages.


In [4]:
## Load libraries ##
from SPARQLWrapper import SPARQLWrapper2, JSON, CSV
# Query client used by the following cells, pointed at the SPARQL endpoint.
sparql = SPARQLWrapper2("http://194.171.203.17:8890/sparql")

# Get all predicates in the database

In [6]:

# Ask for every distinct predicate that occurs in any triple.
sparql.setQuery("""
    PREFIX golem: <https://golemlab.eu/graph/>
    SELECT DISTINCT ?predicate WHERE {
      ?subject ?predicate ?object .
    }
""")

# Request the query results
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Print the results
if hasattr(results, "bindings"):
    print("Predicates:")
    for result in results.bindings:
        predicate = result["predicate"].value
        print(predicate)
else:
    print("No results found.")


Predicates:
http://www.w3.org/1999/02/22-rdf-syntax-ns#type
http://www.openlinksw.com/schemas/virtrdf#version
http://www.openlinksw.com/schemas/virtrdf#loadAs
http://www.openlinksw.com/schemas/virtrdf#item
http://www.openlinksw.com/schemas/virtrdf#isSpecialPredicate
http://www.openlinksw.com/schemas/virtrdf#isGcResistantType
http://www.openlinksw.com/schemas/virtrdf#qmfName
http://www.openlinksw.com/schemas/virtrdf#qmfShortTmpl
http://www.openlinksw.com/schemas/virtrdf#qmfLongTmpl
http://www.openlinksw.com/schemas/virtrdf#qmfSqlvalTmpl
http://www.openlinksw.com/schemas/virtrdf#qmfBoolTmpl
http://www.openlinksw.com/schemas/virtrdf#qmfIsrefOfShortTmpl
http://www.openlinksw.com/schemas/virtrdf#qmfIsuriOfShortTmpl
http://www.openlinksw.com/schemas/virtrdf#qmfIsblankOfShortTmpl
http://www.openlinksw.com/schemas/virtrdf#qmfIslitOfShortTmpl
http://www.openlinksw.com/schemas/virtrdf#qmf01uriOfShortTmpl
http://www.openlinksw.com/schemas/virtrdf#qmf01blankOfShortTmpl
http://www.openlinksw.com/sc

# Query how many articles there are in each language for "Bangtan Boys" and "BTS"

In [5]:
# Count, per fandom and language, the subjects that have both a golem:fandom
# and a golem:language value, restricted to the two fandom labels listed in
# VALUES; rows come back ordered by descending count.
sparql.setQuery("""
    PREFIX golem: <https://golemlab.eu/graph/>
    SELECT ?fandom ?language (COUNT(?language) AS ?languageCount) WHERE 
    {
    ?s golem:fandom ?fandom .
    ?s golem:language ?language .
    VALUES ?fandom { "Bangtan Boys" "BTS" }
    }
    GROUP BY ?fandom ?language
    ORDER BY DESC(?languageCount)
    """
            )
# Print one line per (fandom, language) row
for result in sparql.query().bindings:
    fandom = result['fandom'].value
    language = result['language'].value
    language_count = result['languageCount'].value
    print(f"Fandom: {fandom}, Language: {language}, Count: {language_count}")

Fandom: Bangtan Boys, Language: English, Count: 159947
Fandom: BTS, Language: English, Count: 159893
Fandom: Bangtan Boys, Language: 中文-普通话 國語, Count: 4449
Fandom: BTS, Language: 中文-普通话 國語, Count: 4433
Fandom: BTS, Language: Español, Count: 2118
Fandom: Bangtan Boys, Language: Español, Count: 2117
Fandom: BTS, Language: Русский, Count: 1259
Fandom: Bangtan Boys, Language: Русский, Count: 1259
Fandom: Bangtan Boys, Language: Bahasa Indonesia, Count: 1209
Fandom: BTS, Language: Bahasa Indonesia, Count: 1209
Fandom: Bangtan Boys, Language: Português brasileiro, Count: 787
Fandom: BTS, Language: Português brasileiro, Count: 787
Fandom: Bangtan Boys, Language: Français, Count: 433
Fandom: BTS, Language: Français, Count: 433
Fandom: Bangtan Boys, Language: Italiano, Count: 238
Fandom: BTS, Language: Italiano, Count: 238
Fandom: Bangtan Boys, Language: Tiếng Việt, Count: 135
Fandom: BTS, Language: Tiếng Việt, Count: 135
Fandom: Bangtan Boys, Language: Polski, Count: 131
Fandom: BTS, Language:

# Query the total number of kudos for "Bangtan Boys" "BTS"

In [9]:

# Imported here so this cell does not depend on a later cell having been run:
# the earlier setup cell only imports SPARQLWrapper2, not SPARQLWrapper.
from SPARQLWrapper import SPARQLWrapper, JSON

sparql = SPARQLWrapper("http://194.171.203.17:8890/sparql")

# Set SPARQL query: sum golem:numberOfKudos per fandom for the two labels
sparql.setQuery("""
    PREFIX golem: <https://golemlab.eu/graph/>

    SELECT ?fandom (SUM(?numberOfKudos) AS ?totalKudos)
    WHERE {
      ?s golem:fandom ?fandom .
      ?s golem:numberOfKudos ?numberOfKudos .
      VALUES ?fandom { "Bangtan Boys" "BTS" }
    }
    GROUP BY ?fandom
""")

# Set return format to JSON
sparql.setReturnFormat(JSON)

# Execute SPARQL query and parse results
results = sparql.query().convert()

# Print results
print("Kudos Counts:")
for result in results["results"]["bindings"]:
    fandom = result["fandom"]["value"]
    kudos_count = result["totalKudos"]["value"]
    print(f"Fandom: {fandom}, Kudos Count: {kudos_count}")

Kudos Counts:
Fandom: Bangtan Boys, Kudos Count: 44223010
Fandom: BTS, Kudos Count: 44218947


In [None]:
#Get all social relationships of "Bangtan Boys"

In [10]:
from SPARQLWrapper import SPARQLWrapper, JSON

# Create a client for the SPARQL endpoint
sparql = SPARQLWrapper("http://194.171.203.17:8890/sparql")

# Set the query string: every socialRelationships value of subjects whose
# fandom is "Bangtan Boys"
sparql.setQuery("""
    PREFIX : <https://golemlab.eu/graph/>
    SELECT ?socialRelationships
    WHERE {
      ?s :fandom "Bangtan Boys" .
      ?s :socialRelationships ?socialRelationships .
    }
""")

# Set the return format to JSON
sparql.setReturnFormat(JSON)

# Execute the query and fetch the results
results = sparql.query().convert()

# Print the results, one value per line
for result in results["results"]["bindings"]:
    print(result["socialRelationships"]["value"])


Kim Taehyung | V/Park Jimin
Hwang Hyunjin/Yang Jeongin | I.N Series: Hogwarts x K-pop blurbs [2]
Jeon Jungkook/Kim Taehyung | V, Jeon Jungkook/Kim Taehyung | V/Min Yoonji, Jeon Jungkook/Min Yoonji, Kim Namjoon | RM/Kim Seokjin | Jin, Kim Seokjin | Jin & Min Yoonji, Kim Taehyung | V/Min Yoonji, Min Yoonji & Jung Hoseok | J-Hope, Min Yoonji & Park Jimin
Jeon Jungkook/Kim Taehyung | V, Kim Namjoon | RM/Kim Seokjin | Jin Series: Love is a maze, but you is amaze! [1]
Jeon Jungkook/Jung Hoseok/Kim Namjoon/Kim Seokjin/Kim Taehyung/Min Yoongi/Park Jimin, Jeon Jungkook/Kim Taehyung | V, Jeon Jungkook/Kim Taehyung | V/Park Jimin, Jeon Jungkook/Park Jimin, Jung Hoseok | J-Hope/Kim Namjoon | RM, Jung Hoseok | J-Hope/Min Yoongi | Suga, Kim Namjoon | RM/Kim Seokjin | Jin, Kim Namjoon | RM/Min Yoongi | Suga, Kim Taehyung | V/Park Jimin, Min Yoongi | Suga/Park Jimin
Alcina Dimitrescu/Reader, Clay | Dream (Video Blogging RPF)/You, Clay | Dream/GeorgeNotFound (Video Blogging RPF)/You, GeorgeNotFound (Vi