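Notebook for fetching the write-ups published on the Data Ethics Club (DEC) website (https://dataethicsclub.com/write_ups/write-ups.html) from the project's GitHub repository, extracting their metadata and content, and saving the results as JSON and CSV files.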

In [94]:
import requests

# GitHub API URL for the contents of the `site/write_ups` directory of the
# very-good-science/data-ethics-club repository; passed to get_md_files below.
api_url = 'https://api.github.com/repos/very-good-science/data-ethics-club/contents/site/write_ups'

def get_md_files(api_url, timeout=10):
    """
    Fetches the contents of the specified directory and returns a list of markdown files

    Parameters:
        api_url: URL of a GitHub contents-API directory listing.
        timeout: seconds to wait for the server before giving up (passed to
            requests.get). Defaults to 10.

    Returns:
        The listing entries (dicts) whose 'name' ends with '.md'. An empty list
        is returned, after printing a message, when the status code is not 200
        or the response body is not a JSON list.

    Raises:
        Whatever requests.get raises on connection problems or timeouts; these
        are not caught here.
    """
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(api_url, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to fetch directory contents: {response.status_code}")
        return []

    files = response.json()
    # A directory listing is a JSON list; anything else (e.g. a single object)
    # cannot be filtered by file name below.
    if not isinstance(files, list):
        print(f"Unexpected response for {api_url}: expected a directory listing")
        return []

    return [file for file in files if file['name'].endswith('.md')]

def read_md_file(file_url, timeout=10):
    """
    Reads the contents of the specified markdown file

    Parameters:
        file_url: URL from which the raw file can be downloaded.
        timeout: seconds to wait for the server before giving up (passed to
            requests.get). Defaults to 10.

    Returns:
        The response body as text, or an empty string (after printing a
        message) when the status code is not 200.

    Raises:
        Whatever requests.get raises on connection problems or timeouts; these
        are not caught here.
    """
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(file_url, timeout=timeout)
    if response.status_code == 200:
        return response.text

    print(f"Failed to fetch file content: {response.status_code}")
    return ""

md_files = get_md_files(api_url)
# get_md_files returns [] when the request fails, so only peek at the first
# entry's keys when there is one (indexing an empty list raises IndexError).
if md_files:
    print(md_files[0].keys())
print("Number of writeup files: ", len(md_files))

dict_keys(['name', 'path', 'sha', 'size', 'url', 'html_url', 'git_url', 'download_url', 'type', '_links'])
Number of writeup files:  69


In [95]:
from datetime import datetime

# Raw markdown text of every write-up, in the order returned by the listing
doc_list = []

for md_file in md_files:
    # Keep only writeup files (not standalone blog posts, which have a slightly different format)
    file_name = md_file['name']
    if 'writeup' not in file_name:
        continue
    # Part of the file name before "_writeup"; not used by the cells shown in this notebook
    file_date = file_name.split('_writeup')[0]

    file_url = md_file['download_url']
    content = read_md_file(file_url)
    # read_md_file returns "" when the download fails. An empty document would
    # later make file_processor fail with an IndexError, so skip it here.
    if not content:
        print(f"Skipping {file_name}: no content downloaded")
        continue
    doc_list.append(content)


In [96]:
import re

# Matches an inline markdown link or image such as "[text](url)" or
# "![alt](url)"; group 1 captures the visible text.
_MD_LINK_RE = re.compile(r"!?\[([^\]]*)\]\([^)]*\)")


def _strip_md_links(text):
    """Replace every inline markdown link "[text](url)" in text with just "text"."""
    return _MD_LINK_RE.sub(r"\1", text)


def _parse_front_matter(md_file):
    """Parse the "key: value" lines between the first two "---" markers into a dict."""
    front_matter = md_file.split("---")[1]
    parsed = {}
    for line in front_matter.splitlines():
        # partition splits on the first ": " only, so values may contain ": ".
        key, separator, value = line.partition(": ")
        if not separator:
            # Blank lines and lines without a "key: value" shape carry no metadata.
            continue
        # Tags are a comma-separated list; every other value stays a string.
        parsed[key] = value.split(", ") if key == "tags" else value
    return parsed


def _extract_title(md_file):
    """Return the text of the first line starting with "#", with link markup removed."""
    for line in md_file.splitlines():
        if line.startswith("#"):
            return _strip_md_links(line.lstrip("#")).strip()
    return ""


def file_processor(md_file):
    """
    Input a .md file as a string and return the metadata and content
    metadata includes if an item is a blogpost, the date, the author, the category and the tags
    Also get the title of the blogpost
    And the questions of the blogpost
    And finally the actual relevant content
    Stick it all in a dictionary

    Parameters:
        md_file: the full text of one markdown write-up.

    Returns:
        A dict holding, in this order: every "key: value" pair found between
        the first two "---" markers (values are strings, except "tags" which is
        split on ", " into a list), then
        "title"     - text of the first heading line, links reduced to their text
        "questions" - text of every heading line in the relevant section
        "content"   - the paragraphs of the relevant section, links reduced to
                      their text

    Raises:
        IndexError: if the text has no "---" marker, or fewer than two "```"
            fences (the relevant section is whatever follows the second fence).
    """
    metadata_dict = _parse_front_matter(md_file)
    metadata_dict["title"] = _extract_title(md_file)

    # Relevant lines are the ones with text, not the intro bit and not the list of attendees:
    # start after the second ``` fence, stop at the next "---" or at the info panel marker.
    relevant_lines = md_file.split("```")[2]
    relevant_lines = relevant_lines.split("---")[0]
    relevant_lines = relevant_lines.split("<!--Please don't edit the info panel below-->")[0]

    # Heading lines of the relevant section, without the leading "#"s and whitespace
    metadata_dict["questions"] = [
        line.lstrip("#").strip()
        for line in relevant_lines.split("\n")
        if line.startswith("#")
    ]

    # One entry per blank-line-separated paragraph. Links look like [link](url);
    # keep "link" and the text around it, drop only the url.
    metadata_dict["content"] = [
        _strip_md_links(paragraph.replace("\n", " ").lstrip("#"))
        for paragraph in relevant_lines.split("\n\n")
    ]
    return metadata_dict
    
# Process one downloaded document as a sample and show which keys file_processor
# returns; a later cell uses `example` when building the metadata DataFrame.
example = file_processor(doc_list[5])
print(example.keys())

dict_keys(['blogpost', 'date', 'author', 'category', 'tags', 'title', 'questions', 'content'])


In [97]:
import json
import os
# Make a pandas dataframe for all the writeups
# Columns: index number, title, date, author, category, tags, questions
import pandas as pd
# Make sure the output folder for the per-writeup JSON files exists
# (exist_ok=True makes this a no-op when the folder is already there).
os.makedirs('writeups', exist_ok=True)

# Process every writeup once: save it as JSON and keep the record for the DataFrame.
records = []
for index, doc in enumerate(doc_list, start=1):
    record = file_processor(doc)
    if index == 1:
        # The first writeup's title is set by hand - presumably because its
        # heading is not parsed correctly; TODO confirm.
        record['title'] = "bropenscience is broken science"
    records.append(record)

    # Save the record to a json file
    json_path = f"writeups/Writeup{index}.json"
    with open(json_path, "w") as f:
        json.dump(record, f)
    print(f"Saved {record['title']} to {json_path}")

# Build the frame once from all records instead of concatenating row by row;
# fall back to an empty frame with the example's columns when nothing was processed.
MetaDF = pd.DataFrame(records) if records else pd.DataFrame(columns=example.keys())

Saved bropenscience is broken science to 1.json
Saved  Dataism Is Our New God to 2.json
Saved  UK Statistics Authority: Identifying gaps, opportunities and priorities in the applied data ethics guidance landscape to 3.json
Saved  We created poverty. Algorithms won't make that go away to 4.json
Saved  Critical perspectives on Computer Vision to 5.json
Saved  'Living in the Hidden Realms of AI: The Workers Perspective' to 6.json
Saved The mathematics of crime and terrorism to 7.json
Saved  The Rise of Private Spies to 8.json
Saved  What an ancient lake in Nevada reveals about the future of tech to 9.json
Saved  “Participant” Perceptions of Twitter Research Ethics to 10.json
Saved  ESR: Ethics and Society Review of Artificial Intelligence Research to 11.json
Saved  Structural Injustice and Individual Responsibility to 12.json
Saved  Towards decolonising computational sciences  to 13.json
Saved  UK National AI Strategy: Pillar 3 - Governing AI Effectively to 14.json
Saved  Statistics, Euge

In [98]:
import re

def string_to_datetime(date_string):
    """
    Convert a string to a datetime object

    Accepts a day, a month name (full or three-letter abbreviation) and a
    four-digit year, written month-first ("Mar 7th, 2021", "March 17, 2021")
    or day-first ("7 March 2021"). Ordinal suffixes (st/nd/rd/th) after the
    day number and commas are ignored.

    Parameters:
        date_string: the date text to parse.

    Returns:
        A datetime at midnight of the given day.

    Raises:
        ValueError: if the text matches none of the supported layouts.
    """
    # Remove an ordinal suffix only when it directly follows a number, so that
    # no other part of the string is altered.
    cleaned = re.sub(r"(\d+)(?:st|nd|rd|th)\b", r"\1", date_string.strip(), flags=re.IGNORECASE)
    # Commas are optional separators; collapse everything to single spaces.
    cleaned = " ".join(cleaned.replace(",", " ").split())

    # %B is the full month name, %b the abbreviated one.
    for date_format in ("%B %d %Y", "%b %d %Y", "%d %B %Y", "%d %b %Y"):
        try:
            return datetime.strptime(cleaned, date_format)
        except ValueError:
            continue
    raise ValueError(f"Unrecognised date format: {date_string!r}")
string_to_datetime("Mar 7th, 2021")

datetime.datetime(2021, 3, 7, 0, 0)

In [99]:
# Drop the "content" column and add a parsed "DateTime" column next to the
# original "date" strings. Building a new frame (rather than dropping in place)
# with errors="ignore" keeps this cell safe to re-run: a second run no longer
# fails because "content" is already gone.
MetaDF = (
    MetaDF
    .drop(columns="content", errors="ignore")
    .assign(DateTime=lambda frame: frame["date"].apply(string_to_datetime))
)
MetaDF.head()

Unnamed: 0,blogpost,date,author,category,tags,title,questions,DateTime
0,True,"March 17, 2021",Natalie Zelenka,Write Up,[open science],bropenscience is broken science,"[We know the bro, It's bros all the way down, ...",2021-03-17
1,True,"March 31, 2021",Nina Di Cara,Write Up,"[philosophy, AGI]",Dataism Is Our New God,[],2021-03-31
2,True,"April 14, 2021","Nina Di Cara, Natalie Zelenka",Write Up,"[policy, oversight ]","UK Statistics Authority: Identifying gaps, op...","[General feedback, Notes on Annex B, Addional ...",2021-04-14
3,True,"April 28, 2021",Huw Day,Write Up,"[bias, prediction, structural injustice]",We created poverty. Algorithms won't make tha...,"[Summary, Intro, How deserving of help are we?...",2021-04-28
4,True,"May 12, 2021",Huw Day,Write Up,[standpoint theory],Critical perspectives on Computer Vision,"[The View from Nowhere, Better accounting for ...",2021-05-12


In [100]:
import os
# exist_ok=True creates the folder when needed and does nothing when it already exists
os.makedirs('documents', exist_ok=True)
# Write the metadata table (including its index column) to CSV
MetaDF.to_csv("documents/Writeups.csv")