In [4]:
import bs4
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
from datetime import datetime


# Helper Functions

In [5]:
def parse_name(name):
    """
    Parses a string into a user name in the user name and alternative name, contained in parentheses
    """
    m = re.search(r'\(.+\)', name)
    user_name = re.sub(r'\(.+\)', '', name).strip()
    alternative_name = ''
    if m:
        text = m.group(0)
        if 'nomination' not in text and '2nd' not in text and '3rd' not in text and '4th' not in text:
            alternative_name = re.sub("\(|\)", "", text).strip()
    return (user_name, alternative_name)

def parse_votes_from_items(nested_items):
    """
    Identify the header of a support, oppose, or neutral list from nested items, and calculate 
    the number of votes based on the lengths of the lists
    """
    support_list, oppose_list, neutral_list = None, None, None
    for ni in nested_items:
        text = ni.text.strip()
        if text == 'Support' or text == 'Support:':
            support_list = ni.findNext('ol')
        elif text == 'Oppose' or text == 'Oppose:':
            oppose_list = ni.findNext('ol')
        elif text == 'Neutral' or text == 'Neutral:' or text == 'Abstain' or text == 'Abstain:':
            neutral_list = ni.findNext('ol')
    for l in [support_list, oppose_list, neutral_list]:
        # color changes in some lists lead part of the list to be children of the original list
        if l:
            for match in l.findAll('font'):
                match.replaceWithChildren()
    #exclude nested lists; each item is a single vote                        
    yes = len(support_list.find_all('li', recursive=False)) if support_list else 0
    no = len(oppose_list.find_all('li', recursive=False)) if oppose_list else 0
    neutral = len(neutral_list.find_all('li', recursive=False)) if neutral_list else 0
    return (yes, no, neutral)

def find_votes(nested_soup):
    """
    Identify votes from archived election discussion pagei. Most pages seem to have already tallied
    up the votes in the format of (#yes/#no/#neutral) in the pages, which this function tries to identify first.
    If unavailable, the function looks for comments and tallies up comments as votes.
    """
    text = nested_soup.find(text=re.compile(r'.\(\d+/\d+/\d+\).'))
    if text:
        m = re.search(r'.\((\d+)/(\d+)/(\d+)\).', text)
        yes, no, neutral = m.groups()
    else:
        yes, no, neutral = 0, 0, 0
        for nested_items in [nested_soup.find_all('p'), nested_soup.find_all("span", {"class":'mw-headline'}), 
                            nested_soup.find_all("dt")]:
            yes, no, neutral = parse_votes_from_items(nested_items)
            if not (yes == 0 and no == 0 and neutral == 0):
                break
    return (yes, no, neutral)

def parse_date(time_str):
    """
    Used to convert dates into datetime objects
    """
    parsed = None
    time_str = time_str.replace('Novmber', 'November')
    for fmt in ['%d %B %Y', '%d %B %y', '%d%B%Y', '%d %B%Y', '%d %b %Y', '%B %Y', ]:
        try:
            parsed = datetime.strptime(time_str, fmt)
        except:
            continue
    return parsed

def parse_time(time_str):
    """
    Used to convert dates with actual timestamps into datetime objects
    """

    parsed = None
    time_str = re.sub('\(UTC\)', '', time_str)
    for fmt in ['%H:%M, %d %b %Y', '%H:%M, %b %d, %Y', '%H:%M, %Y %b %d', '%H:%M, %d %B %Y', '%H:%M, %B %d, %Y', '%H:%M, %Y %B %d']:
        try:
            parsed = datetime.strptime(time_str.strip(), fmt)
        except:
            continue
    return parsed
    

In [6]:
date_re = [r'\d+:\d+, \d+ [A-Za-z]+ \d+ \(UTC\)', r'\d+:\d+, [A-Za-z]+ \d+,? \d+ \(UTC\)']
def is_self_nominate(content, link, user_name):
    """
    Determines whether an individual is self-nominated by comparing username to signature and finding key words
    in text
    """
    # assuming user name is the first h3 title
    title = content.find('h3')
    nomination = ''
    self_nominate = False
    if title.span.text == user_name or re.sub(' ', '_', title.span.text) == user_name:
        for tag in title.next_siblings:
            # newlines
            if isinstance(tag, bs4.element.NavigableString):
                continue
            if tag.name == 'ul':
                for i in tag.find_all('li'):
                    i.replaceWithChildren()
                for i in tag.find_all('ul'):
                    i.decompose()
            if 'Questions for the candidate' in tag.text or 'Support' in tag.text:
                break
            if tag.find('a'):
                for c in tag.find_all('a'):
                    # matches on cases where the user no longer exists and thus href link does not match to a specific user
                    if (c.has_attr('title') and 'User:' in c['title']):
                        name = c['title'][c['title'].find(':')+1:]
                        name = re.sub('\(page does not exist\)', '', name)
                        # user_names are derived from the link which cannot contain spaces,
                        # titles can contain spaces, and thus we need to replace titles with underscores
                        # which are used in links
                        c.string = re.sub(' ', '_', name)
                    elif (c.has_attr('title') and 'User talk:' in c['title']):
                        name = c['title'][c['title'].find(':')+1:]
                        name = re.sub('\(page does not exist\)', '', name)
                        c.string = re.sub(' ', '_', name)
            text = tag.text
            if text:
                nomination += text
    # the current method still doesn't work because the tag of the href is added post time-stamp. Can we replace the href text with its title when iterating?
    chop = nomination.find('Candidate, please indicate acceptance')
    if chop >= 0:
        nomination = nomination[:chop]
    for date in date_re:
        # allowing for 12 random characters between user name and date to deem 
        m = re.search(user_name + '[^\n]{1,12}' + date + "$", nomination)
        if m:
            self_nominate = True
            break
    if not self_nominate:
        self_nom_indicators = ['self-nominat', 'self nominat', 'nominate myself', 'present myself', 'submit myself', 'self-nom']
        nomination = nomination.lower()
        for i in self_nom_indicators:
            if nomination.find(i) > -1:
                self_nominate = True
                break
    return self_nominate

def find_comments(l):
    """
    Given a list object, iterate through all items to extract comments, their associated dates, and the user
    who left the comment
    """
    comments = []
    for i in l.find_all('li', recursive=False):
        if i.contents is None or len(i.contents) == 0:
            continue
        comment = ''   
        date = ''
        name = ''
        for l in [i.find_all('b'), i.find_all('tt'), i.find_all('font'), i.find_all('strong')]:
            for match in l:
                match.replaceWithChildren()
        for element in i:
            if isinstance(element, bs4.element.NavigableString):
                for r in date_re:
                    m = re.search(r, element)
                    if m:
                        date = m.group()
                        element = re.sub(r, '', element)
                comment += element
            elif element.name == 'a':
                # matches on cases where the user no longer exists and thus href link does not match to a specific user
                if name == '' and (element.has_attr('title') and 'User:' in element['title']):
                    name = element['title'][element['title'].find(':')+1:]
                    name = re.sub('\(page does not exist\)', '', name)
                elif name == '' and (element.has_attr('title') and 'User talk:' in element['title']):
                    name = element['title'][element['title'].find(':')+1:]
                    name = re.sub('\(page does not exist\)', '', name)
                elif not ((element.has_attr('title') and 'User talk:' in element['title'])
                          or (element.has_attr('title') and 'User:' in element['title'])):
                    comment += element.text
        comments.append([date, name, comment.strip()])
    return comments

def extract_comments(content, link):
    """
    Finds vote lists in content and extracts each comment line by line by calling find_comments
    """
    all_comments = []
    user_name = link.split('/')[-1]
    user_name = re.sub(r'_[1-9]', '', user_name)
    for list_name in ['Support', 'Oppose', 'Neutral']:
        l = content.find('span', {'id':list_name})
        if not l:
            for elem in ['dl', 'dt', 'p', 'b']:
                l = content.find(elem, text=re.compile('\s*' + list_name + ':?\s*'))
                if l: break
        
        if l:
            found_list = l.find_next('ol')
            if found_list:
                comments = find_comments(found_list)
                for c in comments:
                    all_comments.append([list_name] + c)
#    data from 2003 and 2004 are malformed
    if len(all_comments) == 0:
        comments = []
        if content.find('div', {'id':'toc'}): content.find('div', {'id':'toc'}).decompose()
        for tag in content.find('div', {'class':'mw-parser-output'}).find_all('ul', recursive=False):
            # could either be at the comment level or the nomination level
            for i in tag.find_all('li', recursive=False):
                nested = i.find_all('ul', recursive=False)
                if nested:
                    for l in nested:
                        comments.extend(find_comments(l))
            comments.extend(find_comments(tag))
        for c in comments:
            t = ''
            text = c[-1].lower()
            if 'support' in text or 'yes' in text or 'yep' in text:
                t = 'Support'
            elif 'oppose' in text:
                t = 'Oppose'
            elif 'defer' in text or 'neutral' in text or 'ambivalent' in text:
                t = 'Neutral'
            # assuming a comment that we cannot parse is not a vote
            else:
                continue
            all_comments.append([t] + c)
    if len(all_comments) == 0:
        print('Found no comments for link {}'.format(link))
    self_nominate = is_self_nominate(content, link, user_name)            
    return (self_nominate, all_comments)

def process_row(item, parse_re):
    """
    Process a row of data for bureaucrat election pages only, excluding
    any rows that result in success from unsucessful election pages to avoid
    duplicating with successful election pages. Successful and unsuccessful election pages
    supplement their own regexes for parsing the rows.
    """
    text = item.text.strip()
    # the links to the original bureaucrats; not elected
    if ',' not in text:
        return None
    name, phrase = text.split(',')
    if '(' in name:
        name = name[:name.find('(')]
    m = re.match(parse_re, phrase.strip())
    if m:
        parsed_row = [text.strip() for text in m.groups()]
        if (parsed_row[1] == 'promoted' or parsed_row[1] == 'successful'):
            return None
        try:
            link = item.find('a', href=True)
            followable_link = link['href']
            if 'https' not in followable_link:
                followable_link = "https://en.wikipedia.org" + followable_link
            nested_history = requests.get(followable_link)
            nested_soup = BeautifulSoup(nested_history.content, 'html.parser')
        except Exception as e:
            print("Link {} errored out".format(link))
            print(e)
        self_nominate, comments = extract_comments(nested_soup, followable_link)
        return(([name] + parsed_row + [self_nominate], comments))

    else:
        print('Could not process {}'.format(text))
        return None
    

In [77]:
# link = "https://en.wikipedia.org/wiki/Wikipedia:Requests_for_adminship/Jwrosenzweig"
# soup = requests.get(link)
# content = BeautifulSoup(soup.content, 'html.parser')
# l = extract_comments(content, link)
# l

(False,
 [['Support',
   '',
   '',
   'Support. Jwrosenzweig would make a valuable addition to the group of admins if he accepts the responsibility. He gives and recieves advice graciously. All in all the model of what a good admin should be. -- Cimon Avaro on a pogostick'],
  ['Support',
   '11:47, 21 Aug 2003 (UTC)',
   'Fantasy',
   'Support.  and other of his actions/comments indicate that he would make a great addition to Wikipedia Adminship. And: I could not find nealy any thing he did without Summary. I like that\xa0;-) --']])

# Scraping Successful Elections

In [7]:
curr_year = 2021
parsed_rows = []
parsed_comments = []
for year in range(2003, curr_year+1):
    page = requests.get("https://en.wikipedia.org/wiki/Wikipedia:Successful_requests_for_adminship/"+str(year))
    soup = BeautifulSoup(page.content, 'html.parser')
    # assumes one table per page
    table = soup.find_all('table')[0]
    rows = table.find_all('tr')
    for row in rows:
        cells = row.find_all('td')
        if (len(cells) == 4):
            yes, no, neutral = None, None, None
            if len(cells[3].text) > 0:
                cells[3].find('span').decompose()
                yes, no, neutral = [int(i) for i in cells[3].text.strip('()').split('/')]
            user_name, alt_name = parse_name(cells[0].text)
            followable_link = cells[0].find('a')['href']
            if 'https' not in followable_link:
                followable_link = "https://en.wikipedia.org" + followable_link
            nested_history = requests.get(followable_link)
            nested_soup = BeautifulSoup(nested_history.content, 'html.parser')
            self_nominate, all_comments = extract_comments(nested_soup, followable_link)
            date = cells[1].text.strip()
            parsed_rows.append([user_name, date, alt_name, cells[2].text.strip(), yes, no, neutral, self_nominate])
            for c in all_comments:
                pc = [user_name, date] + c
                parsed_comments.append(pc)

Found no comments for link https://en.wikipedia.org/wiki/Wikipedia:Requests_for_adminship/Raul654
Found no comments for link https://en.wikipedia.org/wiki/Wikipedia:Requests_for_adminship/Viajero
Found no comments for link https://en.wikipedia.org/wiki/Wikipedia:Requests_for_adminship/Marumari
Found no comments for link https://en.wikipedia.org/wiki/Wikipedia:Requests_for_adminship/Smith03
Found no comments for link https://en.wikipedia.org/wiki/Wikipedia:Requests_for_adminship/Bdesham
Found no comments for link https://en.wikipedia.org/wiki/Wikipedia:Requests_for_adminship/Phil_Bordelon
Found no comments for link https://en.wikipedia.org/wiki/Wikipedia:Requests_for_adminship/Poor_Yorick
Found no comments for link https://en.wikipedia.org/wiki/Wikipedia:Requests_for_adminship/Cyp
Found no comments for link https://en.wikipedia.org/wiki/Wikipedia:Requests_for_adminship/Menchi
Found no comments for link https://en.wikipedia.org/wiki/Wikipedia:Requests_for_adminship/Cimon_avaro
Found no c

In [8]:
header = ['user_name', 'date', 'alternative_name', 'closed by', 'yes', 'no', 'neutral', 'self_nominate']
successful_elections = pd.DataFrame(parsed_rows, columns=header)
successful_elections['date'] = successful_elections['date'].apply(lambda x : parse_date(x))
successful_elections = successful_elections.sort_values(by=['date', 'user_name'])
successful_elections['num_nomination'] = successful_elections.groupby(['user_name']).cumcount()+1
successful_elections.to_csv('~/Documents/Stanford/Research/Network/processed_data/successful_elections_rfa.csv', index=False)

comment_header = ['user_name', 'date', 'type', 'comment_date', 'comment_name', 'comment']
successful_elections_comments = pd.DataFrame(parsed_comments, columns=comment_header)
successful_elections_comments['date'] = successful_elections_comments['date'].apply(lambda x : parse_date(x))
successful_elections_comments['comment_date'] = successful_elections_comments['comment_date'].apply(lambda x : parse_time(x))
successful_elections_comments = successful_elections_comments.sort_values(by=['date', 'user_name'])
successful_elections_comments.to_csv('~/Documents/Stanford/Research/Network/processed_data/successful_elections_rfa_comments.csv', index=False)

# Scraping Unsuccessful Elections

In [9]:
curr_year = 2021
parsed_rows = []
parsed_comments = []
# no closed by header as they aren't available for early unsuccessful RfA elections
# no unsuccessful election data available for 2003, table format available for 2008 forward
for year in range(2004, 2008):
    print("Processing data from {}".format(year))
    page = requests.get("https://en.wikipedia.org/wiki/Wikipedia:Unsuccessful_adminship_candidacies_(Chronological)/"+str(year))
    soup = BeautifulSoup(page.content, 'html.parser')
    content = soup.find('div', {'mw-parser-output'})
    lists = content.find_all('ul', recursive=False)
    for l in lists:
        items = l.find_all('li')
        # each row 
        for i in items:
            try:
                link=i.find('a', href=True)
                followable_link = link['href']
                if 'https' not in followable_link:
                    followable_link = "https://en.wikipedia.org" + followable_link
                nested_history = requests.get(followable_link)
            except Exception as e:
                print("Link {} errored out".format(link))
                print(e)
            nested_soup = BeautifulSoup(nested_history.content, 'html.parser')
            yes, no, neutral = find_votes(nested_soup)
            self_nominate, all_comments = extract_comments(nested_soup, followable_link)
            text = re.sub(', \((.+)\)', '', i.text)
            m = re.match(r"(.+)[ ,]+(\d+ \w+,? ?\d+) *[-, ]+ ?([\w ]+)", text)
            if m:
                parsed_row = [text.strip().replace(',', '') for text in m.groups()]
            else:
                print("Failed parsing row {}".format(i.text))
                continue
            user_name, alternative_name = parse_name(parsed_row[0])
            date = parsed_row[1].strip()
            # closed by not parsed
            parsed_rows.append([user_name, date, alternative_name, parsed_row[2].strip(), yes, no, neutral, self_nominate])
            for c in all_comments:
                parsed_comments.append([user_name, date] + c)

Processing data from 2004
Processing data from 2005


AttributeError: 'NoneType' object has no attribute 'find_all'

In [None]:
# data in tabulated format 2008 onwards 
for year in range(2008, curr_year+1):
    print("Processing data from {}".format(year))
    page = requests.get("https://en.wikipedia.org/wiki/Wikipedia:Unsuccessful_adminship_candidacies_(Chronological)/"+str(year))
    soup = BeautifulSoup(page.content, 'html.parser')
    # assumes one table per page
    table = soup.find_all('table')[0]
    rows = table.find_all('tr')
    for row in rows:
        cells = row.find_all('td')
        if (len(cells) == 5):
            yes, no, neutral = None, None, None
            if len(cells[4].text) > 0:
                cells[4].find('span').decompose()
                yes, no, neutral = [int(i) for i in cells[4].text.strip('()').split('/')]
            # from 2008 forward, parentheses contain the number of relection instead of alternative username
            user_name, _ = parse_name(cells[0].text)
            date = cells[1].text.strip()
            followable_link = cells[0].find('a')['href']
            if 'https' not in followable_link:
                followable_link = "https://en.wikipedia.org" + followable_link
            nested_history = requests.get(followable_link)
            nested_soup = BeautifulSoup(nested_history.content, 'html.parser')
            self_nominate, all_comments = extract_comments(nested_soup, followable_link)
            parsed_rows.append((user_name, date, '', cells[2].text.strip(), yes, no, neutral, self_nominate))
            for c in all_comments:
                parsed_comments.append([user_name, date] + c)

In [None]:
header = ['user_name', 'date', 'alternative_name', 'result', 'yes', 'no', 'neutral', 'self_nominate']
unsuccessful_elections = pd.DataFrame(parsed_rows, columns=header)
unsuccessful_elections['date'] = unsuccessful_elections['date'].apply(lambda x : parse_date(x))
unsuccessful_elections = unsuccessful_elections.sort_values(by=['date', 'user_name'])
unsuccessful_elections['num_nomination'] = unsuccessful_elections.groupby(['user_name']).cumcount()+1
unsuccessful_elections.to_csv('~/Documents/Stanford/Research/Network/processed_data/unsuccessful_elections_rfa.csv', index=False)


comment_header = ['user_name', 'date', 'type', 'comment_date', 'comment_name', 'comment']
unsuccessful_elections_comments = pd.DataFrame(parsed_comments, columns=comment_header)
unsuccessful_elections_comments['date'] = unsuccessful_elections_comments['date'].apply(lambda x : parse_date(x))
unsuccessful_elections_comments['comment_date'] = unsuccessful_elections_comments['comment_date'].apply(lambda x : parse_time(x))
unsuccessful_elections_comments = unsuccessful_elections_comments.sort_values(by=['date', 'user_name'])
unsuccessful_elections_comments.to_csv('~/Documents/Stanford/Research/Network/processed_data/unsuccessful_elections_rfa_comments.csv', index=False)

# Scraping Successful Bureaucrat Elections

In [None]:
link = "https://en.wikipedia.org/wiki/Wikipedia:Successful_bureaucratship_candidacies"
parsed_rows = []
parsed_comments = []
success_parse_re = r"closed (\d+ \w+ \d+) by (.+) at +\((\d+)/(\d+)/(\d+)\)"
page = requests.get(link)
soup = BeautifulSoup(page.content, 'html.parser')
content = soup.find('div', {'mw-parser-output'})
content.find('div', {'class':'navbox'}).decompose()
content.find('div', {'toc'}).decompose()
lists = content.find_all('ul')
for l in lists:
    items = l.find_all('li')
    for i in items:
        result = process_row(i, success_parse_re)
        if result:
            row, comments = result
            parsed_rows.append(row)
            for c in comments:
                parsed_comments.append([row[0], row[1]] + c)

In [None]:
header = ['user_name', 'date', 'closed by', 'yes', 'no', 'neutral', 'self_nominate']
successful_elections = pd.DataFrame(parsed_rows, columns=header)
successful_elections['date'] = successful_elections['date'].apply(lambda x : parse_date(x))
successful_elections = successful_elections.sort_values(by=['date', 'user_name'])
successful_elections['num_nomination'] = successful_elections.groupby(['user_name']).cumcount()+1
successful_elections.to_csv('~/Documents/Stanford/Research/Network/processed_data/successful_elections_rfb.csv', index=False)


comment_header = ['user_name', 'date', 'type', 'comment_date', 'comment_name', 'comment']
successful_elections_comments = pd.DataFrame(parsed_comments, columns=comment_header)
successful_elections_comments['comment_date'] = successful_elections_comments['comment_date'].apply(lambda x : parse_time(x))
successful_elections_comments['date'] = successful_elections_comments['date'].apply(lambda x : parse_date(x))
successful_elections_comments = successful_elections_comments.sort_values(by=['date', 'user_name'])
successful_elections_comments.to_csv('~/Documents/Stanford/Research/Network/processed_data/successful_elections_rfb_comments.csv', index=False)


# Scraping Unsuccessful Buraucrat Elections

In [None]:
link = "https://en.wikipedia.org/wiki/Wikipedia:Unsuccessful_bureaucratship_candidacies"
parsed_rows = []
parsed_comments = []
page = requests.get(link)
soup = BeautifulSoup(page.content, 'html.parser')
content = soup.find('div', {'mw-parser-output'})
content.find('div', {'class':'navbox'}).decompose()
content.find('div', {'toc'}).decompose()
lists = content.find_all('ul')
parse_re = r"(\d+ \w+ \d+) [–-] (.+) a?t? ?\((\d+)/(\d+)/(\d+)\)"
for l in lists:
    items = l.find_all('li', recursive=False)
    for i in items:
        nested_list = i.find('ul')
        if nested_list:
            nested_items = nested_list.find_all('li')
            for ni in nested_items:
                result = process_row(ni, parse_re)
                if result:
                    row, comments = result
                    parsed_rows.append(row)
                    for c in comments:
                        parsed_comments.append([row[0], row[1]] + c)
            i.find('ul').decompose()
        result = process_row(i, parse_re)
        if result:
            row, comments = result
            parsed_rows.append(row)
            for c in comments:
                parsed_comments.append([row[0], row[1]] + c)


In [None]:
header = ['user_name', 'date', 'closed by', 'yes', 'no', 'neutral', 'self_nominate']
unsuccessful_elections = pd.DataFrame(parsed_rows, columns=header)
unsuccessful_elections['date'] = unsuccessful_elections['date'].apply(lambda x : parse_date(x))
unsuccessful_elections = unsuccessful_elections.sort_values(by=['date', 'user_name'])
unsuccessful_elections['num_nomination'] = unsuccessful_elections.groupby(['user_name']).cumcount()+1
unsuccessful_elections.to_csv('~/Documents/Stanford/Research/Network/processed_data/unsuccessful_elections_rfb.csv', index=False)

comment_header = ['user_name', 'date', 'type', 'comment_date', 'comment_name', 'comment']
unsuccessful_elections_comments = pd.DataFrame(parsed_comments, columns=comment_header)
unsuccessful_elections_comments['comment_date'] = unsuccessful_elections_comments['comment_date'].apply(lambda x : parse_time(x))
unsuccessful_elections_comments['date'] = unsuccessful_elections_comments['date'].apply(lambda x : parse_date(x))
unsuccessful_elections_comments = unsuccessful_elections_comments.sort_values(by=['date', 'user_name'])
unsuccessful_elections_comments.to_csv('~/Documents/Stanford/Research/Network/processed_data/unsuccessful_elections_rfb_comments.csv', index=False)
