In [1]:
import pandas as pd
import re
from datetime import datetime
import pickle
import numpy as np

import os 
import sys
from importlib import reload as irl
from parse_nominations import MyNominationHandler
from parse_ends import MyEndHandler


### Get links to Featured Article Archives
Here we take the htm file of the Featured Article Archive page and extract the links to the monthly archives. Subsequently we supply them to wiki.Specail:Export and extract the xml file. It's important  to export templates as well, otherwise the discussions are obmitted.  
* (https://en.wikipedia.org/wiki/Wikipedia:Featured_article_candidates/Featured_log)
* (https://en.wikipedia.org/wiki/Wikipedia:Featured_article_candidates/Archived_nominations)

In [2]:
def get_archives(filename, pattern):
    """ Retrives the links to the monthly FA-Archives"""
    
    # Extract links from Html document
    with open(filename) as file:
        links_to_archive = []
        for line in file:
            matcher = re.search(pattern, line)
            if matcher:
                links_to_archive.append(matcher.group(1))
                    
    
    # Remove links to archives from before 2005 and after 2016
    year_list = ['2003', '2004', '2017', '2018'] 
    links_to_archive = [link for year in year_list for link in links_to_archive if not year in link]
    return links_to_archive 

# sucessful nominations
pattern_featured = "(Wikipedia:Featured_article_candidates/Featured_log/.*?)\""
with open('./data/FA_archives.txt', 'w') as f:
    for archive in get_archives('data/FAArchive.htm', pattern_featured):
        f.write(archive +'\n')
        
#unsucessful nominations
pattern_candidate = "(Wikipedia:Featured_article_candidates/Archived_nominations/.*?)\""
with open('./data/FAC_archives.txt', 'w') as f:
    for archive in get_archives('data/FACArchive.htm', pattern_candidate)[1:]:
        f.write(archive +'\n')

#### Wirte the names of all articles to a file for checking later

In [3]:
FA_archives = './data/FA_all_archives.xml'
FA_articles = './data/FA_all_articles.txt'

def get_articles(in_file, out_file):
    with open(in_file, 'r') as f:
        articles = []
        for line in f:
            match =re.search("Wikipedia:Featured article candidates/(.*)/archive[\\d]", line)
            if match: #None is falsey in Python
                articles.append(match.group(1)) 
    with open(out_file, 'w') as file:
        for art in articles:
            file.write(art +'\n')

            
FAC_archives = './data/FAC_all_archives.xml'
FAC_articles = './log/FAC_all_articles.txt'
#get_articles(FA_archives, FA_articles)
get_articles(FAC_archives, FAC_articles)

### Getting the XML files
The next step is to go to (wiki/Special:Export)[https://en.wikipedia.org/wiki/Special:Export] and use the content of FA(C)_archives to export the current state, including templates of alle Featured Articles. The two resulting files were named FA.xml (for the sucessfull) and FAC.xml (for the unsucessfull ones). 

### Get the and time of nomination
MyNominationHandler is in charge of 
1. finding all candidates
2. find the point in time when they were nominated
3. find the last comment (will be used in handling double nominations)

It will produce an .csv file containing all nominations found in the .xml file and write these to ./data/ . A dictionary containing all problematic entries will be stored in ./tmp/ as a pickle object. 

In [5]:
#Parse Sucessful Nominations
nom_handler = MyNominationHandler('./data/FA_nomination.csv', './tmp/FA_prob.pkl')
nom_handler.parse('./data/FA.xml')


index 0 is out of bounds for axis 1 with size 0
Problematic entry in 5281
seems like no title was found.
index 0 is out of bounds for axis 1 with size 0
Problematic entry in 5458
seems like no title was found.
index 0 is out of bounds for axis 1 with size 0
Problematic entry in 6252
seems like no title was found.
index 0 is out of bounds for axis 1 with size 0
Problematic entry in 6824
seems like no title was found.
index 0 is out of bounds for axis 1 with size 0
Problematic entry in 6827
seems like no title was found.
index 0 is out of bounds for axis 1 with size 0
Problematic entry in 6832
seems like no title was found.
index 0 is out of bounds for axis 1 with size 0
Problematic entry in 6833
seems like no title was found.
index 0 is out of bounds for axis 1 with size 0
Problematic entry in 6834
seems like no title was found.
index 0 is out of bounds for axis 1 with size 0
Problematic entry in 6875
seems like no title was found.
index 0 is out of bounds for axis 1 with size 0
Problem

In [7]:
# Parse Unsucsessful Nominations
nom_handler = MyNominationHandler('./data/FAC_nomination.csv', './tmp/FAC_prob.pkl')
nom_handler.parse('./data/FAC.xml')


Problem parsing date in 669


In [8]:
ends_handler = MyEndHandler('./res/FACends.txt')
nom_handler.parse('./data/FAChistory.xml')


index 0 is out of bounds for axis 1 with size 0
Problematic entry in 20241
seems like no title was found.
index 0 is out of bounds for axis 1 with size 0
Problematic entry in 20417
seems like no title was found.
index 0 is out of bounds for axis 1 with size 0
Problematic entry in 20635
seems like no title was found.
index 0 is out of bounds for axis 1 with size 0
Problematic entry in 20637
seems like no title was found.
index 0 is out of bounds for axis 1 with size 0
Problematic entry in 20639
seems like no title was found.
index 0 is out of bounds for axis 1 with size 0
Problematic entry in 20640
seems like no title was found.
index 0 is out of bounds for axis 1 with size 0
Problematic entry in 20692
seems like no title was found.
index 0 is out of bounds for axis 1 with size 0
Problematic entry in 20697
seems like no title was found.
index 0 is out of bounds for axis 1 with size 0
Problematic entry in 20698
seems like no title was found.
index 0 is out of bounds for axis 1 with size 

In [5]:
FAC_nom = pd.read_csv('./data/FAC_nomination.csv', sep=';', index_col=0)
with open('./tmp/FAC_prob.pkl','rb') as file:

    FAC_prob = pickle.load(file)
    for i, art in enumerate(FAC_prob):
        #print(art)
        if i == 10: 
            break

df = pd.DataFrame(FAC_prob, columns=['idx', 'dates', 'text'])
df = df.set_index('idx')
# Table of Contents:
df = df.loc[~df.text.str.startswith('{{TOC'),:]
df = df.loc[~df.text.str.startswith('__FORCETOC__'),:]
df = df.loc[~df.text.str.contains('== ?[A-Za-z]{3,9} \d{4} ?=='),]


In [361]:
FA_nom = pd.read_csv('./data/FA_nomination.csv', sep =';', index_col=0)

In [6]:
df_nomination = pd.read_csv('./res/FAC_nomination.csv', sep=';', index_col=1)
df_nomination.head()
#df_nomination.loc[df_nomination.title.str.contains('Paul')]

Unnamed: 0_level_0,Unnamed: 0,title,nomination,last_comment
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
148,148,"[('John Holmes (U.S. politician)|John Holmes',...",2004-11-24 08:39:00,2005-02-03 04:17:00
149,149,"[('Neil Young',), ('Winamp',), ('Gold',), (""Ol...",2004-06-28 01:39:00,2004-11-27 21:38:00
150,150,"[('Formula One',), ('Coachella Valley',), ('No...",2004-09-24 17:45:00,2004-11-14 19:20:00


In [232]:
def my_replace(pattern, text):
    m = re.search(pattern, text)
    if m:
        return m.group(1)
    else:
        return text
    
with open('./res/nominations_problematic.pkl', 'rb') as f:
    prob = pickle.load(f)
    df = pd.DataFrame(prob, columns=['idx', 'dates', 'text'])
    df = df.set_index('idx',)
    



#pattern = "=== ?'{0,2}\[\[(.*)\]\] ?'{0,2}==="
pattern_1 = "=== ?'{0,2}\[\[(.*)\]\]"
pattern_2 = "=== {0,2}'{0,3}\[\[(.*)\]\] {0,2}?'{0,3}==="
pattern_3 = "\[{2}(.*?)\]{2}"

df.loc[df.text.isnull(), 'text'] = ''
df = df[~df.text.str.contains('#REDIRECT')]


df['text'] = df['text'].apply(lambda x: (my_replace(pattern_1, x)))
df['text'] = df['text'].apply(lambda x: (my_replace(pattern_2, x)))
# 304, 388 somehow part of the discussion were moved to a new xml element
# 3832 is an empy discussion
df.drop(axis=0, index=[304,388, 3832], inplace=True)
df['text'] = df['text'].apply(lambda x: (my_replace(pattern_3, x)))


#s = df.loc[, 'text']
df.loc[:]


Unnamed: 0_level_0,dates,text
idx,Unnamed: 1_level_1,Unnamed: 2_level_1
279,"[2005-06-17 14:44:00, 2005-06-17 15:37:00, 200...",W. Mark Felt
483,"[2005-10-23 05:12:00, 2005-10-23 05:29:00, 200...",Giuseppe Verdi
491,"[2005-10-10 15:50:00, 2005-10-10 15:50:00, 200...",Brain
602,"[2005-12-26 02:47:00, 2005-12-26 03:03:00, 200...",Frank Zappa
604,"[2005-12-26 02:50:00, 2005-12-26 03:03:00, 200...",Ralph Bakshi
638,"[2005-12-08 20:08:00, 2005-12-08 20:40:00, 200...",Michigan State Capitol
883,"[2006-04-14 15:58:00, 2006-04-14 19:23:00, 200...",double bass
989,"[2006-05-15 00:24:00, 2006-05-15 00:33:00, 200...",Pikachu
1532,"[2006-11-26 20:12:00, 2006-11-26 21:19:00, 200...",Ivan Alexander of Bulgaria
1620,"[2007-01-07 11:11:00, 2007-01-07 12:17:00, 200...",Economy of the People's Republic of China


In [231]:
format_dict = {
            "=== ?'{0,2}\[\[(.*)\]\] ?'{0,2}===": (None,),
            "([\d]{2}:[\d]{2}, [A-Za-z]{3} [\d]{1,2}, [\d]{4}) \(UTC\)": (datetime.strptime, '%H:%M, %b %d, %Y'),
            "([\d]{2}:[\d]{2}, [\d]{1,2} [A-Za-z]{3} [\d]{4}) \(UTC\)": (datetime.strptime, '%H:%M, %d %b %Y'),
            "([\d]{2}:[\d]{2}, [\d]{1,2} [A-Za-z]{4,9} [\d]{4}) \(UTC\)": (datetime.strptime, '%H:%M, %d %B %Y'),
            "([\d]{2}:[\d]{2}, [A-Za-z]{4,8} [\d]{1,2}, [\d]{4}) \(UTC\)": (datetime.strptime, '%H:%M, %B %d, %Y'),
            "([A-Z][a-z]{3,8} [\d]{1,2}, [\d]{4} [\d]{2}:[\d]{2}) \(UTC\)": (datetime.strptime, '%B %d, %Y %H:%M'),
        }

s= "is size. Mark1 02:26, 22 September 2005 (UTC) Where sho"
match_and_format(s, format_dict, False, False, False)


array([datetime.datetime(2005, 9, 22, 2, 26)], dtype=object)

In [230]:
df = pd.read_csv('./res/FAC_nomination.csv', sep = ';')


9