In [1]:
import re
import pandas as pd
from tqdm import tqdm

In [2]:
tqdm.pandas()

  from pandas import Panel


In [3]:
wild_exp_journal = pd.read_csv('wild_examples_journal.csv')

In [4]:
wild_exp_journal.shape

(2080785, 2)

In [5]:
wild_exp_journal.head()

Unnamed: 0,citations,label_category
0,{{Citation | last = Buchanan | first = Patrick...,journal
1,{{Citation | last = Edwards | first = Adam | t...,journal
2,{{Citation | last = Stuart | first = Patience ...,journal
3,{{Citation | last = Vince | first = Alan | tit...,journal
4,{{Citation | last = | first = | author-link = ...,journal


In [6]:
def get_title(citation_text):
    title_res = re.findall('title\s{0,10}=\s{0,10}([^|]+)', citation_text) 
    if len(title_res) == 0:
        if 'sports-reference' in citation_text:
            return re.findall('(C|c)ite\s{0,10}sports-reference\s{0,10}|([^|]+)', citation_text)[1][1].strip()
        else:
            article_res = re.findall('article\s{0,10}=\s{0,10}([^|]+)', citation_text)
            if len(article_res) != 0:
                return article_res[0].strip()
            return None
    return title_res[0].strip()

wild_exp_journal['title'] = wild_exp_journal['citations'].progress_apply(lambda x: get_title(x))

100%|██████████| 2080785/2080785 [00:10<00:00, 200349.00it/s]


In [7]:
wild_exp_journal.head()

Unnamed: 0,citations,label_category,title
0,{{Citation | last = Buchanan | first = Patrick...,journal,"As Adelphia Goes, so Goes America?"
1,{{Citation | last = Edwards | first = Adam | t...,journal,Any Old Iron?
2,{{Citation | last = Stuart | first = Patience ...,journal,National Register of Historic Places Registrat...
3,{{Citation | last = Vince | first = Alan | tit...,journal,Obituary: Keith Watson
4,{{Citation | last = | first = | author-link = ...,journal,The ARIA Nominees


In [8]:
wild_exp_journal[~wild_exp_journal['title'].notnull()].shape

(5709, 3)

In [9]:
def get_author(citation_text):    
    def check_first_last_res(first_res, last_res):
        if len(first_res) != 0 and len(last_res) != 0:
            return first_res[0].strip() + ' ' + last_res[0].strip()
        if len(first_res) != 0:
            return first_res[0].strip()
        if len(last_res) != 0:
            return last_res[0].strip()
    
    ## https://en.wikipedia.org/wiki/Template:Citation -  only these keywords are available
    first_res = re.findall('first\s{0,10}=\s{0,10}([^|]+)', citation_text)
    last_res = re.findall('last\s{0,10}=\s{0,10}([^|]+)', citation_text)
    
    author_res = re.findall('author\s{0,10}=\s{0,10}([^|]+)', citation_text)
    author1_res = re.findall('author1\s{0,10}=\s{0,10}([^|]+)', citation_text)
    
    first1_res = re.findall('first1\s{0,10}=\s{0,10}([^|]+)', citation_text)
    last1_res = re.findall('last1\s{0,10}=\s{0,10}([^|]+)', citation_text)
    
    if first_res or last_res:
        return check_first_last_res(first_res, last_res)
    elif author_res:
        return author_res[0].strip()
    elif author1_res:
        return author1_res[0].strip()
    elif first1_res or last1_res:
        return check_first_last_res(first1_res, last1_res)
    else:
        return None

In [10]:
wild_exp_journal['first_author'] = wild_exp_journal['citations'].progress_apply(lambda x: get_author(x))

100%|██████████| 2080785/2080785 [00:29<00:00, 71345.61it/s]


In [11]:
wild_exp_journal.head()

Unnamed: 0,citations,label_category,title,first_author
0,{{Citation | last = Buchanan | first = Patrick...,journal,"As Adelphia Goes, so Goes America?",Patrick \u2018Pat\u2019 Joseph Buchanan
1,{{Citation | last = Edwards | first = Adam | t...,journal,Any Old Iron?,Adam Edwards
2,{{Citation | last = Stuart | first = Patience ...,journal,National Register of Historic Places Registrat...,Patience Stuart
3,{{Citation | last = Vince | first = Alan | tit...,journal,Obituary: Keith Watson,Alan Vince
4,{{Citation | last = | first = | author-link = ...,journal,The ARIA Nominees,


In [12]:
wild_exp_journal[~wild_exp_journal['first_author'].notnull()].shape

(1487541, 4)

In [13]:
wild_exp_journal['first_author'].fillna(value='No author found', inplace=True)

In [14]:
def check_whitespace_author(first_author):
    if not first_author.strip():
        return 'No author found'
    else:
        return first_author
    
wild_exp_journal['first_author'] = wild_exp_journal['first_author'].progress_apply(lambda x: check_whitespace_author(x))

100%|██████████| 2080785/2080785 [00:03<00:00, 531132.56it/s]


In [15]:
wild_exp_journal.head()

Unnamed: 0,citations,label_category,title,first_author
0,{{Citation | last = Buchanan | first = Patrick...,journal,"As Adelphia Goes, so Goes America?",Patrick \u2018Pat\u2019 Joseph Buchanan
1,{{Citation | last = Edwards | first = Adam | t...,journal,Any Old Iron?,Adam Edwards
2,{{Citation | last = Stuart | first = Patience ...,journal,National Register of Historic Places Registrat...,Patience Stuart
3,{{Citation | last = Vince | first = Alan | tit...,journal,Obituary: Keith Watson,Alan Vince
4,{{Citation | last = | first = | author-link = ...,journal,The ARIA Nominees,No author found


In [16]:
wild_exp_journal.to_csv('wild_exp_info.csv', index=True, index_label='index')

In [21]:
wild_exp_journal.iloc[0]['citations']

'{{Citation | last = Buchanan | first = Patrick \\u2018Pat\\u2019 Joseph | url = | title = As Adelphia Goes, so Goes America? | date = February 14, 2005 | publisher = The American cause }}'