In [3]:
# libraries
from pymongo import MongoClient
import pandas as pd
import pickle
import re
from unidecode import unidecode
from string import whitespace, punctuation
from fuzzywuzzy import process

In [4]:
# open the client
client = MongoClient()
db = client.lon_db
db.collection_names()
collection = db.lon_docs

In [5]:
# get the first ten lines of each textbody so I can find the date
doc_list = []

for doc in collection.find({'languageCode':'en'},
                          {'_id':0,
                           'publicationDate':1,'symbol':1,'body_en':1}):
        text = doc['body_en']
        lines_10 = text.split('\n')[:10]
        doc['body_en'] = lines_10
        doc_list.append(doc)

In [6]:
# let's take a look at the data.
doc_list[0]

{u'body_en': [u'',
  u'LEAGUE OF NATIONS.',
  u'',
  u'C.476.1930.X.',
  u'',
  u'Geneva, September 6th, 1930,',
  u'',
  u'CONTRIBUTIONS IN ARREARS.',
  u'',
  u'the Cuban Representative. '],
 u'publicationDate': u'1930-01-01T00:00:01Z',
 u'symbol': u'C.476.1930.X'}

In [7]:
# let's find ecerything in the lines that has a number

def get_number_lines(line_list):
    '''
    
    '''
    has_number_list = []
    
    has_number = re.compile('\d')
    
    for i in range(len(line_list)):
        line = unidecode(line_list[i])
        no_space = line.translate(None, whitespace)
        if re.search(has_number, line) != None:
            has_number_list.append((i, no_space))
        
    return has_number_list

In [50]:
# let's test this function out

test_1 = doc_list[0]['body_en']
test_2 = doc_list[300]['body_en']
test_3 = doc_list[2000]['body_en']

print get_number_lines(test_1)
print '-----------------'
print get_number_lines(test_2)
print '-----------------'
print get_number_lines(test_3)
print '-----------------'

[(4, 'Geneva,September6th,1930,')]
-----------------
[(3, '0.86.1935.VII.'), (6, 'theCouncil.Geneva,February1st,1935.')]
-----------------
[(1, 'LEAGUE0?NATIONS.'), (3, 'GcmnunlcatedtotheC.623.M.295.1933.VII.'), (5, 'MembersoftheLeague.Geneva,October31st,1933.')]
-----------------


In [13]:
def find_geneva(line_list):
    '''
    '''
    has_number = get_number_lines(line_list) #remember returns tuples
        
    start_Ge = re.compile('Ge')
    
    has_geneva = []
    
    for line in has_number:
        if re.search(start_Ge, line[1]) != None:
            has_geneva.append(line)
    
    if len(has_geneva) == 1:
        has_geneva_text = has_geneva[0][1]
        if start_Ge.match(has_geneva_text) == None:
            pass
    
    
    return has_geneva

In [14]:
# let's test this function out

test_1 = doc_list[0]['body_en']
test_2 = doc_list[300]['body_en']
test_3 = doc_list[2000]['body_en']

print find_geneva(test_1)
print '-----------------'
print find_geneva(test_2)
print '-----------------'
print find_geneva(test_3)

[(5, 'Geneva,September6th,1930,')]
-----------------
[(6, 'theCouncil.Geneva,February1st,1935.')]
-----------------
[(5, 'MembersoftheLeague.Geneva,October31st,1933.')]


In [18]:
def add_dateline_to_docs(doc_list):
    '''
    
    '''
    for doc in doc_list:
        dateline = find_geneva(doc['body_en'])
        doc['dateline'] = dateline
    return doc_list

In [66]:
# let's test this function out

test_1 = doc_list[0]['body_en']
test_2 = doc_list[300]['body_en']
test_3 = doc_list[2000]['body_en']

print add_dateline_to_docs(test_1)
print '-----------------'
print add_dateline_to_docs(test_2)
print '-----------------'
print add_dateline_to_docs(test_3)

TypeError: string indices must be integers

In [20]:
doc_list_dateline = add_dateline_to_docs(doc_list)

In [21]:
doc_list_dateline[9]['dateline']

[(7, "Geneva,September2'/th,1934*")]

In [22]:
# let's see how well I did
one_dateline = 0
multi_dateline = 0
no_datelike = 0

for doc in doc_list_dateline:
    if len(doc['dateline']) == 1:
        one_dateline += 1
    elif len(doc['dateline']) > 1:
        multi_dateline += 1
    else:
        no_datelike += 1

print one_dateline
print multi_dateline
print no_datelike

6884
49
4985


Now the thing I want to do is to label each line as dateline or non-dateline or other thing

In [23]:
print test_1

[u'', u'LEAGUE OF NATIONS.', u'', u'C.476.1930.X.', u'', u'Geneva, September 6th, 1930,', u'', u'CONTRIBUTIONS IN ARREARS.', u'', u'the Cuban Representative. ']


In [24]:
[unidecode(i) for i in test_1 if i != '']

['LEAGUE OF NATIONS.',
 'C.476.1930.X.',
 'Geneva, September 6th, 1930,',
 'CONTRIBUTIONS IN ARREARS.',
 'the Cuban Representative. ']

In [25]:
[unidecode(i) for i in test_2 if i != '']

['LEAGUE OF NATIONS',
 '0.86.1935.VII.',
 'Communicated to',
 'the Council. Geneva, February 1st, 1935.',
 'FREE CITY OF DANZIG ']

In [26]:
[unidecode(i) for i in test_3 if i != '']

['LEAGUE 0? NATIONS .',
 'Gcmnunlcated to the C. 623. M. 295.1933. VII.',
 'Council and the',
 'Members of the League. Geneva, October 31st, 1933.',
 'SAAR BASIN.',
 'Petition from the Union of German Newspaper Pub I ishers,']

In [34]:
# find the symbol line
# I think that this might be going too far because I have the symbol anyway
# it might be nice to see it in the wild

def find_symbol_line(line_list, symbol):
    '''
    
    '''
    line_list = [unidecode(i) for i in line_list]
    
    #no_space_line_list = [i.translate(None,whitespace) for i in no_empty_line_list]
    
    symbolline = process.extractOne(unidecode(symbol), line_list)
    
    if symbolline != None:
        line_number = line_list.index(symbolline[0])
        return (line_number, unidecode(symbolline[0]))
        

In [35]:
test_1

[u'',
 u'LEAGUE OF NATIONS.',
 u'',
 u'C.476.1930.X.',
 u'',
 u'Geneva, September 6th, 1930,',
 u'',
 u'CONTRIBUTIONS IN ARREARS.',
 u'',
 u'the Cuban Representative. ']

In [None]:
test_1.index()

In [36]:
# let's test this function out

test_1 = doc_list[0]['body_en']
symbol_1 = unidecode(doc_list[0]['symbol'])
test_2 = doc_list[300]['body_en']
symbol_2 = unidecode(doc_list[300]['symbol'])
test_3 = doc_list[2000]['body_en']
symbol_3 = symbol_2 = unidecode(doc_list[2000]['symbol'])

print find_symbol_line(test_1, symbol_1)
print '-----------------'
print find_symbol_line(test_2, symbol_2)
print '-----------------'
print find_symbol_line(test_3, symbol_3)

(3, 'C.476.1930.X.')
-----------------
(3, '0.86.1935.VII.')
-----------------
(3, 'Gcmnunlcated to the C. 623. M. 295.1933. VII.')




In [39]:
def add_symbolline_to_docs(doc_list):
    '''
    
    '''
    for doc in doc_list:
        line_list = doc['body_en']
        symbol = doc['symbol']
        
        symbolline = find_symbol_line(line_list, symbol)
        
        if symbolline == None:
            pass
            
        elif len(symbolline[1]) < 3:
            pass
        
        else:
            doc['symbolline'] = symbolline
    
    return doc_list

In [40]:
doc_list_symbolline = add_symbolline_to_docs(doc_list_dateline)



In [42]:
print doc_list_symbolline[0]['symbolline']
print doc_list_symbolline[3333]['symbolline']
print doc_list_symbolline[360]['symbolline']
print doc_list_symbolline[450]['symbolline']

(3, 'C.476.1930.X.')
(5, 'Official No. : C. S55. M. 226. 1931. IX. ')


KeyError: 'symbolline'

In [43]:
no_symbolline = 0
has_symbolline = 0

for doc in doc_list_symbolline:
    if 'symbolline' in doc.keys():
        has_symbolline += 1
    else:
        no_symbolline += 1

print has_symbolline
print no_symbolline
print has_symbolline + no_symbolline

11868
50
11918


In [44]:
doc_list_symbolline[0]

{u'body_en': [u'',
  u'LEAGUE OF NATIONS.',
  u'',
  u'C.476.1930.X.',
  u'',
  u'Geneva, September 6th, 1930,',
  u'',
  u'CONTRIBUTIONS IN ARREARS.',
  u'',
  u'the Cuban Representative. '],
 'dateline': [(5, 'Geneva,September6th,1930,')],
 u'publicationDate': u'1930-01-01T00:00:01Z',
 u'symbol': u'C.476.1930.X',
 'symbolline': (3, 'C.476.1930.X.')}

In [51]:
indices = 3,4
[i for j, i in enumerate(test_1) if j not in indices]

[u'',
 u'LEAGUE OF NATIONS.',
 u'',
 u'',
 u'CONTRIBUTIONS IN ARREARS.',
 u'',
 u'the Cuban Representative. ']

In [54]:
test_list = []
for i in test_1:
    test_list.append([i,'other'])

test_list

[[u'', 'other'],
 [u'LEAGUE OF NATIONS.', 'other'],
 [u'', 'other'],
 [u'', 'other'],
 [u'Geneva, September 6th, 1930,', 'other'],
 [u'', 'other'],
 [u'CONTRIBUTIONS IN ARREARS.', 'other'],
 [u'', 'other'],
 [u'the Cuban Representative. ', 'other']]

In [64]:
test_1

[u'',
 u'LEAGUE OF NATIONS.',
 u'',
 u'',
 u'Geneva, September 6th, 1930,',
 u'',
 u'CONTRIBUTIONS IN ARREARS.',
 u'',
 u'the Cuban Representative. ']

In [62]:
def label_lines(line_list, symbol, symbolline, dateline):
    
    #note that the datline is a list of tuples
    
    dateline = dateline[0] # this will create a single tuple
    
    symbol = unidecode(symbol)
    
    label_list = []
    
    rm_lines = symbolline[0], dateline[0]
    
    other_lines = [i for j, i in enumerate(line_list) if j not in rm_lines]
    
    no_empties = [i for i in other_lines if i != '']
    
    label_list.append([symbol, symbolline[1], 'symbolline'])
    
    label_list.append([symbol, dateline[1], 'dateline'])
    
    for line in no_empties:
        label_list.append([symbol, unidecode(line),'other'])
    
    return label_list

In [69]:
test_1 = doc_list[0]['body_en']
test_1

[u'',
 u'LEAGUE OF NATIONS.',
 u'',
 u'',
 u'Geneva, September 6th, 1930,',
 u'',
 u'CONTRIBUTIONS IN ARREARS.',
 u'',
 u'the Cuban Representative. ']

In [70]:
label_lines(test_1, 'C.476.1930.X.',(3, 'C.476.1930.X.'),[(4, 'Geneva,September6th,1930,')])



[['C.476.1930.X.', 'C.476.1930.X.', 'symbolline'],
 ['C.476.1930.X.', 'Geneva,September6th,1930,', 'dateline'],
 ['C.476.1930.X.', 'LEAGUE OF NATIONS.', 'other'],
 ['C.476.1930.X.', 'CONTRIBUTIONS IN ARREARS.', 'other'],
 ['C.476.1930.X.', 'the Cuban Representative. ', 'other']]

In [None]:
test_2 = doc_list[300]['body_en']

In [None]:
label_lines(test_2)

In [None]:
tagged_datline = pd.DataFrame(columns=['symbol','line','category'])