In [51]:
# libraries
from pymongo import MongoClient
import pandas as pd
import pickle
import re
from unidecode import unidecode
from string import whitespace, punctuation
from fuzzywuzzy import process

In [2]:
# open the client
client = MongoClient()
db = client.lon_db
db.collection_names()
collection = db.lon_docs

In [3]:
# get the first ten lines of each textbody so I can find the date
doc_list = []

for doc in collection.find({'languageCode':'en'},
                          {'_id':0,
                           'publicationDate':1,'symbol':1,'body_en':1}):
        text = doc['body_en']
        lines_10 = text.split('\n')[:10]
        doc['body_en'] = lines_10
        doc_list.append(doc)

In [6]:
# let's take a look at the data.
doc_list[0]

{u'body_en': [u'',
  u'LEAGUE OF NATIONS.',
  u'',
  u'C.476.1930.X.',
  u'',
  u'Geneva, September 6th, 1930,',
  u'',
  u'CONTRIBUTIONS IN ARREARS.',
  u'',
  u'the Cuban Representative. '],
 u'publicationDate': u'1930-01-01T00:00:01Z',
 u'symbol': u'C.476.1930.X'}

In [36]:
# let's find ecerything in the lines that has a number

def get_number_lines(line_list):
    '''
    
    '''
    has_number_list = []
    
    has_number = re.compile('\d')
    
    for i in range(len(line_list)):
        line = unidecode(line_list[i])
        no_space = line.translate(None, whitespace)
        if re.search(has_number, line) != None:
            has_number_list.append((i, no_space))
        
    return has_number_list

In [24]:
# let's test this function out

test_1 = doc_list[0]['body_en']
test_2 = doc_list[300]['body_en']
test_3 = doc_list[2000]['body_en']

print get_number_lines(test_1)
print '-----------------'
print get_number_lines(test_2)
print '-----------------'
print get_number_lines(test_3)
print '-----------------'


LEAGUEOFNATIONS.

C.476.1930.X.

Geneva,September6th,1930,

CONTRIBUTIONSINARREARS.

theCubanRepresentative.
[(3, 'C.476.1930.X.'), (5, 'Geneva,September6th,1930,')]
-----------------

LEAGUEOFNATIONS

0.86.1935.VII.
Communicatedto

theCouncil.Geneva,February1st,1935.

FREECITYOFDANZIG

[(3, '0.86.1935.VII.'), (6, 'theCouncil.Geneva,February1st,1935.')]
-----------------

LEAGUE0?NATIONS.

GcmnunlcatedtotheC.623.M.295.1933.VII.
Councilandthe
MembersoftheLeague.Geneva,October31st,1933.

SAARBASIN.

PetitionfromtheUnionofGermanNewspaperPubIishers,
[(3, 'GcmnunlcatedtotheC.623.M.295.1933.VII.'), (5, 'MembersoftheLeague.Geneva,October31st,1933.')]
-----------------


In [29]:
def find_geneva(line_list):
    '''
    '''
    has_number = get_number_lines(line_list) #remember returns tuples
        
    start_Ge = re.compile('Ge')
    
    has_geneva = []
    
    for line in has_number:
        if re.search(start_Ge, line[1]) != None:
            has_geneva.append(line)
    
    if len(has_geneva) == 1:
        has_geneva_text = has_geneva[0][1]
        if start_Ge.match(has_geneva_text) == None:
            
    
    return has_geneva

In [30]:
# let's test this function out

test_1 = doc_list[0]['body_en']
test_2 = doc_list[300]['body_en']
test_3 = doc_list[2000]['body_en']

print find_geneva(test_1)
print '-----------------'
print find_geneva(test_2)
print '-----------------'
print find_geneva(test_3)


LEAGUEOFNATIONS.

C.476.1930.X.

Geneva,September6th,1930,

CONTRIBUTIONSINARREARS.

theCubanRepresentative.
[(5, 'Geneva,September6th,1930,')]
-----------------

LEAGUEOFNATIONS

0.86.1935.VII.
Communicatedto

theCouncil.Geneva,February1st,1935.

FREECITYOFDANZIG

[(6, 'theCouncil.Geneva,February1st,1935.')]
-----------------

LEAGUE0?NATIONS.

GcmnunlcatedtotheC.623.M.295.1933.VII.
Councilandthe
MembersoftheLeague.Geneva,October31st,1933.

SAARBASIN.

PetitionfromtheUnionofGermanNewspaperPubIishers,
[(5, 'MembersoftheLeague.Geneva,October31st,1933.')]


In [43]:
def add_dateline_to_docs(doc_list):
    '''
    
    '''
    for doc in doc_list:
        dateline = find_geneva(doc['body_en'])
        doc['dateline'] = dateline
    return doc_list

In [35]:
# let's test this function out

test_1 = doc_list[0]['body_en']
test_2 = doc_list[300]['body_en']
test_3 = doc_list[2000]['body_en']

print find_dateline(test_1)
print '-----------------'
print find_dateline(test_2)
print '-----------------'
print find_dateline(test_3)


LEAGUEOFNATIONS.

C.476.1930.X.

Geneva,September6th,1930,

CONTRIBUTIONSINARREARS.

theCubanRepresentative.




TypeError: ord() expected string of length 1, but int found

In [44]:
doc_list_dateline = add_dateline_to_docs(doc_list)

In [50]:
doc_list_dateline[9]['dateline']

[(7, "Geneva,September2'/th,1934*")]

In [48]:
# let's see how well I did
one_dateline = 0
multi_dateline = 0
no_datelike = 0

for doc in doc_list_dateline:
    if len(doc['dateline']) == 1:
        one_dateline += 1
    elif len(doc['dateline']) > 1:
        multi_dateline += 1
    else:
        no_datelike += 1

print one_dateline
print multi_dateline
print no_datelike

6884
49
4985


Now the thing I want to do is to label each line as dateline or non-dateline or other thing

In [52]:
print test_1

[u'', u'LEAGUE OF NATIONS.', u'', u'C.476.1930.X.', u'', u'Geneva, September 6th, 1930,', u'', u'CONTRIBUTIONS IN ARREARS.', u'', u'the Cuban Representative. ']


In [53]:
[unidecode(i) for i in test_1 if i != '']

['LEAGUE OF NATIONS.',
 'C.476.1930.X.',
 'Geneva, September 6th, 1930,',
 'CONTRIBUTIONS IN ARREARS.',
 'the Cuban Representative. ']

In [54]:
[unidecode(i) for i in test_2 if i != '']

['LEAGUE OF NATIONS',
 '0.86.1935.VII.',
 'Communicated to',
 'the Council. Geneva, February 1st, 1935.',
 'FREE CITY OF DANZIG ']

In [55]:
[unidecode(i) for i in test_3 if i != '']

['LEAGUE 0? NATIONS .',
 'Gcmnunlcated to the C. 623. M. 295.1933. VII.',
 'Council and the',
 'Members of the League. Geneva, October 31st, 1933.',
 'SAAR BASIN.',
 'Petition from the Union of German Newspaper Pub I ishers,']

In [102]:
# find the symbol line
# I think that this might be going too far because I have the symbol anyway
# it might be nice to see it in the wild

def find_symbol_line(line_list, symbol):
    '''
    
    '''
    #no_empty_line_list = [unidecode(i) for i in line_list if i != '']
    
    #no_space_line_list = [i.translate(None,whitespace) for i in no_empty_line_list]
    
    symbol_line = process.extractOne(unidecode(symbol), no_empty_line_list)
    
    line_line = process.extractOne(unidecode(symbol), )
    
    line_number = line_list.ind
    return symbol_line
        

In [99]:
# let's test this function out

test_1 = doc_list[0]['body_en']
symbol_1 = unidecode(doc_list[0]['symbol'])
test_2 = doc_list[300]['body_en']
symbol_2 = unidecode(doc_list[300]['symbol'])
test_3 = doc_list[2000]['body_en']
symbol_3 = symbol_2 = unidecode(doc_list[2000]['symbol'])

print find_symbol_line(test_1, symbol_1)
print '-----------------'
print find_symbol_line(test_2, symbol_2)
print '-----------------'
print find_symbol_line(test_3, symbol_3)

('C.476.1930.X.', 100)
-----------------
('0.86.1935.VII.', 86)
-----------------
('Gcmnunlcated to the C. 623. M. 295.1933. VII.', 86)




In [100]:
def add_symbolline_to_docs(doc_list):
    '''
    
    '''
    for doc in doc_list:
        line_list = doc['body_en']
        symbol = doc['symbol']
        
        symbolline = find_symbol_line(line_list, symbol)
        
        if symbolline == None:
            pass
            
        line_line = process.extractOne(symbolline[0], line_list) 

        line_number = line_list.index(line_line[0]) #remember this is tuple
        
        
        if len(symbolline[0]) < 3:
            pass
        else:
            doc['symbolline'] = (line_number, symbolline[0])
    
    return doc_list

In [101]:
doc_list_symbolline = add_symbolline_to_docs(doc_list_dateline)

TypeError: 'NoneType' object has no attribute '__getitem__'

In [None]:
print doc_list_symbolline[0]['symbolline']
print doc_list_symbolline[3333]['symbolline']
print doc_list_symbolline[360]['symbolline']
print doc_list_symbolline[450]['symbolline']

In [70]:
no_symbolline = 0
has_symbolline = 0

for doc in doc_list_symbolline:
    if 'symbolline' in doc.keys():
        has_symbolline += 1
    else:
        no_symbolline += 1

print has_symbolline
print no_symbolline
print has_symbolline + no_symbolline

11910
8
11918


In [71]:
doc_list_symbolline[0]

{u'body_en': [u'',
  u'LEAGUE OF NATIONS.',
  u'',
  u'C.476.1930.X.',
  u'',
  u'Geneva, September 6th, 1930,',
  u'',
  u'CONTRIBUTIONS IN ARREARS.',
  u'',
  u'the Cuban Representative. '],
 'dateline': [(5, 'Geneva,September6th,1930,')],
 u'publicationDate': u'1930-01-01T00:00:01Z',
 u'symbol': u'C.476.1930.X',
 'symbolline': 'C.476.1930.X.'}

In [None]:
tagged_datline = pd.DataFrame(columns=['symbol','line','category'])