In [1]:
##### Imports ######
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import json
import re

In [9]:
def clean_data(data):
    """Return the text content of an XML fragment with quote noise removed.

    ``data`` is converted with ``str()`` and parsed by
    ``xml.etree.ElementTree``.  The text of the whole element tree is
    concatenated, then every single quote, every backslash-space pair and
    every double quote is deleted (in that order).

    Args:
        data: Any object whose ``str()`` form is an XML fragment.

    Returns:
        The cleaned text, or ``''`` when ``str(data)`` cannot be parsed
        (for example ``None``, whose string form is not XML).
    """
    try:
        root = ET.fromstring(str(data))
    except (ET.ParseError, ValueError):
        # Best effort: unparseable input yields an empty field.  ValueError
        # is included because UnicodeError derives from it.  Anything else
        # (notably KeyboardInterrupt) is allowed to propagate.
        return ''

    text = ET.tostring(root, method='text').decode("utf-8")
    # '\\ ' is the two-character sequence backslash + space.
    for junk in ("'", '\\ ', '"'):
        text = text.replace(junk, '')
    return text

def clean_places(places):
    """Return the text content of each place element as a list of strings.

    Every item of ``places`` is converted with ``str()``, parsed as XML and
    reduced to its text.  The result keeps the input order; an empty input
    gives an empty list.
    """
    return [
        ET.tostring(ET.fromstring(str(place)), method='text').decode("utf-8")
        for place in places
    ]

def clean_text(text):
    """Return the bare text of a parsed element, without its child tags.

    Every tag found beneath ``text`` is replaced by an empty string, so only
    the text that is not wrapped in a child tag survives.  Newlines are then
    turned into spaces, and double quotes, single quotes, backslash-space
    pairs and DEL (0x7f) characters are removed.

    Note:
        This mutates ``text``: the tags are removed from the tree it
        belongs to, so read anything you need from those tags first.

    Args:
        text: A BeautifulSoup tag (anything offering ``find_all()`` and a
            ``.text`` attribute).

    Returns:
        The cleaned text as a single-line string.
    """
    for tag in text.find_all():
        tag.replace_with('')

    cleaned = text.text.replace('\n', ' ')
    # '\\ ' is the two-character sequence backslash + space.
    for junk in ('"', "'", '\\ '):
        cleaned = cleaned.replace(junk, '')

    return cleaned.replace('\x7f', '')

def switch_mon(month):
    """Map an upper-case three-letter month abbreviation to its number.

    Returns the month number as a string ('1' to '12', not zero-padded).
    A key that is not one of the twelve abbreviations yields the string
    'None'.
    """
    abbreviations = (
        'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
        'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC',
    )
    numbers = {name: number for number, name in enumerate(abbreviations, start=1)}
    return str(numbers.get(month))
    
def clean_year(data):
    """Build a 'month/day/year' string from a date value.

    Despite the name, the whole date is returned, not just the year.  The
    three parts are picked out of ``str(data)`` independently:

    * day   - the first run of digits, kept exactly as written
    * month - the first run of upper-case ASCII letters, translated by
              ``switch_mon`` (so the month is not zero-padded, and an
              unrecognised run becomes the text 'None')
    * year  - the first run of four digits

    For example, '26-FEB-1987 15:01:01.79' gives '2/26/1987'.

    Raises:
        AttributeError: if ``str(data)`` has no digits, no upper-case
            letters, or no four-digit run (the failed search returns None).
    """
    raw = str(data)
    day = re.search(r'[0-9]+', raw).group(0)
    month = switch_mon(re.search(r'[A-Z]+', raw).group(0))
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    year = re.search(r'\d{4}', raw).group(0)
    return month + '/' + day + '/' + year

def load_data():
    """Convert the Reuters SGML files into one newline-delimited JSON file.

    Reads ``reuters_data/reut2-000.sgm`` through ``reut2-021.sgm`` and, for
    every ``<reuters>`` element, writes two lines to ``reuters.json``: an
    action line ``{"index": {"_id": "<n>"}}`` followed by a line holding the
    document's fields.  Ids are consecutive integers (stored as strings)
    that start at 0 and keep counting across files.

    NOTE(review): the action/document line pairs look like the Elasticsearch
    bulk format -- confirm against whatever consumes the file.
    NOTE(review): input files are opened with the platform default encoding,
    exactly as before; confirm that suits the data on the target machine.

    Returns:
        The string 'done' once every file has been processed.
    """
    data_id = 0

    with open('reuters.json', 'w') as f:
        for i in range(22):
            # File numbers are zero-padded to three digits (000 .. 021).
            path = 'reuters_data/reut2-{:03d}.sgm'.format(i)
            # BeautifulSoup reads the whole file while constructing the
            # soup, so the handle can be closed straight afterwards.
            with open(path) as sgm_file:
                soup = BeautifulSoup(sgm_file, 'lxml')

            for doc in soup.find_all('reuters'):
                action = {'index': {'_id': str(data_id)}}
                # Keep 'text' last: clean_text() strips tags out of the
                # tree in place, so the other fields must be read first.
                fields = {'date': clean_year(doc.date),
                          'topic': clean_data(doc.topics),
                          'place': clean_places(doc.places.find_all('d')),
                          'people': clean_data(doc.people),
                          'orgs': clean_data(doc.orgs),
                          'exchanges': clean_data(doc.exchanges),
                          'companies': clean_data(doc.companies),
                          'title': clean_data(doc.title),
                          'text': clean_text(doc.find_all('text')[0])
                          }

                json.dump(action, f)
                f.write('\n')
                json.dump(fields, f)
                f.write('\n')

                # '\r' keeps the progress counter on a single console line.
                print('Indexing document', data_id + 1, 'of 21578', end='\r')
                data_id += 1

    return 'done'

In [10]:
# Run the conversion: writes reuters.json and evaluates to the string 'done'.
load_data()

Indexing document 21578 of 21578 21578 742 of 21578 912 of 21578 1113 of 21578 of 21578 1764 of 21578 1979 of 21578 2177 of 21578 2268 of 215782432 of 21578 2595 of 21578 2765 of 21578 3148 of 21578 3658 of 21578 of 21578 4327 of 21578 4510 of 21578 4759 of 21578 4865 of 21578 5083 of 21578 5168 of 21578 5184 of 21578 5271 of 21578 5434 of 21578 5694 of 21578 5773 of 21578 5851 of 21578 5912 of 21578 5961 of 21578 6087 of 21578 6178 of 21578 6284 of 21578 6407 of 21578 6533 of 215786788 of 215786925 of 21578 7113 of 21578 7203 of 21578 7311 of 21578 7461 of 21578 7753 of 21578 7855 of 21578 8176 of 21578 8271 of 21578 8377 of 21578 8547 of 21578 8630 of 21578 8721 of 21578 8806 of 21578 8911 of 21578 9135 of 21578 9248 of 21578 9351 of 21578 9471 of 21578 9723 of 21578 9838 of 21578 10438 of 21578 10546 of 21578 10630 of 21578 10709 of 21578 10804 of 2157810896 of 21578 11183 of 21578 11378 of 21578 11739 of 21578 12770 of 21578 12917 of 21578 13182 of 21578 13583 of 21578of 21578 1391

'done'