In [1]:
##### Imports ######
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import json
import re

In [67]:
# Removes XML tags from the data
def clean_data(data):
    """Return the text content of an XML fragment with quotes stripped.

    ``data`` is converted with ``str()`` and parsed as XML; the concatenated
    text of the element tree is returned after removing single quotes,
    backslash-space pairs and double quotes.

    Returns an empty string when the fragment cannot be processed (for
    example when ``data`` is ``None``, whose string form is not valid XML).
    """
    try:
        root = ET.fromstring(str(data))
        text = ET.tostring(root, method='text').decode("utf-8")
    except Exception:
        # Best-effort: a malformed or missing fragment yields an empty field.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate so a long run can still be interrupted.
        return ''
    # '\\ ' is a literal backslash followed by a space; written with an
    # explicit escape because '\ ' is an invalid escape sequence.
    return text.replace('\'', '').replace('\\ ', '').replace('\"', '')

# Finds all different places and removes the XML tags from them
def clean_places(places):
    """Return the text content of each XML fragment in ``places`` as a list.

    Every item is converted with ``str()``, parsed as XML and reduced to its
    text; the results are returned in the same order as the input.
    """
    return [
        ET.tostring(ET.fromstring(str(place)), method='text').decode("utf-8")
        for place in places
    ]

# Removes title, place and the XML tags from the text
def clean_text(text):
    """Return the plain text of ``text`` with nested tags and quotes removed.

    ``text`` is expected to be a BeautifulSoup tag. Every descendant tag is
    replaced by an empty string (mutating ``text`` in place), so only the
    text that sits directly inside the element remains. Newlines become
    spaces; double quotes, single quotes, backslash-space pairs and DEL
    (0x7f) characters are dropped.
    """
    # find_all()/replace_with() are the current names of the legacy
    # findAll()/replaceWith() aliases.
    for tag in text.find_all():
        tag.replace_with('')

    # '\\ ' is a literal backslash followed by a space; written with an
    # explicit escape because '\ ' is an invalid escape sequence.
    cleaned = (
        text.text
        .replace('\n', ' ')
        .replace('\"', '')
        .replace('\'', '')
        .replace('\\ ', '')
    )

    return re.sub('[\x7f]', '', cleaned)

def clean_year(data):
    """Extract the first ``<digits>-<UPPERCASE letters>`` run from ``data``.

    ``data`` is converted with ``str()`` before searching. Note that, despite
    the function name, the pattern captures a ``digits-LETTERS`` prefix such
    as ``'26-FEB'`` out of ``'26-FEB-1987'``, not a four-digit year.

    Returns an empty string when nothing matches (for example when ``data``
    is ``None``) instead of failing on the missing match object.
    """
    match = re.search(r'[0-9]+-[A-Z]+', str(data))
    if match is None:
        return ''
    return match.group(0)

# Writes every document in the Reuters SGML files to reuters.json, assigning a sequential doc ID to each.
def load_data(data_dir='reuters_data', output_path='reuters.json',
              n_files=22, total_docs=21578):
    """Convert the Reuters SGML files into a newline-delimited JSON file.

    For every ``<reuters>`` element found in ``reut2-000.sgm`` ...
    ``reut2-<n_files - 1>.sgm`` two JSON lines are written to
    ``output_path``: an ``{"index": {"_id": ...}}`` action line carrying a
    sequential document id (starting at 0), followed by the cleaned document
    fields. NOTE(review): this action/document pairing looks like the
    Elasticsearch bulk format -- confirm against the consumer of the file.

    Parameters
    ----------
    data_dir : str
        Directory containing the ``reut2-NNN.sgm`` files.
    output_path : str
        File to (over)write with the JSON lines.
    n_files : int
        Number of consecutive SGML files to read, starting at index 0.
    total_docs : int
        Document count shown in the progress message only.

    Returns
    -------
    str
        The string ``'done'`` once all files have been processed.
    """
    data_id = 0

    with open(output_path, 'w') as f:
        for i in range(n_files):
            # Zero-padded three-digit file index: reut2-000.sgm, reut2-001.sgm, ...
            sgm_path = '{}/reut2-{:03d}.sgm'.format(data_dir, i)

            # BeautifulSoup reads the whole file while constructing the tree,
            # so the handle can be closed as soon as parsing is done.
            with open(sgm_path) as sgm_file:
                soup = BeautifulSoup(sgm_file, 'lxml')
            items = soup.find_all('reuters')

            for doc in items:
                ID =     {'index':{'_id':str(data_id)}}
                data =   {'date':clean_year(doc.date),
                          'topic':clean_data(doc.topics),
                          'place':clean_places(doc.places.find_all('d')),
                          'people':clean_data(doc.people),
                          'orgs':clean_data(doc.orgs),
                          'exchanges':clean_data(doc.exchanges),
                          'companies':clean_data(doc.companies),
                          'title':clean_data(doc.title),
                          # clean_text() strips nested tags in place, so it
                          # must run after the other fields were extracted.
                          'text':clean_text(doc.find_all('text')[0])
                          }

                # One action line followed by one document line per article.
                json.dump(ID, f)
                f.write('\n')
                json.dump(data, f)
                f.write('\n')

                # '\r' rewinds the cursor so the counter overwrites itself.
                print('Indexing document', data_id + 1, 'of', total_docs, end='\r')
                data_id += 1

    return 'done'

In [68]:
# Run the conversion: reads the SGML files under reuters_data/ and writes
# reuters.json; the returned 'done' string is shown as the cell output.
load_data()

Indexing document 21578 of 21578 21578 406 of 21578of 21578662 of 21578 808 of 21578 of 215781295 of 21578 1426 of 215781687 of 215781819 of 21578 1958 of 21578 2155 of 21578 2294 of 21578 2831 of 21578 2981 of 21578 3142 of 215783290 of 21578 3575 of 21578 3701 of 215783829 of 21578 3968 of 21578 4305 of 21578 4841 of 21578 4979 of 21578 5141 of 21578 5276 of 21578 5404 of 21578of 21578 5794 of 21578 6115 of 21578 6193 of 21578 6313 of 21578 6452 of 21578 6830 of 21578 6973 of 21578 7303 of 21578 7435 of 215787566 of 21578 7822 of 21578 7962 of 21578 8286 of 21578 8419 of 21578 8669 of 21578of 21578 of 21578 9536 of 21578 9815 of 2157810145 of 21578 10639 of 21578 10880 of 21578 11264 of 21578 11546 of 21578 11670 of 21578of 21578 of 2157812269 of 21578of 21578 13000 of 21578 13134 of 21578 13277 of 21578 13562 of 21578 13703 of 2157814164 of 21578 14315 of 21578 14885 of 21578 15372 of 21578of 21578 of 21578 15715 of 21578 15834 of 21578 16095 of 21578 16232 of 21578 16352 of 21578 1

'done'