In [1]:
##### Imports ######
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import json
import re

In [2]:
# Removes XML tags from the data
def clean_data(data):
    """Return the text content of an XML fragment with quote characters removed.

    ``data`` is converted with ``str()`` and parsed as XML. The text of the
    parsed element tree is returned after stripping single quotes, double
    quotes and backslash-space pairs.

    Returns:
        The cleaned text, or ``''`` when the fragment cannot be parsed
        (for example ``str(None)`` is not well-formed XML).
    """
    try:
        root = ET.fromstring(str(data))
        text = ET.tostring(root, method='text').decode('utf-8')
    except (ET.ParseError, ValueError):
        # Best effort: an unparsable or undecodable fragment yields an empty
        # field. Only parsing/encoding failures are caught, so that
        # KeyboardInterrupt and genuine programming errors still surface.
        return ''

    # Same removals, in the same order, as before. '\\ ' is a literal
    # backslash followed by a space.
    for unwanted in ("'", '\\ ', '"'):
        text = text.replace(unwanted, '')
    return text

# Finds all different places and removes the XML tags from them
def clean_places(places):
    """Return the text content of each XML fragment in ``places`` as a list.

    Every item is converted with ``str()``, parsed as XML and reduced to its
    text. Parse errors are not caught and propagate to the caller.
    """
    def _text_of(fragment):
        # Parse one fragment and serialise only its text nodes.
        root = ET.fromstring(str(fragment))
        return ET.tostring(root, method='text').decode("utf-8")

    return [_text_of(place) for place in places]

# Removes title, place and the XML tags from the text
def clean_text(text):
    """Return the direct text of a parsed element with nested tags removed.

    Every tag found inside ``text`` is replaced with an empty string, so only
    the element's own text remains. Note that this modifies ``text`` in place.
    The remaining text then has newlines turned into spaces, and double
    quotes, single quotes, backslash-space pairs and DEL (0x7f) characters
    removed.

    Args:
        text: a BeautifulSoup element (anything offering ``find_all()`` and
            ``.text``).

    Returns:
        The cleaned text as a string.
    """
    # Blank out every nested tag so its content does not end up in the text.
    for tag in text.find_all():
        tag.replace_with('')

    cleaned = text.text
    # Same substitutions, in the same order, as before. '\\ ' is a literal
    # backslash followed by a space.
    for unwanted, replacement in (('\n', ' '), ('"', ''), ("'", ''), ('\\ ', '')):
        cleaned = cleaned.replace(unwanted, replacement)

    # Strip DEL control characters.
    return cleaned.replace('\x7f', '')

def clean_year(data):
    """Extract a ``digits-UPPERCASE-digits`` date token and space-separate it.

    ``data`` is converted with ``str()`` and searched for the first token such
    as ``26-FEB-1987``; the hyphens are replaced by spaces, giving
    ``'26 FEB 1987'``.

    Returns:
        The reformatted date, or ``''`` when no such token is present
        (for example when ``data`` is ``None``).
    """
    match = re.search(r'[0-9]+-[A-Z]+-[0-9]+', str(data))
    if match is None:
        # No date found: return an empty field rather than failing with an
        # AttributeError on ``None.group``.
        return ''
    return match.group(0).replace('-', ' ')

# Writes every document of the Reuters collection to a JSON-lines file,
# assigning a sequential doc ID to each document.
def load_data(data_dir='reuters_data', output_path='reuters.json'):
    """Convert the Reuters ``.sgm`` files into a newline-delimited JSON file.

    Reads ``reut2-000.sgm`` .. ``reut2-021.sgm`` from ``data_dir``, parses each
    with BeautifulSoup and, for every ``<reuters>`` element, writes two lines
    to ``output_path``: an ``{"index": {"_id": ...}}`` action line followed by
    the document's fields.
    NOTE(review): this pairing looks like the Elasticsearch bulk-API layout;
    confirm against the consumer of the file.

    Args:
        data_dir: directory holding the ``.sgm`` input files.
        output_path: file to (over)write with the JSON lines.

    Returns:
        The string ``'done'``.
    """
    n_files = 22  # reut2-000.sgm through reut2-021.sgm
    data_id = 0

    with open(output_path, 'w') as out:
        for file_no in range(n_files):
            sgm_path = '{}/reut2-{:03d}.sgm'.format(data_dir, file_no)
            # BeautifulSoup reads the whole file while constructing the soup,
            # so the handle can be closed as soon as parsing is done.
            with open(sgm_path) as sgm:
                soup = BeautifulSoup(sgm, 'lxml')

            for doc in soup.find_all('reuters'):
                action = {'index': {'_id': str(data_id)}}
                # 'text' is extracted last: clean_text() strips the nested
                # tags out of the parsed element it is given.
                fields = {'date': clean_year(doc.date),
                          'topic': clean_data(doc.topics),
                          'place': clean_places(doc.places.find_all('d')),
                          'people': clean_data(doc.people),
                          'orgs': clean_data(doc.orgs),
                          'exchanges': clean_data(doc.exchanges),
                          'companies': clean_data(doc.companies),
                          'title': clean_data(doc.title),
                          'text': clean_text(doc.find_all('text')[0])
                          }

                json.dump(action, out)
                out.write('\n')
                json.dump(fields, out)
                out.write('\n')

                # '\r' keeps the progress counter on a single console line.
                print('Indexing document', data_id + 1, 'of 21578', end='\r')
                data_id += 1

    return 'done'

In [3]:
# Run the conversion: reads the .sgm files under reuters_data/ and writes reuters.json.
load_data()

Indexing document 21578 of 21578 21578 488 of 21578 825 of 21578of 21578 of 21578 of 21578 of 21578 3309 of 21578 3473 of 215783630 of 21578 3800 of 21578 of 21578 4654 of 21578of 21578 4987 of 21578 5383 of 21578 of 21578 of 21578 of 21578 of 21578 8405 of 21578 8587 of 21578 8746 of 21578 8914 of 21578 9386 of 21578 9565 of 21578 9899 of 21578 10341 of 21578of 21578of 21578 10978 of 21578 11916 of 2157812752 of 21578 12871 of 21578 13391 of 21578 of 21578 14170 of 21578 14355 of 21578 14555 of 21578 14884 of 2157815163 of 2157815302 of 21578 15391 of 21578of 21578 15701 of 21578 15871 of 21578 16192 of 21578 of 21578of 21578 16939 of 21578 of 21578of 21578 18375 of 2157818519 of 21578 18680 of 21578 18855 of 21578 of 21578 19505 of 21578 of 21578 20541 of 21578 20693 of 21578of 21578 21330 of 21578 21506 of 21578

'done'