In [1]:
##### Imports ######
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import json
import re

In [56]:
def clean_data(data):
    """Strip the XML markup from ``data`` and return its plain text.

    ``data`` is converted with ``str()`` and parsed as XML. The text content
    of the parsed tree is concatenated, then every single quote, every
    "backslash followed by a space" pair and every double quote is removed
    (in that order).

    Returns:
        The cleaned text, or ``''`` when ``str(data)`` is not well-formed
        XML (for example when ``data`` is ``None``).
    """
    try:
        root = ET.fromstring(str(data))
    except ET.ParseError:
        # Best effort: a missing or malformed field becomes an empty value.
        return ''

    # encoding='unicode' yields a str directly. The default (us-ascii bytes)
    # would rewrite non-ASCII characters as "&#NNN;" character references.
    plain = ET.tostring(root, encoding='unicode', method='text')
    return plain.replace('\'', '').replace('\\ ', '').replace('"', '')

def clean_places(places):
    """Return the plain text of every element in ``places`` as a list.

    Each item is converted with ``str()``, parsed as XML and reduced to the
    concatenated text of the parsed tree. The result keeps the order of the input.

    Raises:
        xml.etree.ElementTree.ParseError: if an item is not well-formed XML.
    """
    # encoding='unicode' returns str and keeps non-ASCII characters intact;
    # the default us-ascii byte output would emit "&#NNN;" references instead.
    return [
        ET.tostring(ET.fromstring(str(place)), encoding='unicode', method='text')
        for place in places
    ]

def clean_text(text):
    """Return the text of ``text`` with its nested tags removed.

    Every tag returned by ``text.findAll()`` is replaced by an empty string
    in the tree itself, so the object passed in is modified. Callers that
    still need nested elements must read them before calling this function.

    The remaining text then has newlines turned into spaces, and double
    quotes, single quotes, "backslash followed by a space" pairs and DEL
    (0x7f) characters removed.

    Args:
        text: an element exposing ``findAll()`` and ``.text``
            (``load_data`` passes the first ``<text>`` tag of a document).

    Returns:
        The cleaned text as a ``str``.
    """
    for nested in text.findAll():
        nested.replaceWith('')

    body = (
        text.text
        .replace('\n', ' ')
        .replace('"', '')
        .replace('\'', '')
        .replace('\\ ', '')  # escaped backslash: same two characters, no invalid escape
    )

    return re.sub('[\x7f]', '', body)

def clean_year(data):
    """Return the first run of four ASCII digits found in ``str(data)``.

    Returns:
        The four digits as a ``str``, or ``''`` when ``str(data)`` contains
        no such run (for example when ``data`` is ``None``).
    """
    match = re.search('[0-9]{4}', str(data))
    # A missing date yields an empty value instead of an AttributeError on None.
    return match.group(0) if match else ''

def load_data(data_dir='reuters_data', output_path='reuters.json', n_files=22):
    """Convert the Reuters ``.sgm`` files into a newline-delimited JSON file.

    Reads ``<data_dir>/reut2-000.sgm`` through ``reut2-<n_files - 1>.sgm``.
    For every ``<reuters>`` element it writes two lines to ``output_path``:
    an action line ``{"index": {"_id": "<n>"}}`` and a line with the cleaned
    document fields. ``_id`` is a running counter starting at 0.

    NOTE(review): the action/document pairing looks like the Elasticsearch
    bulk format, which expects one JSON object per line. Confirm the target.

    Args:
        data_dir: directory containing the ``reut2-NNN.sgm`` files.
        output_path: file to (over)write with the converted documents.
        n_files: number of consecutive ``.sgm`` files to read, starting at 000.

    Returns:
        The string ``'done'``.
    """
    data_id = 0

    with open(output_path, 'w') as out:
        for i in range(n_files):
            sgm_path = '{}/reut2-{:03d}.sgm'.format(data_dir, i)
            # BeautifulSoup consumes the handle in its constructor, so the
            # input file can be closed as soon as the soup has been built.
            with open(sgm_path) as sgm:
                soup = BeautifulSoup(sgm, 'lxml')

            for doc in soup.findAll('reuters'):
                places = doc.places
                action = {'index': {'_id': str(data_id)}}
                # 'text' must stay last: clean_text() strips nested tags from
                # the document tree in place.
                source = {
                    'date': clean_year(doc.date),
                    'topic': clean_data(doc.topics),
                    'place': clean_places(places.findAll('d')) if places is not None else [],
                    'people': clean_data(doc.people),
                    'orgs': clean_data(doc.orgs),
                    'exchanges': clean_data(doc.exchanges),
                    'companies': clean_data(doc.companies),
                    'title': clean_data(doc.title),
                    'text': clean_text(doc.findAll('text')[0]),
                }

                # One JSON object per line; without the newline the objects
                # run together and the file cannot be parsed back.
                out.write(json.dumps(action) + '\n')
                out.write(json.dumps(source) + '\n')

                print('Indexing document', data_id + 1, 'of 21578', end='\r')
                data_id += 1

    return 'done'

# Run the conversion; load_data() writes reuters.json and returns 'done'.
load_data()

Indexing document 21578 of 2157836 of 21578 812 of 21578 1140 of 21578 1280 of 215781417 of 21578 1663 of 21578of 21578 2278 of 21578 2417 of 21578 2779 of 21578 2922 of 215783407 of 21578 3660 of 21578 3779 of 21578 of 21578 4273 of 21578 4791 of 21578 of 21578 5144 of 21578 of 21578 5424 of 21578 5553 of 21578 of 21578 5802 of 21578 5932 of 21578 6140 of 21578of 21578 6534 of 21578 6660 of 21578 6799 of 21578 7218 of 215787354 of 21578 7479 of 21578of 21578 7879 of 21578 8140 of 21578 8278 of 21578 8694 of 215788817 of 21578 9273 of 21578of 21578 of 21578 9678 of 21578 10146 of 21578 10527 of 21578 10653 of 21578 10781 of 21578 of 21578 11212 of 21578 11343 of 21578 11466 of 21578 11597 of 21578 11982 of 2157812143 of 21578 12288 of 2157812423 of 2157812563 of 2157812689 of 21578 12819 of 21578 13347 of 21578of 2157813795 of 21578 13936 of 21578 14162 of 21578 of 21578 14587 of 21578of 21578 of 21578 15501 of 21578 of 2157815752 of 21578 15889 of 21578 16284 of 21578 16418 of 21578 1

'done'

In [46]:
# Scratch check: write the digits 0-4 to test.txt back to back.
with open('test.txt', 'w') as f:
    f.write(''.join(str(i) for i in range(5)))

In [49]:
# Scratch check: read test.txt back and print what was written.
with open('test.txt', 'r') as f:
    contents = f.read()
print(contents)

01234
