In [5]:
import pandas as pd
import json

In [6]:
def load_data(path):
    """Load a JSON-lines dump of pages into a DataFrame.

    Every non-blank line of the file is parsed as one JSON object. The fields
    read from each object are ``key`` and, under ``value``: ``id.int``,
    ``pageLabel.string`` (``pageLabel`` may be null), ``categories``,
    ``links`` and ``externalLinks``.

    Parameters
    ----------
    path : str or path-like
        Location of the file to read (decoded as UTF-8).

    Returns
    -------
    pandas.DataFrame
        One row per record with the columns ``key``, ``id``, ``label``,
        ``categories``, ``links`` and ``external_links``. A null
        ``pageLabel`` is stored as an empty-string label.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    KeyError
        If a record lacks one of the fields listed above.
    """
    data_dict = {
        'key': [],
        'id': [],
        'label': [],
        'categories': [],
        'links': [],
        'external_links': []
    }

    # Read the file directly instead of via pd.read_csv(sep="\n"): recent
    # pandas versions refuse a line terminator as separator, and JSON text
    # should not pass through a CSV tokenizer in the first place.
    with open(path, encoding="utf-8") as lines:
        for line in lines:
            # Blank lines carry no record (read_csv used to drop them too).
            if not line.strip():
                continue

            row_json = json.loads(line)
            value = row_json['value']
            page_label = value['pageLabel']

            data_dict['key'].append(row_json['key'])
            data_dict['id'].append(value['id']['int'])
            data_dict['label'].append(page_label['string'] if page_label is not None else "")
            data_dict['categories'].append(value['categories'])
            data_dict['links'].append(value['links'])
            data_dict['external_links'].append(value['externalLinks'])

    return pd.DataFrame(data_dict)

In [7]:
# Load the four partial dumps, in part order, into one frame each.
data0, data1, data2, data3 = (
    load_data("output_json/part0.json"),
    load_data("output_json/part1.json"),
    load_data("output_json/part2.json"),
    load_data("output_json/part3.json"),
)

In [8]:
# Stack the four partial frames into one and renumber the rows from 0.
full_data = pd.concat(
    objs=[data0, data1, data2, data3],
    sort=False,
    ignore_index=True,
)

In [12]:
# Sanity check: concatenation must neither drop nor duplicate rows.
sum(map(len, (data0, data1, data2, data3))) == len(full_data)

True

In [21]:
# Peek at the first two rows of the combined frame.
full_data.head(n=2)

Unnamed: 0,key,id,label,categories,links,external_links
0,<http://sk.dbpedia.org/resource/%22Heroes%22>,474367,"\""Heroes\""",[],[http://sk.dbpedia.org/resource/Heroes],[]
1,<http://sk.dbpedia.org/resource/%22ale%22>,449263,"\""ale\""",[],[http://sk.dbpedia.org/resource/Ale_(album)],[]


In [40]:
def correct_quotation_marks(x):
    """Unescape backslash-escaped double quotes in a row's ``label``.

    Parameters
    ----------
    x : pandas.Series
        A row holding a string under ``label``.

    Returns
    -------
    pandas.Series
        A copy of ``x`` in which every backslash-plus-double-quote pair in
        ``label`` is replaced by a bare double quote. ``x`` itself is left
        unmodified.
    """
    # Work on a copy: pandas documents that functions passed to
    # DataFrame.apply must not mutate the object they receive.
    fixed = x.copy()
    fixed['label'] = x['label'].replace('\\"', '"')
    return fixed

In [108]:
# Unescape backslash-escaped double quotes in every label in one vectorized
# pass (literal match, not a regex); all other columns are left untouched.
# This replaces a row-by-row DataFrame.apply over the whole frame.
full_data = full_data.assign(
    label=full_data['label'].str.replace('\\"', '"', regex=False)
)

In [48]:
# Peek at the first two rows again to inspect the labels.
full_data.head(n=2)

Unnamed: 0,key,id,label,categories,links,external_links
0,<http://sk.dbpedia.org/resource/%22Heroes%22>,474367,"\""Heroes\""",[],[http://sk.dbpedia.org/resource/Heroes],[]
1,<http://sk.dbpedia.org/resource/%22ale%22>,449263,"\""ale\""",[],[http://sk.dbpedia.org/resource/Ale_(album)],[]


In [80]:
# True if at least one row has a missing id.
full_data['id'].isna().any()

False

In [109]:
# Preview id/label as a numpy record array (the frame index is included).
full_data.loc[:, ['id', 'label']].to_records(index=True)

rec.array([(     0, 474367, '"Heroes"'), (     1, 449263, '"ale"'),
           (     2, 183674, '%'), ..., (354717,  39368, '♀'),
           (354718, 233793, '♄'), (354719, 138675, '♠')],
          dtype=[('index', '<i8'), ('id', '<i8'), ('label', 'O')])

In [112]:
# Flatten the three columns needed for indexing into a numpy record array,
# leaving the DataFrame index out.
records = (
    full_data
    .loc[:, ['id', 'label', 'categories']]
    .to_records(index=False)
)

In [129]:
# Maps each term to the list of values registered under it, in insertion order.
inverted_indices = {}


def add_inverted_index(key, value):
    """Append ``value`` to the list stored under ``key`` in ``inverted_indices``.

    A new list is started the first time ``key`` is seen. Duplicate values
    are kept.
    """
    postings = inverted_indices.setdefault(key, [])
    postings.append(value)

# Register every page id under the page's label and under each of its
# categories.
for record in records:
    # Fields of a numpy record come back as numpy scalars. Convert the id to
    # a plain int so that the lists written out later render as "[1, 2]"
    # rather than "[np.int64(1), np.int64(2)]" under NumPy >= 2.
    page_id = int(record.id)

    # NOTE(review): rows without a pageLabel carry "" as label, so they all
    # end up under the empty-string term — confirm that this is intended.
    add_inverted_index(record.label, page_id)

    for category in record.categories:
        add_inverted_index(category, page_id)

In [137]:
# Column-oriented view of the index: one entry per term, with the terms and
# their value lists kept in the same (dict insertion) order.
inverted_indices_for_df = {
    'Term': list(inverted_indices.keys()),
    'Inverted index': list(inverted_indices.values()),
}


In [142]:
# Write the term -> value-list table to the file "index" as CSV, without the
# row-number column.
pd.DataFrame(data=inverted_indices_for_df).to_csv(path_or_buf='index', index=False)