In [109]:
import pandas as pd
import json

In [110]:
# Peek at the raw export: one JSON document per line, kept as plain strings
# in a single column named 0.
# The lines are read directly because newer pandas releases refuse
# sep="\n" in read_csv (the line terminator cannot be used as a separator).
with open("output_json/part0.json", encoding="utf-8") as raw_file:
    data0 = pd.DataFrame([line.rstrip("\r\n") for line in raw_file if line.strip()])

In [111]:
# Show the first two raw rows to see what a record looks like.
data0.iloc[:2]

Unnamed: 0,0
0,"{""key"":""<http://sk.dbpedia.org/resource/%22Her..."
1,"{""key"":""<http://sk.dbpedia.org/resource/%22ale..."


In [112]:
def load_data(path):
    """Load one JSON-lines export into a DataFrame with one row per record.

    Every non-blank line of ``path`` is parsed as a JSON object and the
    fields ``key``, ``value.id.int``, ``value.pageLabel.string``,
    ``value.categories``, ``value.links`` and ``value.externalLinks`` are
    extracted from it.

    Parameters
    ----------
    path : str or path-like
        File containing one JSON document per line (UTF-8).

    Returns
    -------
    pandas.DataFrame
        Columns ``key``, ``id``, ``label``, ``categories``, ``links`` and
        ``external_links``. ``label`` is the empty string when the record's
        ``pageLabel`` is null. A file with no records yields an empty frame
        with these columns.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    KeyError
        If a record lacks one of the expected fields.
    """
    columns = ['key', 'id', 'label', 'categories', 'links', 'external_links']
    records = []

    # Parse the file line by line rather than through pd.read_csv(sep="\n"):
    # newer pandas releases reject the line terminator as a separator, and a
    # CSV parser is not meant to carry JSON text (quotes, commas) unchanged.
    with open(path, encoding="utf-8") as source:
        for line in source:
            if not line.strip():
                # read_csv skipped blank lines by default; keep doing so.
                continue

            row_json = json.loads(line)
            value = row_json['value']
            page_label = value['pageLabel']

            records.append({
                'key': row_json['key'],
                'id': value['id']['int'],
                # pageLabel is null for some records; fall back to "".
                'label': page_label['string'] if page_label is not None else "",
                'categories': value['categories'],
                'links': value['links'],
                'external_links': value['externalLinks'],
            })

    # Passing columns explicitly keeps the schema stable even with no records.
    return pd.DataFrame(records, columns=columns)

In [113]:
# Load the four export parts; each becomes its own DataFrame.
part_paths = (
    "output_json/part0.json",
    "output_json/part1.json",
    "output_json/part2.json",
    "output_json/part3.json",
)
data0, data1, data2, data3 = map(load_data, part_paths)

----------
Let's count the **number of pages**

In [114]:
# One row per page, so count rows with len(). DataFrame.size is
# rows * columns and would overstate the total by a factor of six here.
len(data0) + len(data1) + len(data2) + len(data3)

354720

----------
Check whether there are any **duplicate** keys

In [115]:
# For each part, flag whether any key occurs more than once within that part.
r1, r2, r3, r4 = (
    part["key"].duplicated().any() for part in (data0, data1, data2, data3)
)
print(r1, r2, r3, r4)

False False False False


So there are **no** duplicate keys within any of the four parts
_______________________


Let's find the page with the highest number of categories

In [116]:
# Per part: index label of the row whose category list is the longest.
max_i0, max_i1, max_i2, max_i3 = (
    part["categories"].map(len).idxmax() for part in (data0, data1, data2, data3)
)

In [117]:
# Category count of the top row in each part. idxmax() yields index labels,
# so the rows are looked up with .loc rather than positional .iloc.
tuple(
    len(part.loc[top_label, "categories"])
    for part, top_label in (
        (data0, max_i0),
        (data1, max_i1),
        (data2, max_i2),
        (data3, max_i3),
    )
)

(62, 28, 29, 31)

In [118]:
# max_i0 is an index label (from idxmax), so use label-based .loc.
data0.loc[max_i0, "label"]

'Angličtina'

In [119]:
# max_i0 is an index label (from idxmax), so use label-based .loc.
data0.loc[max_i0, "key"]

'<http://sk.dbpedia.org/resource/Angličtina>'

In [120]:
# max_i0 is an index label (from idxmax), so use label-based .loc.
data0.loc[max_i0, "categories"]

['Jazyky_v_Antigue_a_Barbude',
 'Jazyky_v_Zimbabwe',
 'Jazyky_v_Zambii',
 'Jazyky_v_USA',
 'Jazyky_vo_Vanuatu',
 'Jazyky_v_Ugande',
 'Jazyky_v_Tuvalu',
 'Jazyky_v_Tonge',
 'Jazyky_v_Trinidade_a_Tobagu',
 'Jazyky_v_Tanzánii',
 'Jazyky_na_Šalamúnových_ostrovoch',
 'Jazyky_na_Svätom_Vincente_a_Grenadínach',
 'Jazyky_na_Svätom_Krištofe_a_Nevise',
 'Jazyky_na_Svätej_Lucii',
 'Jazyky_v_Svazijsku',
 'Jazyky_v_Sudáne',
 'Jazyky_v_Spojenom_kráľovstve',
 'Jazyky_v_Singapure',
 'Jazyky_v_Sierra_Leone',
 'Jazyky_na_Seychelách',
 'Jazyky_v_Samoe',
 'Jazyky_v_Rwande',
 'Jazyky_v_Papui-Novej_Guinei',
 'Jazyky_v_Palau',
 'Jazyky_v_Pakistane',
 'Jazyky_na_Novom_Zélande',
 'Jazyky_v_Nigérii',
 'Jazyky_v_Nauru',
 'Jazyky_v_Namíbii',
 'Jazyky_v_Mikronézskych_federatívnych_štátoch',
 'Jazyky_na_Mauríciu',
 'Jazyky_na_Marshallových_ostrovoch',
 'Jazyky_na_Malte',
 'Jazyky_v_Malawi',
 'Jazyky_v_Libérii',
 'Jazyky_v_Lesothe',
 'Jazyky_v_Kiribati',
 'Jazyky_v_Keni',
 'Jazyky_v_Kanade',
 'Jazyky_v_Kamerune',
 '

-----------
External links: find the page with the highest number of external links

In [121]:

# Per part: index label of the row whose external-link list is the longest.
max_i0, max_i1, max_i2, max_i3 = (
    part["external_links"].map(len).idxmax() for part in (data0, data1, data2, data3)
)

In [122]:
# External-link count of the top row in each part. idxmax() yields index
# labels, so the rows are looked up with .loc rather than positional .iloc.
tuple(
    len(part.loc[top_label, "external_links"])
    for part, top_label in (
        (data0, max_i0),
        (data1, max_i1),
        (data2, max_i2),
        (data3, max_i3),
    )
)

(154, 489, 493, 254)

In [123]:
# max_i2 is an index label (from idxmax), so use label-based .loc.
data2.loc[max_i2, "key"]

'<http://sk.dbpedia.org/resource/Zoznam_ostrovov_Írska>'

In [124]:
# max_i2 is an index label (from idxmax), so use label-based .loc.
data2.loc[max_i2, "external_links"]


 'http://www.logainm.ie/13005.aspx',
 'http://maps.osi.ie/publicviewer/#V1,503375,527650,5',
 'http://www.logainm.ie/13016.aspx',
 'http://maps.osi.ie/publicviewer/#V1,522139,531462,6',
 'http://www.logainm.ie/13263.aspx',
 'http://maps.osi.ie/publicviewer/#V1,488482,516379,6',
 'http://www.logainm.ie/1165639.aspx',
 'http://maps.osi.ie/publicviewer/#V1,501635,525166,4',
 'http://www.logainm.ie/543.aspx',
 'http://maps.osi.ie/publicviewer/#V1,496008,521691,4',
 'http://www.logainm.ie/20721.aspx',
 'http://www.ntnu.no/ub/formidl/utgivelser/til_opplysning/to_nr16.pdf',
 'http://www.irelandsislands.ie/',
 'http://irishislands.info/',
 'http://www.oileain.ie/en/',
 'http://www.cso.ie/census/documents/census2006_Volume%201%20-%20Table%206.pdf',
 'http://books.google.com/books?id=UZ1bAAAAQAAJ&pg=RA4-PA63',
 'http://books.google.com/books?id=UZ1bAAAAQAAJ&pg=RA6-PA1',
 'http://books.google.com/books?id=UZ1bAAAAQAAJ&pg=RA5-PA1',
 'http://books.google.com/books?id=UZ1bAAAAQAAJ&pg=RA4-PA1',
 'ht