In [1]:
import re
import numpy as np
import pandas as pd
from unidecode import unidecode

## Person Data

In [2]:
# Read every line of the person-data dump and show how many lines it has.
# The file is opened in a `with` block so the handle is closed as soon as the
# lines have been read, instead of staying open for the life of the kernel.
with open('persondata_en.txt', "r", encoding='UTF-8') as person_file:
    person = person_file.readlines()
len(person)

6067248

In [3]:
# Drop the first and the last line of the dump.
# A relative slice is used instead of the hard-coded line count so the cell
# keeps working if the input file changes size.
person_data = person[1:-1]

In [4]:
# Extract the '/name>' fragments of each person-data line into relationship1.
# Patterns are compiled once, outside the loop.
person_url_token = re.compile('<http://.*?> ')
person_name_fragment = re.compile('/[^/#]*>')

relationship1 = []
for j in person_data:
    # Skip every line that contains the substring 'type'.
    # NOTE(review): this is a substring test on the whole line, so it also
    # skips lines where "type" merely appears inside a name -- confirm intent.
    if 'type' in j:
        continue
    # Transliterate non-ASCII characters to ASCII.
    j = unidecode(j)
    # Find every '<http://...> ' token; the last one is removed from the line.
    last = person_url_token.findall(j)
    if not last:
        # No URI token on this line, so there is nothing to extract
        # (indexing last[-1] would raise IndexError).
        continue
    j = j.replace(last[-1], '')
    # Lines containing '> "' (presumably a quoted literal value -- confirm)
    # are left out; for the rest, collect every '/name>' fragment.
    if '> \"' not in j:
        result = person_name_fragment.findall(j)
        relationship1.append(result)

In [5]:
# Load the extracted person fragments into a three-column DataFrame,
# print its row count and preview the first rows.
person_columns = ['start_name', 'relationship', 'end_name']
df2 = pd.DataFrame(data=relationship1, columns=person_columns)
print(len(df2))
df2.head()

826766


Unnamed: 0,start_name,relationship,end_name
0,/Ibrahim_Pasha_of_Egypt>,/birthPlace>,"/Drama,_Greece>"
1,/Ibrahim_Pasha_of_Egypt>,/birthPlace>,/Ottoman_Empire>
2,/Ibrahim_Pasha_of_Egypt>,/deathPlace>,/Cairo>
3,/Ibrahim_Pasha_of_Egypt>,/deathPlace>,/Egypt>
4,/Abdul_Qadeer_Khan>,/birthPlace>,/Bhopal>


In [6]:
# Strip the leftover URI punctuation and turn underscores into spaces.
# regex=True is passed explicitly: since pandas 2.0 Series.str.replace treats
# the pattern as a literal string by default, which would leave these
# character-class patterns unmatched.
df2['start_name'] = df2['start_name'].str.replace('[/>]+', '', regex=True)
df2['start_name'] = df2['start_name'].str.replace('_+', ' ', regex=True)

df2['relationship'] = df2['relationship'].str.replace('[/>#]+', '', regex=True)

df2['end_name'] = df2['end_name'].str.replace('[/>\"\"]+', '', regex=True)
df2['end_name'] = df2['end_name'].str.replace('_+', ' ', regex=True)
df2.head()

Unnamed: 0,start_name,relationship,end_name
0,Ibrahim Pasha of Egypt,birthPlace,"Drama, Greece"
1,Ibrahim Pasha of Egypt,birthPlace,Ottoman Empire
2,Ibrahim Pasha of Egypt,deathPlace,Cairo
3,Ibrahim Pasha of Egypt,deathPlace,Egypt
4,Abdul Qadeer Khan,birthPlace,Bhopal


In [7]:
# Check which end_name values came out empty or wrong after extraction.
def find_all_index(arr, item):
    """Return the zero-based positions in ``arr`` whose element equals ``item``.

    Positions are listed in iteration order; an empty list means no match.
    """
    matches = []
    for position, element in enumerate(arr):
        if element == item:
            matches.append(position)
    return matches
# List the row positions in df2 whose end_name is None.
find_all_index(df2['end_name'], None)

[575016, 629331, 709977]

In [8]:
# Remove rows with a missing field and print the row count before and after.
rows_before_dropna = len(df2)
df2 = df2.dropna()
print(rows_before_dropna)
print(len(df2))

826766
826763


## Mapping Based Object

In [9]:
# Read every line of the mapping-based-objects dump and show how many lines it has.
# The file is opened in a `with` block so the handle is closed as soon as the
# lines have been read.
with open('mappingbased_objects_en.txt', "r", encoding='UTF-8') as mapping_file:
    mapping = mapping_file.readlines()
len(mapping)

18295012

In [10]:
# Drop the first line and the last line of the dump.
# A relative slice is used instead of the hard-coded line count so the cell
# keeps working if the input file changes size.
mapping_data = mapping[1:-1]

In [47]:
# Remove the last URL column of each line, then regex-match the triple fragments.
mapping_url_token = re.compile('<http://.*?> ')
mapping_name_fragment = re.compile('/[^/]*>')

relationship2 = []
for j in mapping_data:
    # Lines mentioning seeAlso or differentFrom are excluded entirely.
    if 'seeAlso' in j or 'differentFrom' in j:
        continue
    # Transliterate non-ASCII characters to ASCII.
    j = unidecode(j)
    # Find every '<http://...> ' token; the last one is removed from the line.
    last = mapping_url_token.findall(j)
    if not last:
        # No URI token on this line, so there is nothing to extract
        # (indexing last[-1] would raise IndexError).
        continue
    j = j.replace(last[-1], '')
    # Collect the trailing '/name>' fragment of every remaining URI.
    result = mapping_name_fragment.findall(j)
    relationship2.append(result)

In [48]:
# Load the mapping-based fragments into a DataFrame; 'others' receives any
# fourth fragment the regex matched.
mapping_columns = ['start_name', 'relationship', 'end_name', 'others']
df3 = pd.DataFrame(data=relationship2, columns=mapping_columns)
df3

Unnamed: 0,start_name,relationship,end_name,others
0,/Actrius>,/director>,/Ventura_Pons>,
1,/Actrius>,/producer>,/Ventura_Pons>,
2,/Actrius>,/writer>,/Josep_Maria_Benet_i_Jornet>,
3,/Actrius>,/distributor>,/Walt_Disney_Studios_Motion_Pictures>,
4,/Actrius>,/country>,/Spain>,
...,...,...,...,...
18077913,/Miles_Byass>,/team>,/Seattle_Sounders_FC_U-23>,
18077914,/Miles_Byass>,/team>,/FF_Jaro>,
18077915,/Miles_Byass>,/team>,/MYPA>,
18077916,/Miles_Byass>,/team>,/JIPPO>,


In [49]:
# Strip the leftover URI punctuation and turn underscores into spaces.
# regex=True is passed explicitly: since pandas 2.0 Series.str.replace treats
# the pattern as a literal string by default, which would leave these
# character-class patterns unmatched.
df3['start_name'] = df3['start_name'].str.replace('[/>]+', '', regex=True)
df3['start_name'] = df3['start_name'].str.replace('_+', ' ', regex=True)

df3['end_name'] = df3['end_name'].str.replace('[/>]+', '', regex=True)
df3['end_name'] = df3['end_name'].str.replace('_+', ' ', regex=True)

df3['relationship'] = df3['relationship'].str.replace('[/>]+', '', regex=True)
df3['relationship'] = df3['relationship'].str.replace('_+', ' ', regex=True)

df3['others'] = df3['others'].str.replace('[/>\"\"]+', '', regex=True)
print(len(df3))
df3.head()

18077918


Unnamed: 0,start_name,relationship,end_name,others
0,Actrius,director,Ventura Pons,
1,Actrius,producer,Ventura Pons,
2,Actrius,writer,Josep Maria Benet i Jornet,
3,Actrius,distributor,Walt Disney Studios Motion Pictures,
4,Actrius,country,Spain,


In [50]:
# Show the distinct values that ended up in the 'others' column.
set(df3['others'])

{None, '_S__1'}

In [51]:
# Only two rows put anything into 'others', and those are superfluous URLs
# that can simply be discarded, so the whole column is dropped.
df3 = df3.drop(columns=['others'])
print(len(df3))

18077918


In [52]:
# The relationship column has no missing values.
def find_all_index(arr, item):
    """List every zero-based position at which ``arr`` holds a value equal to ``item``."""
    hits = []
    for idx, value in enumerate(arr):
        if not value == item:
            continue
        hits.append(idx)
    return hits
# List the row positions in df3 whose relationship is None.
find_all_index(df3['relationship'], None)

[]

In [53]:
# Locate the rows with a missing end_name to investigate why they are empty.
find_all_index(df3['end_name'], None)

[188352, 2725964, 3290850, 8642888, 16520384]

In [54]:
# Cause: the URL in the third column contains no '/', so nothing can be extracted from it.
# NOTE(review): df3 row positions are counted after the seeAlso/differentFrom
# lines were skipped, so they do not map one-to-one onto mapping_data indices --
# confirm this slice really shows one of the rows with a missing end_name.
mapping_data[ 3313229:  3313230]

['<http://dbpedia.org/resource/Arne_Skaug> <http://www.w3.org/2002/07/owl#differentFrom> <http://dbpedia.org/resource/Arne_Skauge> <http://en.wikipedia.org/wiki/Arne_Skaug?oldid=707169133#absolute-line=1&template=Distinguish&property=1&split=1&wikiTextSize=11&plainTextSize=11&valueSize=11> .\n']

In [55]:
# Drop the few rows that contain missing values and print the remaining row count.
df3 = df3.dropna()
print(len(df3))

18077913


## Ontology

In [23]:
# Read every line of the ontology dump and show how many lines it has.
# The file is opened in a `with` block so the handle is closed as soon as the
# lines have been read.
with open('dbpedia_2016-04.txt', "r", encoding='UTF-8') as onto_file:
    onto = onto_file.readlines()
len(onto)

30793

In [24]:
# Collect [subject, predicate, object] fragments for every ontology line that
# contains '#type>' or '#subClassOf>'.
relationship4 = []
for raw_line in onto:
    ascii_line = unidecode(raw_line)
    if '#type>' not in ascii_line and '#subClassOf>' not in ascii_line:
        continue
    subject_match = re.search('/[^/#]*>', ascii_line)
    if not subject_match:
        # No subject fragment found: nothing to record for this line.
        continue
    # The first fragment match is discarded; the subject comes from the
    # search above instead.
    remaining = re.findall('[/|#]+[^/#]+>', ascii_line)[1:]
    relationship4.append([subject_match.group(0)] + remaining)

In [25]:
# Load the ontology fragments into a three-column DataFrame.
df5 = pd.DataFrame(relationship4, columns = ['start_name','relationship','end_name'])

In [26]:
# Discard the first two parsed rows and display the result.
# NOTE(review): presumably the first two rows are not real class/property
# entries -- confirm why they are dropped.
# .loc[2:] selects by index label rather than by position, so re-running this
# cell does not strip two more rows each time; .copy() makes df5 an
# independent frame so later column assignments do not act on a slice.
df5 = df5.loc[2:].copy()
df5

Unnamed: 0,start_name,relationship,end_name
2,/BasketballLeague>,#type>,#Class>
3,/BasketballLeague>,#subClassOf>,/SportsLeague>
4,/NaturalEvent>,#type>,#Class>
5,/NaturalEvent>,#subClassOf>,/Event>
6,/Province>,#type>,#Class>
...,...,...,...
7467,/areaOfCatchment>,#type>,#DatatypeProperty>
7468,/dryCargo>,#type>,#DatatypeProperty>
7469,/stationEvaDuration>,#type>,#DatatypeProperty>
7470,/volume>,#type>,#DatatypeProperty>


In [27]:
# Strip the leftover URI punctuation ('/', '>', '#') from every column.
# regex=True is passed explicitly: since pandas 2.0 Series.str.replace treats
# the pattern as a literal string by default, which would leave these
# character-class patterns unmatched.
df5['start_name'] = df5['start_name'].str.replace('[/>#]+', '', regex=True)

df5['end_name'] = df5['end_name'].str.replace('[/>#]+', '', regex=True)

df5['relationship'] = df5['relationship'].str.replace('[/>#]+', '', regex=True)

print(len(df5))
df5.head()

7470


Unnamed: 0,start_name,relationship,end_name
2,BasketballLeague,type,Class
3,BasketballLeague,subClassOf,SportsLeague
4,NaturalEvent,type,Class
5,NaturalEvent,subClassOf,Event
6,Province,type,Class


## Instance_types

In [28]:
# Read every line of the instance-types dump and show how many lines it has.
# The file is opened in a `with` block so the handle is closed as soon as the
# lines have been read.
with open('instance_types_en.txt', "r", encoding='UTF-8') as types_file:
    types = types_file.readlines()
len(types)

5214242

In [29]:
# Keep list positions 1 through 5214239 of the instance-type lines.
# NOTE(review): with the 5214242 lines reported by the previous cell this drops
# the first line and the last TWO lines, whereas the other dumps drop only the
# first and last line -- confirm the extra trailing line is meant to go.
# NOTE(review): this rebinds `types`, so re-running the cell slices the
# already-trimmed list again and removes another leading line.
types = types[1:5214240]

In [30]:
# Extract [subject, predicate, object] fragments from each instance-type line.
types_url_token = re.compile('<http://.*?> ')

relationship5 = []
for j in types:
    # Transliterate non-ASCII characters to ASCII.
    j = unidecode(j)
    # Find every '<http://...> ' token and remove the last one from the line.
    last = types_url_token.findall(j)
    if not last:
        # No URI token on this line, so there is nothing to extract
        # (indexing last[-1] would raise IndexError).
        continue
    j = j.replace(last[-1], '')
    # The subject is the first '/name>' fragment; lines without one are skipped.
    result1 = re.search('/[^/]*>', j)
    if not result1:
        continue
    # The first match of the broader pattern is discarded because the subject
    # is taken from the search above.
    result2 = re.findall('[/|#]+[^/#]+>', j)[1:]
    relationship5.append([result1.group(0)] + result2)

In [31]:
# Load the instance-type fragments into a DataFrame; 'others' receives any
# fourth fragment the regex matched.
df6 = pd.DataFrame(relationship5, columns = ['start_name','relationship','end_name','others'])

In [32]:
# Strip the leftover URI punctuation and turn underscores in start_name into spaces.
# regex=True is passed explicitly: since pandas 2.0 Series.str.replace treats
# the pattern as a literal string by default, which would leave these
# character-class patterns unmatched.
df6['start_name'] = df6['start_name'].str.replace('[/>#]+', '', regex=True)
df6['start_name'] = df6['start_name'].str.replace('_+', ' ', regex=True)

df6['end_name'] = df6['end_name'].str.replace('[/>#]+', '', regex=True)

df6['relationship'] = df6['relationship'].str.replace('[/>#]+', '', regex=True)

print(len(df6))
df6.head()

5214239


Unnamed: 0,start_name,relationship,end_name,others
0,Achilles,type,Thing,
1,An American in Paris,type,Thing,
2,Actrius,type,Film,
3,Animalia (book),type,Book,
4,Agricultural science,type,Thing,


In [33]:
# Remove the 'others' column and preview the remaining three columns.
df6 = df6.drop(columns=['others'])
df6.head()

Unnamed: 0,start_name,relationship,end_name
0,Achilles,type,Thing
1,An American in Paris,type,Thing
2,Actrius,type,Film
3,Animalia (book),type,Book
4,Agricultural science,type,Thing


## Combination

In [34]:
# Recap before combining: row count and first rows of the person-data frame.
print(len(df2))
df2.head()

826763


Unnamed: 0,start_name,relationship,end_name
0,Ibrahim Pasha of Egypt,birthPlace,"Drama, Greece"
1,Ibrahim Pasha of Egypt,birthPlace,Ottoman Empire
2,Ibrahim Pasha of Egypt,deathPlace,Cairo
3,Ibrahim Pasha of Egypt,deathPlace,Egypt
4,Abdul Qadeer Khan,birthPlace,Bhopal


In [56]:
# Recap before combining: row count and first rows of the mapping-based frame.
print(len(df3))
df3.head()

18077913


Unnamed: 0,start_name,relationship,end_name
0,Actrius,director,Ventura Pons
1,Actrius,producer,Ventura Pons
2,Actrius,writer,Josep Maria Benet i Jornet
3,Actrius,distributor,Walt Disney Studios Motion Pictures
4,Actrius,country,Spain


In [36]:
# Recap before combining: row count and first rows of the ontology frame.
print(len(df5))
df5.head()

7470


Unnamed: 0,start_name,relationship,end_name
2,BasketballLeague,type,Class
3,BasketballLeague,subClassOf,SportsLeague
4,NaturalEvent,type,Class
5,NaturalEvent,subClassOf,Event
6,Province,type,Class


In [37]:
# Recap before combining: row count and first rows of the instance-type frame.
print(len(df6))
df6.head()

5214239


Unnamed: 0,start_name,relationship,end_name
0,Achilles,type,Thing
1,An American in Paris,type,Thing
2,Actrius,type,Film
3,Animalia (book),type,Book
4,Agricultural science,type,Thing


In [57]:
# Stack the four relationship frames (person data, mapping-based objects,
# ontology, instance types) row-wise into a single frame.
frames_to_stack = [df2, df3, df5, df6]
df7 = pd.concat(frames_to_stack, axis=0)
df7

Unnamed: 0,start_name,relationship,end_name
0,Ibrahim Pasha of Egypt,birthPlace,"Drama, Greece"
1,Ibrahim Pasha of Egypt,birthPlace,Ottoman Empire
2,Ibrahim Pasha of Egypt,deathPlace,Cairo
3,Ibrahim Pasha of Egypt,deathPlace,Egypt
4,Abdul Qadeer Khan,birthPlace,Bhopal
...,...,...,...
5214234,Miles Byass 5,type,CareerStation
5214235,Miles Byass 6,type,CareerStation
5214236,Miles Byass 7,type,CareerStation
5214237,Miles Byass 8,type,CareerStation


In [58]:
# Rename the three columns to the :START_ID / :TYPE / :END_ID headers.
header_names = {
    'start_name': ':START_ID',
    'relationship': ':TYPE',
    'end_name': ':END_ID',
}
df7 = df7.rename(columns=header_names)
df7

Unnamed: 0,:START_ID,:TYPE,:END_ID
0,Ibrahim Pasha of Egypt,birthPlace,"Drama, Greece"
1,Ibrahim Pasha of Egypt,birthPlace,Ottoman Empire
2,Ibrahim Pasha of Egypt,deathPlace,Cairo
3,Ibrahim Pasha of Egypt,deathPlace,Egypt
4,Abdul Qadeer Khan,birthPlace,Bhopal
...,...,...,...
5214234,Miles Byass 5,type,CareerStation
5214235,Miles Byass 6,type,CareerStation
5214236,Miles Byass 7,type,CareerStation
5214237,Miles Byass 8,type,CareerStation


In [59]:
# Remove rows with a missing field and print the row count before and after.
combined_rows_before = len(df7)
df7 = df7.dropna()
print(combined_rows_before)
print(len(df7))

24126385
24126381


In [60]:
# Count the rows whose :END_ID is the empty string.
len(find_all_index(df7[':END_ID'], ''))

182257

In [61]:
# Keep only the rows whose :END_ID is not the empty string.
has_end_id = df7[':END_ID'] != ''
dt999 = df7[has_end_id]

In [62]:
# Verify that no empty-string :END_ID remains after filtering.
find_all_index(dt999[':END_ID'], '')

[]

In [63]:
# Final number of relationship rows.
len(dt999)

23944124

In [64]:
# Write the combined relationships to relationship.csv.
# NOTE(review): by default to_csv also writes the DataFrame index as an unnamed
# first column, and the concatenated index contains repeated labels -- confirm
# whether the consumer of this file expects that column (otherwise pass index=False).
dt999.to_csv('relationship.csv')