In [1]:
import re
import numpy as np
import pandas as pd
from unidecode import unidecode

## Person Data

In [2]:
# 从txt中获取数据并查看长度
person = []
for line in open('persondata_en.txt', "r",encoding='UTF-8'):
    person.append(line)
len(person)

6067248

In [3]:
# 删除首尾行
person_data = person[1:6067247]

In [4]:
property1 = []
for j in person_data:
    # 不提取type行
    if 'type' in j:
        continue
    # 避免其他语言文字，全部换成英文
    j = unidecode(j)
    # 找到所有网址，删除最后一个
    last = re.findall('<http://.*?> ',j)
    j = j.replace(last[-1],'')
    # 正则提取三元组
    if '> \"' in j:
        result1 = re.findall('/[^/#]*>',j) #提取所有/>中除了/的字符
        result2 = re.findall('\".+\"',j) # 提取双引号中的内容
        property1.append(result1+result2)

In [5]:
# property放入dataframe
df1 = pd.DataFrame(property1, columns = ['node_name', 'property_name', 'property_content'])
print(len(df1))
df1.head()

4381695


Unnamed: 0,node_name,property_name,property_content
0,/Ibrahim_Pasha_of_Egypt>,/name>,"""Ibrahim Pasha"""
1,/Ibrahim_Pasha_of_Egypt>,/description>,"""Ottoman politician and general"""
2,/Ibrahim_Pasha_of_Egypt>,/birthDate>,"""1789"""
3,/Ibrahim_Pasha_of_Egypt>,/deathDate>,"""1848-11-10"""
4,/Poul_Kjaerholm>,/name>,"""Poul Kjaerholm"""


In [6]:
# 删除多余的符号
df1['node_name'] = df1['node_name'].str.replace('[/>]+', '')
df1['node_name'] = df1['node_name'].str.replace('_+', ' ')

df1['property_name'] = df1['property_name'].str.replace('[/>#]+', '')

df1['property_content'] = df1['property_content'].str.replace('[/>\"\"]+', '')
df1['property_content'] = df1['property_content'].str.replace('_+', ' ')
df1.head()

Unnamed: 0,node_name,property_name,property_content
0,Ibrahim Pasha of Egypt,name,Ibrahim Pasha
1,Ibrahim Pasha of Egypt,description,Ottoman politician and general
2,Ibrahim Pasha of Egypt,birthDate,1789
3,Ibrahim Pasha of Egypt,deathDate,1848-11-10
4,Poul Kjaerholm,name,Poul Kjaerholm


In [7]:
#查看df1中有哪些是空的
def find_all_index(arr,item):
    return [i for i,a in enumerate(arr) if a==item]
find_all_index(df1['property_content'], '')

[]

In [8]:
print(len(df1))
df1 = df1.dropna()
print(len(df1))

4381695
4381692


## Mapping Literals Object

In [9]:
# 从txt中获取数据并查看长度
literals = []
for line in open('mappingbased_literals_en.txt', "r",encoding='UTF-8'):
    literals.append(line)
len(literals)

16897943

In [10]:
literals = literals[1:16897942]

In [11]:
property2 = []
for j in literals:
    # 避免其他语言文字，全部换成英文
    j = unidecode(j)
    # 找到所有网址，删除最后一个
    last = re.findall('<http://.*?> ',j)
    if len(last) == 4:
        j = j.replace(last[-1],'')
        j = j.replace(last[-2],'')
    else:
        j = j.replace(last[-1],'')
    # 正则提取三元组
    if '> \"' in j:
        result1 = re.findall('/[^/]+>',j) #提取所有/>中除了/的字符
        result2 = re.findall('\".+\"',j) # 提取双引号中的内容
        property2.append(result1+result2)

In [12]:
# property放入dataframe
df2 = pd.DataFrame(property2, columns = ['node_name', 'property_name', 'property_content','others'])
print(len(df2))
df2.head()

16897941


Unnamed: 0,node_name,property_name,property_content,others
0,/Actrius>,/name>,"""Actresses""",
1,/Actrius>,/runtime>,"""6000.0""",
2,/Animalia_(book)>,/name>,"""Animalia""",
3,/Animalia_(book)>,/numberOfPages>,"""32""",
4,/Animalia_(book)>,/isbn>,"""0-810-91868-4""",


In [13]:
df2 = df2.drop('others', axis = 1)

In [14]:
df2

Unnamed: 0,node_name,property_name,property_content
0,/Actrius>,/name>,"""Actresses"""
1,/Actrius>,/runtime>,"""6000.0"""
2,/Animalia_(book)>,/name>,"""Animalia"""
3,/Animalia_(book)>,/numberOfPages>,"""32"""
4,/Animalia_(book)>,/isbn>,"""0-810-91868-4"""
...,...,...,...
16897936,/Miles_Byass__8>,/numberOfGoals>,"""4"""
16897937,/Miles_Byass__9>,/years>,"""2009"""
16897938,/Miles_Byass>,/name>,"""Byass, Miles"""
16897939,/Miles_Byass>,/birthYear>,"""1991"""


In [15]:
# 删除多余的符号
df2['node_name'] = df2['node_name'].str.replace('[/>]+', '')
df2['node_name'] = df2['node_name'].str.replace('[_\|]+', ' ')
df2['node_name'] = df2['node_name'].str.replace('%22', '')

df2['property_name'] = df2['property_name'].str.replace('[/>]+', '')
df2['property_name'] = df2['property_name'].str.replace('[#]+', ' ')

df2['property_content'] = df2['property_content'].str.replace('[/>\"\"]+', '')
df2['property_content'] = df2['property_content'].str.replace('[_\|]+', ' ')
df2.head()

Unnamed: 0,node_name,property_name,property_content
0,Actrius,name,Actresses
1,Actrius,runtime,6000.0
2,Animalia (book),name,Animalia
3,Animalia (book),numberOfPages,32
4,Animalia (book),isbn,0-810-91868-4


In [16]:
len(find_all_index(df2['property_content'], None))

105

In [17]:
print(len(df2))
df2 = df2.dropna()
print(len(df2))

16897941
16897836


In [18]:
find_all_index(df2['node_name'], '')

[]

## Combination

In [59]:
# 合并以上两个dataframe
# 去重
# 转换格式{"":""}
# 作为字典
# 导入entit_new.csv
# 加一列property

In [19]:
df3 = pd.concat([df1,df2], axis=0)

In [20]:
df3

Unnamed: 0,node_name,property_name,property_content
0,Ibrahim Pasha of Egypt,name,Ibrahim Pasha
1,Ibrahim Pasha of Egypt,description,Ottoman politician and general
2,Ibrahim Pasha of Egypt,birthDate,1789
3,Ibrahim Pasha of Egypt,deathDate,1848-11-10
4,Poul Kjaerholm,name,Poul Kjaerholm
...,...,...,...
16897936,Miles Byass 8,numberOfGoals,4
16897937,Miles Byass 9,years,2009
16897938,Miles Byass,name,"Byass, Miles"
16897939,Miles Byass,birthYear,1991


In [21]:
df3 = df3.drop_duplicates()
df3

Unnamed: 0,node_name,property_name,property_content
0,Ibrahim Pasha of Egypt,name,Ibrahim Pasha
1,Ibrahim Pasha of Egypt,description,Ottoman politician and general
2,Ibrahim Pasha of Egypt,birthDate,1789
3,Ibrahim Pasha of Egypt,deathDate,1848-11-10
4,Poul Kjaerholm,name,Poul Kjaerholm
...,...,...,...
16897935,Miles Byass 8,numberOfMatches,22
16897936,Miles Byass 8,numberOfGoals,4
16897937,Miles Byass 9,years,2009
16897938,Miles Byass,name,"Byass, Miles"


In [22]:
df3['property'] = '\''+df3['property_name']+'\': '+'\''+df3['property_content']+'\''
df_test = df3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [23]:
df_test = df_test.drop('property_name', axis = 1)
df_test = df_test.drop('property_content', axis = 1)
df_test

Unnamed: 0,node_name,property
0,Ibrahim Pasha of Egypt,'name': 'Ibrahim Pasha'
1,Ibrahim Pasha of Egypt,'description': 'Ottoman politician and general'
2,Ibrahim Pasha of Egypt,'birthDate': '1789'
3,Ibrahim Pasha of Egypt,'deathDate': '1848-11-10'
4,Poul Kjaerholm,'name': 'Poul Kjaerholm'
...,...,...
16897935,Miles Byass 8,'numberOfMatches': '22'
16897936,Miles Byass 8,'numberOfGoals': '4'
16897937,Miles Byass 9,'years': '2009'
16897938,Miles Byass,"'name': 'Byass, Miles'"


In [24]:
def ab(df):
    return','.join(df.values)
    
df_test_1 = df_test.groupby(['node_name'])['property'].apply(ab)
df_test_1

node_name
 '12' Four 1                   'name': 'Triumph Gloria('12'  '12') Four','con...
 '73.                                         'name': 'Najveci uspjesi '68.'73.'
 (Santigold album)             'name': '99C','releaseDate': '2016-02-26','run...
 1                             'filename': 'Mary-marry-merry.ogg','title': 'M...
 10                                                             'type': 'speech'
                                                     ...                        
x Sophrolaeliocattleya                          'name': 'x Sophrolaeliocattleya'
x Sorbaronia mitschurinii                    'name': 'x Sorbaronia mitschurinii'
x Trichocidium                                          'name': 'x Trichocidium'
zidovudine                     'casNumber': '364057-50-1','atcPrefix': 'J05',...
zinc superoxide dismutase 1                                   'symbol': 'Sod Cu'
Name: property, Length: 4693764, dtype: object

In [26]:
df_test2 = pd.DataFrame(df_test_1)

In [27]:
df_test2

Unnamed: 0_level_0,property
node_name,Unnamed: 1_level_1
'12' Four 1,"'name': 'Triumph Gloria('12' '12') Four','con..."
'73.,'name': 'Najveci uspjesi '68.'73.'
(Santigold album),"'name': '99C','releaseDate': '2016-02-26','run..."
1,"'filename': 'Mary-marry-merry.ogg','title': 'M..."
10,'type': 'speech'
...,...
x Sophrolaeliocattleya,'name': 'x Sophrolaeliocattleya'
x Sorbaronia mitschurinii,'name': 'x Sorbaronia mitschurinii'
x Trichocidium,'name': 'x Trichocidium'
zidovudine,"'casNumber': '364057-50-1','atcPrefix': 'J05',..."


In [28]:
df_test_3 = pd.DataFrame(zip(df_test2.index,df_test2['property']),columns = ['name','property'])
df_test_3

Unnamed: 0,name,property
0,'12' Four 1,"'name': 'Triumph Gloria('12' '12') Four','con..."
1,'73.,'name': 'Najveci uspjesi '68.'73.'
2,(Santigold album),"'name': '99C','releaseDate': '2016-02-26','run..."
3,1,"'filename': 'Mary-marry-merry.ogg','title': 'M..."
4,10,'type': 'speech'
...,...,...
4693759,x Sophrolaeliocattleya,'name': 'x Sophrolaeliocattleya'
4693760,x Sorbaronia mitschurinii,'name': 'x Sorbaronia mitschurinii'
4693761,x Trichocidium,'name': 'x Trichocidium'
4693762,zidovudine,"'casNumber': '364057-50-1','atcPrefix': 'J05',..."


In [32]:
df_test_3['property'] = df_test_3['property'].str.replace('[\']+', '')

In [34]:
df_test_3['property'] = df_test_3['property'].str.replace(',', ', ')
df_test_3['property'] = df_test_3['property'].str.replace('\\', '')
df_test_3

Unnamed: 0,name,property
0,'12' Four 1,"name: Triumph Gloria(12 12) Four, configurat..."
1,'73.,name: Najveci uspjesi 68.73.
2,(Santigold album),"name: 99C, releaseDate: 2016-02-26, runtime:..."
3,1,"filename: Mary-marry-merry.ogg, title: Mary-m..."
4,10,type: speech
...,...,...
4693759,x Sophrolaeliocattleya,name: x Sophrolaeliocattleya
4693760,x Sorbaronia mitschurinii,name: x Sorbaronia mitschurinii
4693761,x Trichocidium,name: x Trichocidium
4693762,zidovudine,"casNumber: 364057-50-1, atcPrefix: J05, atcS..."


In [35]:
pro_dict = df_test_3.set_index('name')['property'].to_dict()

In [36]:
entity = pd.read_csv('entity.csv',encoding = 'utf-8')
entity

Unnamed: 0.1,Unnamed: 0,entityName,entityId:ID
0,1,"Iraqi parliamentary election, September 1954",0
1,2,New Road Team,1
2,3,Here Comes Trouble (Bad Company album),2
3,4,Canna glauca,3
4,5,Povla Frijsh,4
...,...,...,...
5822928,5822929,German submarine U-133 (1941),5822928
5822929,5822930,Rengarajan Jaiprakash,5822929
5822930,5822931,Jean d'Eaubonne,5822930
5822931,5822932,"Jack Price (footballer, born 1877) 2",5822931


In [37]:
entity = entity.drop('Unnamed: 0', axis = 1)
entity['entityProperty'] = entity['entityName']
entity['entityProperty'] = entity['entityProperty'].map(pro_dict)

In [38]:
entity

Unnamed: 0,entityName,entityId:ID,entityProperty
0,"Iraqi parliamentary election, September 1954",0,"title: Iraqi parliamentary election, Septemb..."
1,New Road Team,1,"name: New Road, name: New Road Team, nick: N..."
2,Here Comes Trouble (Bad Company album),2,"name: Here Comes Trouble, releaseDate: 1992-0..."
3,Canna glauca,3,"synonym: * Canna angustifolia (L.), synonym: ..."
4,Povla Frijsh,4,"name: Povla Frijsh, surname: Frijsh, givenNa..."
...,...,...,...
5822928,German submarine U-133 (1941),5822928,"name: U-133, orderDate: 1939-08-07, layingDo..."
5822929,Rengarajan Jaiprakash,5822929,"name: Jaiprakash Rengarajan, description: Ind..."
5822930,Jean d'Eaubonne,5822930,"name: Jean d Eaubonne, surname: Eaubonne, gi..."
5822931,"Jack Price (footballer, born 1877) 2",5822931,"years: 1897, numberOfMatches: 1, numberOfGoa..."


In [42]:
# 一共5866138entity
# 有1175037 没有匹配到property
entity = entity[entity['entityProperty'].notnull()]

In [43]:
entity.to_csv('entity.csv')