In [1]:
import pandas as pd
import numpy as np
import requests

In [2]:
# Load the Chinese noun-class dictionary.
# Each non-blank line of dim_noun.txt is split on whitespace into exactly three
# fields: the class name, the Chinese keywords and the English keywords. The
# keyword fields have every '/' removed and are then split on ','.
dim_noun_list = []
# NOTE(review): assumes the file is saved as UTF-8 -- adjust `encoding` if it
# was written with a different codec.
with open('./dim_noun.txt', encoding='utf-8') as dim_noun_file:
    for line in dim_noun_file:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which would otherwise break the three-way unpack below.
            continue
        wd, keys_zh, keys_en = line.split()
        dim_noun_list.append({
            'class': wd,
            'keys_zh': keys_zh.replace('/', '').split(','),
            'keys_en': keys_en.replace('/', '').split(','),
        })

dim_noun_pd = pd.DataFrame(dim_noun_list)
dim_noun_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23 entries, 0 to 22
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   class    23 non-null     object
 1   keys_zh  23 non-null     object
 2   keys_en  23 non-null     object
dtypes: object(3)
memory usage: 680.0+ bytes


In [3]:
# Preview the first five noun classes together with their keyword lists.
dim_noun_pd.head(n=5)

Unnamed: 0,class,keys_zh,keys_en
0,病症,"[病症, 病, 症狀]","[illness, symptom]"
1,電子產品,[電子產品],[electronic_device]
2,動物,[動物],[animal]
3,服裝,"[服裝, 衣服]",[clothes]
4,葷肉,"[葷, 肉類]",[meat]


In [22]:
# Fields read from the start node of each ConceptNet edge, for example:
#   "label": "桌子",
#   "language": "zh",
#   "term": "/c/zh/桌子"
# Layout of the response: {edge -> [weight, start: [label, language, term]]}

In [4]:
def get_nodes(end_word, lang, rel, limitation=1000, timeout=60):
    """Fetch ConceptNet edges ending at a concept and tabulate their start nodes.

    Sends a GET request to ``https://api.conceptnet.io/query`` asking for edges
    whose end node is ``/c/<lang>/<end_word>`` and whose relation is ``rel``.

    Args:
        end_word: Concept text used as the end of the edge (e.g. '動物').
        lang: Language code placed in the end concept URI (e.g. 'zh').
        rel: Relation URI (e.g. '/r/IsA').
        limitation: Value sent as the ``limit`` query parameter.
        timeout: Seconds to wait for the server before ``requests`` raises,
            so a stalled connection cannot hang the notebook indefinitely.

    Returns:
        pandas.DataFrame with one row per returned edge and the columns
        ``key`` (the ``end_word`` argument), ``relation`` (the ``rel``
        argument), ``example`` (start label), ``language`` (start language),
        ``concat`` (start term) and ``weight``. When the response contains no
        edges, an empty DataFrame without columns is returned.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the JSON payload lacks the expected fields.
    """
    res_pattern = 'https://api.conceptnet.io/query?end=/c/{}/{}&rel={}&limit={}'
    response = requests.get(
        res_pattern.format(lang, end_word, rel, limitation),
        timeout=timeout,
    )
    # Surface HTTP failures here instead of failing later on an error payload.
    response.raise_for_status()
    edges = response.json()['edges']

    node_list = []
    for edge in edges:
        start = edge['start']
        node_list.append({
            'key': end_word,
            'relation': rel,
            'example': start['label'],
            # Language reported for the start node; kept separate from the
            # `lang` argument, which describes the end concept.
            'language': start['language'],
            'concat': start['term'],
            'weight': edge['weight'],
        })

    return pd.DataFrame(node_list)

In [7]:
# Smoke test: list the Chinese concepts that ConceptNet links to '動物' via /r/IsA.
get_nodes(end_word='動物', lang='zh', rel='/r/IsA', limitation=1000)

Unnamed: 0,key,relation,example,language,concat,weight
0,動物,/r/IsA,人,zh,/c/zh/人,15.362291
1,動物,/r/IsA,貓,zh,/c/zh/貓,15.099669
2,動物,/r/IsA,狗,zh,/c/zh/狗,14.696938
3,動物,/r/IsA,小雞,zh,/c/zh/小雞,12.000000
4,動物,/r/IsA,貓咪,zh,/c/zh/貓咪,10.000000
...,...,...,...,...,...,...
108,動物,/r/IsA,無尾熊,zh,/c/zh/無尾熊,1.000000
109,動物,/r/IsA,女性,zh,/c/zh/女性,1.000000
110,動物,/r/IsA,黃金鼠,zh,/c/zh/黃金鼠,1.000000
111,動物,/r/IsA,你,zh,/c/zh/你,1.000000


In [8]:
# Collect the ConceptNet start nodes for every keyword of every noun class.
rel = '/r/IsA'
limitation = 1000


def _fetch_class_frames(keys, lang, class_name):
    """Query each non-empty keyword and return its non-empty results tagged with the class."""
    frames = []
    for key in keys:
        if not key:
            # Skip a blank keyword (e.g. produced by a stray comma) without
            # discarding the keywords that follow it.
            continue
        nodes = get_nodes(key, lang, rel, limitation=limitation)
        if not nodes.empty:
            frames.append(nodes.assign(**{'class': class_name}))
    return frames


zh_frames = []
en_frames = []
for line in dim_noun_list:
    zh_frames.extend(_fetch_class_frames(line['keys_zh'], 'zh', line['class']))
    en_frames.extend(_fetch_class_frames(line['keys_en'], 'en', line['class']))

# Concatenate once at the end instead of growing a DataFrame inside the loops.
# pd.concat raises on an empty list, so fall back to an empty frame.
zh_class_pd = pd.concat(zh_frames, axis=0) if zh_frames else pd.DataFrame()
en_class_pd = pd.concat(en_frames, axis=0) if en_frames else pd.DataFrame()

In [9]:
# Keep only the rows whose start node is English, then renumber the rows from 0.
is_english = en_class_pd['language'].eq('en')
en_class_pd = en_class_pd.loc[is_english].reset_index(drop=True)
en_class_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5801 entries, 0 to 5800
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   key       5801 non-null   object 
 1   relation  5801 non-null   object 
 2   example   5801 non-null   object 
 3   language  5801 non-null   object 
 4   concat    5801 non-null   object 
 5   weight    5801 non-null   float64
 6   class     5801 non-null   object 
dtypes: float64(1), object(6)
memory usage: 317.4+ KB


In [10]:
# Keep only the rows whose start node is Chinese, then renumber the rows from 0.
is_chinese = zh_class_pd['language'].eq('zh')
zh_class_pd = zh_class_pd.loc[is_chinese].reset_index(drop=True)
zh_class_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1216 entries, 0 to 1215
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   key       1216 non-null   object 
 1   relation  1216 non-null   object 
 2   example   1216 non-null   object 
 3   language  1216 non-null   object 
 4   concat    1216 non-null   object 
 5   weight    1216 non-null   float64
 6   class     1216 non-null   object 
dtypes: float64(1), object(6)
memory usage: 66.6+ KB


In [11]:
# De-duplicate the Chinese rows: keep the first occurrence of each
# (class, example) pair.
zh_dedup_columns = ['class', 'example']
zh_class_uniq_pd = zh_class_pd.drop_duplicates(subset=zh_dedup_columns, keep='first')
zh_class_uniq_pd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1147 entries, 0 to 1214
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   key       1147 non-null   object 
 1   relation  1147 non-null   object 
 2   example   1147 non-null   object 
 3   language  1147 non-null   object 
 4   concat    1147 non-null   object 
 5   weight    1147 non-null   float64
 6   class     1147 non-null   object 
dtypes: float64(1), object(6)
memory usage: 71.7+ KB


In [12]:
# De-duplicate the English rows: keep the first occurrence of each
# (class, example) pair.
en_is_repeat = en_class_pd.duplicated(subset=['class', 'example'], keep='first')
en_class_uniq_pd = en_class_pd.loc[~en_is_repeat]
en_class_uniq_pd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5496 entries, 0 to 5797
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   key       5496 non-null   object 
 1   relation  5496 non-null   object 
 2   example   5496 non-null   object 
 3   language  5496 non-null   object 
 4   concat    5496 non-null   object 
 5   weight    5496 non-null   float64
 6   class     5496 non-null   object 
dtypes: float64(1), object(6)
memory usage: 343.5+ KB


In [62]:
!pip install xlsxwriter
import xlsxwriter

zh_class_uniq_pd.to_excel('./zh_class_noun.xlsx', engine='xlsxwriter')
en_class_uniq_pd.to_excel('./en_class_noun.xlsx', engine='xlsxwriter')

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
