In [2]:
import os
import re
import nltk
import tqdm
import jieba
import regex
import neologdn
import itertools
import pandas as pd
import polars as pl
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import ftlangdetect
from janome.tokenizer import Tokenizer
from pypinyin import lazy_pinyin
from pykakasi import kakasi

os.makedirs("../test_data/", exist_ok=True)

kks = kakasi()
ps =PorterStemmer()
tk = Tokenizer()

nltk.download('stopwords')
stopwords = ['at', 'based', 'in', 'of', 'for', 'on', 'and', 'to', 'an', 'using', 'with', 'the', 'by', 'we', 'be', 'is', 'are', 'can'] + stopwords.words('english')+ stopwords.words('spanish')+ stopwords.words('german')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [44]:
check_list = []
def clean_text(txt):
    if txt != None:
        puncs = '[!�“”"#$%&\'()【】（）／《》\・*+,-./–:;<=>?@[\\]^_`{|}~—～’、。]+'
        txt = txt.strip()
        txt = txt.lower()
        txt = txt.replace('\n', ' ')
        txt = txt.replace(u"\xa0", u" ")
        txt = txt.replace('\\',' ')
        txt = txt.replace('‐',' ')
        txt = re.sub(puncs, ' ', txt)
        txt = re.sub(r'\s{2,}', ' ', txt).strip()
        return txt
    else:
        return ''

def split_text(txt):
    if txt == 'null' or txt == '':
        return []
    else:
        txt = txt.split(' ')
        txt = [re.sub(r'\d+', '0', word) for word in txt]
        return txt

def split_list(lst):
    if lst == ['null']:
        return []
    else:
        lst = [re.sub(r'\d+', '0', word) for word in lst]
        return lst
    
def clean_authors(authors):
    cleaned_authors, names_list, orgs_list = [], [], []
    if len(authors) > 0:
        for author in authors:
            name = author.get('name')
            if name != '':
                name = clean_text(name)
                ptn = r'[\p{Block=Hiragana}\p{Script=Katakana}\p{Han}\p{Script_Extensions=Han}\u2E80-\u2FDF\u3005-\u3007\u3400-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF\U00020000-\U0002EBEF]+'
                re_ptn = regex.compile(ptn)
                # 漢字・ひらがな・カタカナが存在する場合 =>日本人or中国人？
                if re_ptn.search(name):
                    # 英語削除（漢字 + ピンイン表記の入力が多い・どちらもフルネーム）
                    re_txt = re_ptn.finditer(name)
                    temp = []
                    for t in re_txt:
                        temp.append(t.group())
                    name = ' '.join(temp)
                    check_list.append(name)
                    # jp or zh
                    lang = ftlangdetect.detect(name, low_memory=False)['lang']
                    if lang == 'zh':
                        name = name.removesuffix(' 著')
                        # 複数人の入力がある場合の対処
                        if len(name) > 4 and len(name.split(' ')) > 2:
                            tmp = []
                            for n in name.split(' '):
                                n = ' '.join(lazy_pinyin(n))
                                n = re.sub(r'\s{2,}', ' ', n).strip()
                                n = split_name(n)
                                tmp.extend(n)
                            name = tmp
                        else:            
                            name = ' '.join(lazy_pinyin(name))
                            name = re.sub(r'\s{2,}', ' ', name).strip()
                            name = split_name(name)
                    # jaは精度甘め
                    elif lang == 'ja':
                        if is_zh(name) or len(name) < 4:
                            name = ' '.join(lazy_pinyin(name))
                            name = re.sub(r'\s{2,}', ' ', name).strip()
                            name = split_name(name)
                        else:
                            # ローマ字表記変換
                            jps = []
                            result = kks.convert(name)
                            for w in result:
                                if w['hepburn'] != ' ':
                                    jps.append(w['hepburn'])
                            name_1 = ' '.join(jps)
                            name_1 = re.sub(r'\s{2,}', ' ', name_1).strip()
                            name = split_name(name_1)
                else:
                    name = split_name(name)
            else:
                name = ['NULL']
            names_list.extend(name)
            
            org = author.get('org')
            if org != '':
                org = clean_text(org)
                org = org.split(' ')
                org = [re.sub(r'\d+', '0', word) for word in org]
                if len(org) == 0:
                    org = ['NULL']
            else:
                org = ['NULL']
            orgs_list.extend(org)
            name = ','.join(list(set(name)))
            org = ' '.join(list(set(org)))
            cleaned_authors.append([name, org])
    return cleaned_authors, list(set(names_list)), list(set(orgs_list))

def clean_year(year):
    if year == 0 or year == '':
        return None
    else:
        return int(year)

def split_name(name):
    name = name.split(' ')
    name = [n for n in name if n.isdigit() != True]
    if len(name) == 2:
        name = [
            name[0] + ' ' + name[1],
            name[1] + ' ' + name[0],
        ]
        name.sort()
        name = name[0]
    elif len(name) > 2:
        name = [
            name[0] + ' ' + name[1] + ' ' + name[2],
            name[2] + ' ' +  name[1] + ' ' + name[0],
        ]
        name.sort()
        name = name[0]
    elif len(name) == 1 and name != ['']:
        name = name[0]
    else:
        name = 'NULL'
    return [name]

def clean_jp(lst):
    lst = [neologdn.normalize(word) for word in lst]
    lst = list(itertools.chain.from_iterable([list(tk.tokenize(txt, wakati=True)) for word in lst]))
    lst = [word for word in lst if word != '']
    return lst
    
def clean_zh(lst):
    lst = list(itertools.chain.from_iterable([jieba.lcut(word) for word in lst]))
    lst = [word for word in lst if word != '']
    return lst

def judge_lang(txt, lst):
    ptn = r'[\p{Block=Hiragana}\p{Script=Katakana}\p{Script_Extensions=Han}\u2E80-\u2FDF\u3005-\u3007\u3400-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF\U00020000-\U0002EBEF]+'
    re_ptn = regex.compile(ptn)
    if len(lst) == 0:
        lang = 'Nothing'
    else:
        re_txt = re_ptn.search(txt)
        if re_txt != None:
            lang = ftlangdetect.detect(re_txt.group(), low_memory=False)['lang']
            if lang == 'jp':
                lst = clean_jp(lst)
            elif lang == 'zh':
                lst = clean_zh(lst)
        else:
            lang = ftlangdetect.detect(txt, low_memory=False)['lang']
            if lang not in ['en','de','zh','fr','es','ru','it','Nothing']:
                lang = 'Other'
    return lst, lang
    
def is_zh(in_str):
    """
    >>> is_zh(u'おはよう')
    False
    >>> is_zh(u'&#35828;地')
    True
    """
    questions_before = [s for s in in_str]
    questions_gb2312 = [s for s in \
        in_str.encode('gb2312','ignore').decode('gb2312')]
    questions_cp932 = [s for s in \
        in_str.encode('cp932','ignore').decode('cp932')]
    if (questions_gb2312 == questions_before) and (
        (set(questions_before) - set(questions_cp932)) != set([])):
        return True
    else:
        return False

# pid_to_info_all.json

In [40]:
path = "../raw/pid_to_info_all.json"
df = pd.read_json(path)
df = df.T
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,id,title,authors,abstract,keywords,venue,year
0,6IsfnuWU,Probabilistic Skyline Operator over Sliding Wi...,"[{'name': 'Wenjie Zhang', 'org': 'UNSW Sydney'...",Skyline computation has many applications incl...,"[continuous skyline query, probabilistic skyli...",ICDE '09 Proceedings of the 2009 IEEE Internat...,2009
1,8B8GhlnI,Editorial: Knowledge-Driven Activity Recogniti...,"[{'name': 'Liming Chen', 'org': ''}, {'name': ...",,[activity recognition],Periodicals,2011
2,4dZKGwVR,Subscriber Assignment For Wide-Area Content-Ba...,"[{'name': 'Albert Yu', 'org': 'Duke Univ, Dept...",We study the problem of assigning subscribers ...,"[Monte Carlo approximation algorithm, future e...",ICDE '11 Proceedings of the 2011 IEEE 27th Int...,2011
3,V1JgT3OM,Tree-Based Mining for Discovering Patterns of ...,"[{'name': 'Zhiwen Yu', 'org': 'Northwestern Po...",AbstractDiscovering semantic knowledge is sign...,"[discovering patterns, interaction flow patter...",Periodicals,2012
4,HMvrPr2W,Protein Function Prediction using Multi-label ...,"[{'name': 'Guoxian Yu', 'org': 'Southwest Univ...",AbstractHigh-throughput experimental technique...,"[heterogeneous proteomic data sets, multilabel...",IEEE/ACM Transactions on Computational Biology...,2013


In [46]:
cleaned_data = []
for index, row in tqdm.tqdm(df.iterrows()):
    p_id = row['id']
    # 前処理&リスト化&日本語・中国語対応：title
    row['title'] = clean_text(row['title'])
    title = ' '.join(split_text(row['title']))

    #前処理&リスト化・著者数:authors
    authors,names,orgs = clean_authors(row['authors'])
    
    # 前処理&リスト化&日本語・中国語対応：abstract
    row['abstract'] = clean_text(row['abstract'])
    abstract = split_text(row['abstract'])
    abstract = ' '.join(split_text(row['abstract']))
    
    # 前処理&リスト化&日本語・中国語対応：keywords
    row['keywords'] = [clean_text(keyword) for keyword in row['keywords']]
    keywords = ','.join(row['keywords'])

    # 前処理&リスト化&日本語・中国語対応：venue
    row['venue'] = clean_text(row['venue'])
    venue = ' '.join(split_text(row['venue']))
    cleaned_data.append([
        row['id'],
        title,
        authors,
        abstract,
        keywords,
        venue,
    ])

317302it [03:43, 1417.28it/s]


In [47]:
df = pl.DataFrame(cleaned_data, schema=['id','title','authors','abstract','keywords','venue'])
df.write_parquet('../test_data/cleaned_pid_to_info_all_v6_light.parquet')
print(df.shape)
df.head()

(317302, 6)


id,title,authors,abstract,keywords,venue
str,str,list[list[str]],str,str,str
"""6IsfnuWU""","""probabilistic skyline operator…","[[""wenjie zhang"", ""unsw sydney""], [""lin xuemin"", ""unsw sydney""], … [""jeffrey xu yu"", ""of chinese hong kong university""]]","""skyline computation has many a…","""continuous skyline query,proba…","""icde 0 proceedings of the 0 ie…"
"""8B8GhlnI""","""editorial knowledge driven act…","[[""chen liming"", ""NULL""], [""chris nugent"", ""computing of school""], … [""yu zhiwen"", ""NULL""]]","""""","""activity recognition""","""periodicals"""
"""4dZKGwVR""","""subscriber assignment for wide…","[[""albert yu"", ""nc duke comp durham usa sci dept 0 univ""], [""agarwal k pankaj"", ""nc duke comp durham usa sci dept 0 univ""], [""jun yang"", ""nc duke comp durham usa sci dept 0 univ""]]","""we study the problem of assign…","""monte carlo approximation algo…","""icde 0 proceedings of the 0 ie…"
"""V1JgT3OM""","""tree based mining for discover…","[[""yu zhiwen"", ""northwestern an polytechnical university xi""], [""yu zhiyong"", ""fuzhou university""], … [""nakamura yuichi"", ""university kyoto""]]","""abstractdiscovering semantic k…","""discovering patterns,interacti…","""periodicals"""
"""HMvrPr2W""","""protein function prediction us…","[[""guoxian yu"", ""guangzhou of south technology and southwest china beibei university""], [""huzefa rangwala"", ""george fairfax university mason""], … [""yu zhiwen"", ""guangzhou of south technology china university""]]","""abstracthigh throughput experi…","""heterogeneous proteomic data s…","""ieee acm transactions on compu…"
