In [9]:
import os
import re
import nltk
import tqdm
import jieba
import neologdn
import itertools
import pandas as pd

import polars as pl

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import ftlangdetect
from janome.tokenizer import Tokenizer
from pypinyin import lazy_pinyin
from pykakasi import kakasi
from gensim.models import word2vec
from gensim.models.callbacks import CallbackAny2Vec

kks = kakasi()
import regex
ps =PorterStemmer()
tk = Tokenizer()

os.makedirs("../test_data/", exist_ok=True)
nltk.download('stopwords')
stopwords = ['at', 'based', 'in', 'of', 'for', 'on', 'and', 'to', 'an', 'using', 'with', 'the', 'by', 'we', 'be', 'is', 'are', 'can'] + stopwords.words('english')+ stopwords.words('spanish')+ stopwords.words('german')
stopwords_extend = ['university', 'univ', 'china', 'department', 'dept', 'laboratory', 'lab',
                    'school', 'al', 'et', 'institute', 'inst', 'college', 'chinese', 'beijing',
                    'journal', 'science', 'international', 'key', 'sciences', 'research',
                    'academy', 'state', 'center','key','univers','scienc','depart','institut','laboratori']

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
check_list = []
def clean_text(txt):
    if txt != None:
        puncs = '[!�“”"#$%&\'()【】（）／《》\・*+,-./–:;<=>?@[\\]^_`{|}~—～’、。]+'
        txt = txt.strip()
        txt = txt.lower()
        txt = txt.replace('\n', ' ')
        txt = txt.replace(u"\xa0", u" ")
        txt = txt.replace('\\',' ')
        txt = txt.replace('‐',' ')
        txt = re.sub(puncs, ' ', txt)
        txt = re.sub(r'\s{2,}', ' ', txt).strip()
        return txt
    else:
        return ''

def split_text(txt):
    if txt == 'null' or txt == '':
        return []
    else:
        txt = txt.split(' ')
        txt = [word for word in txt if word not in stopwords]
        txt = [word for word in txt if word.isdigit() != True]
        txt = [re.sub(r'\d+', '0', word) for word in txt]
        txt = [ps.stem(word) for word in txt]
        txt = [word for word in txt if word != '' and len(word) > 1]
        return txt

def split_list(lst):
    if lst == ['null']:
        return []
    else:
        lst = [word for word in lst if word not in stopwords]
        lst = [word for word in lst if word.isdigit() != True]
        lst = [re.sub(r'\d+', '0', word) for word in lst]
        lst = [ps.stem(word) for word in lst]
        lst = [word for word in lst if word != '' and len(word) > 1]
        return lst
    
def clean_authors(authors):
    cleaned_authors, names_list, orgs_list = [], [], []
    if len(authors) > 0:
        for author in authors:
            name = author.get('name')
            if name != '':
                name = clean_text(name)
                ptn = r'[\p{Block=Hiragana}\p{Script=Katakana}\p{Han}\p{Script_Extensions=Han}\u2E80-\u2FDF\u3005-\u3007\u3400-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF\U00020000-\U0002EBEF]+'
                re_ptn = regex.compile(ptn)
                # 漢字・ひらがな・カタカナが存在する場合 =>日本人or中国人？
                if re_ptn.search(name):
                    # 英語削除（漢字 + ピンイン表記の入力が多い・どちらもフルネーム）
                    re_txt = re_ptn.finditer(name)
                    temp = []
                    for t in re_txt:
                        temp.append(t.group())
                    name = ' '.join(temp)
                    check_list.append(name)
                    # jp or zh
                    lang = ftlangdetect.detect(name, low_memory=False)['lang']
                    if lang == 'zh':
                        name = name.removesuffix(' 著')
                        # 複数人の入力がある場合の対処
                        if len(name) > 4 and len(name.split(' ')) > 2:
                            tmp = []
                            for n in name.split(' '):
                                n = ' '.join(lazy_pinyin(n))
                                n = re.sub(r'\s{2,}', ' ', n).strip()
                                n = split_name(n)
                                tmp.extend(n)
                            name = tmp
                        else:            
                            name = ' '.join(lazy_pinyin(name))
                            name = re.sub(r'\s{2,}', ' ', name).strip()
                            name = split_name(name)
                    # jaは精度甘め
                    elif lang == 'ja':
                        if is_zh(name) or len(name) < 4:
                            name = ' '.join(lazy_pinyin(name))
                            name = re.sub(r'\s{2,}', ' ', name).strip()
                            name = split_name(name)
                        else:
                            # ローマ字表記変換
                            jps = []
                            result = kks.convert(name)
                            for w in result:
                                if w['hepburn'] != ' ':
                                    jps.append(w['hepburn'])
                            name_1 = ' '.join(jps)
                            name_1 = re.sub(r'\s{2,}', ' ', name_1).strip()
                            name = split_name(name_1)
                else:
                    name = split_name(name)
            else:
                name = ['NULL']
            names_list.extend(name)
            
            org = author.get('org')
            if org != '':
                org = clean_text(org)
                org = org.split(' ')
                org = [word for word in org if word not in stopwords]
                org = [word for word in org if word not in stopwords_extend]
                org = [word for word in org if word.isdigit() != True]
                org = [re.sub(r'\d+', '0', word) for word in org]
                org = [ps.stem(word) for word in org]
                org = [word for word in org if word != '' and len(word) > 1]
                if len(org) == 0:
                    org = ['NULL']
            else:
                org = ['NULL']
            orgs_list.extend(org)
            name = ','.join(list(set(name)))
            org = ','.join(list(set(org)))
            cleaned_authors.append([name, org])
    return cleaned_authors, list(set(names_list)), list(set(orgs_list))

def clean_year(year):
    if year == 0 or year == '':
        return None
    else:
        return int(year)

def split_name(name):
    name = name.split(' ')
    name = [n for n in name if n.isdigit() != True]
    if len(name) == 2:
        name = [
            name[0] + name[1],
            # name[0][0] + '_' + name[1],
            name[1] + name[0],
            # name[1][0] + '_' + name[0]
        ]
        name.sort()
        name = name[0]
    elif len(name) > 2:
        name = [
            name[0] + name[1] + name[2],
            # name[0][0] + '_' + name[1][0] + '_' + name[2],
            name[2] + name[1] + name[0],
            # name[2][0] + '_' + name[1][0] + '_' + name[0],
            # name[0][0] + '_' + name[1][0] + name[2],
            # name[2][0] + '_' + name[1][0] + name[0],
            # name[0] + '_'  + name[2],
            # name[0][0] + '_'  + name[2],
            # name[2] + '_' + name[0],
            # name[2][0] + '_' + name[0],
        ]
        name.sort()
        name = name[0]
    elif len(name) == 1 and name != ['']:
        name = name[0]
    else:
        name = 'NULL'
    return [name]
    
def clean_jp(lst):
    lst = [neologdn.normalize(word) for word in lst]
    lst = list(itertools.chain.from_iterable([list(tk.tokenize(txt, wakati=True)) for word in lst]))
    lst = [word for word in lst if word != '']
    return lst
    
def clean_zh(lst):
    lst = list(itertools.chain.from_iterable([jieba.lcut(word) for word in lst]))
    lst = [word for word in lst if word != '']
    return lst

def judge_lang(txt, lst):
    ptn = r'[\p{Block=Hiragana}\p{Script=Katakana}\p{Script_Extensions=Han}\u2E80-\u2FDF\u3005-\u3007\u3400-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF\U00020000-\U0002EBEF]+'
    re_ptn = regex.compile(ptn)
    if len(lst) == 0:
        lang = 'Nothing'
    else:
        re_txt = re_ptn.search(txt)
        if re_txt != None:
            lang = ftlangdetect.detect(re_txt.group(), low_memory=False)['lang']
            if lang == 'jp':
                lst = clean_jp(lst)
            elif lang == 'zh':
                lst = clean_zh(lst)
        else:
            lang = ftlangdetect.detect(txt, low_memory=False)['lang']
            if lang not in ['en','de','zh','fr','es','ru','it','Nothing']:
                lang = 'Other'
    return lst, lang
    
def is_zh(in_str):
    """
    >>> is_zh(u'おはよう')
    False
    >>> is_zh(u'&#35828;地')
    True
    """
    questions_before = [s for s in in_str]
    questions_gb2312 = [s for s in \
        in_str.encode('gb2312','ignore').decode('gb2312')]
    questions_cp932 = [s for s in \
        in_str.encode('cp932','ignore').decode('cp932')]
    if (questions_gb2312 == questions_before) and (
        (set(questions_before) - set(questions_cp932)) != set([])):
        return True
    else:
        return False

# pid_to_info_all.json

In [11]:
path = "../raw/pid_to_info_all.json"
df = pd.read_json(path)
df = df.T
df = df.reset_index(drop=True)
df.head()

In [88]:
cleaned_data = []
for index, row in tqdm.tqdm(df.iterrows()):
    p_id = row['id']
    # 前処理&リスト化&日本語・中国語対応：title
    row['title'] = clean_text(row['title'])
    title = split_text(row['title'])
    title, t_lang = judge_lang(row['title'], title)
    
    # 前処理&リスト化&日本語・中国語対応：abstract
    row['abstract'] = clean_text(row['abstract'])
    abstract = split_text(row['abstract'])
    abstract, a_lang = judge_lang(row['abstract'], abstract) 
    
    # 前処理&リスト化&日本語・中国語対応：keywords
    row['keywords'] = list(itertools.chain.from_iterable(
        [clean_text(key).strip().split(" ") for key in row['keywords']]))
    keywords = split_list(row['keywords'])
    keywords, k_lang = judge_lang(' '.join(row['keywords']), keywords)
    
    # 前処理&リスト化&日本語・中国語対応：venue
    row['venue'] = clean_text(row['venue'])
    venue = split_text(row['venue'])
    venue, v_lang = judge_lang(row['venue'], venue)

    #前処理&リスト化・著者数:authors
    authors,names,orgs = clean_authors(row['authors'])
    num_authors = len(list(itertools.chain.from_iterable([a[0] for a in authors if a[0] != ''])))
    #前処理:year
    year = clean_year(row['year'])
    
    cleaned_data.append([
        row['id'],
        title,
        authors,
        abstract,
        keywords,
        venue,
        year,
        names,
        orgs,
        num_authors,
        t_lang,
        a_lang,
        k_lang,
        v_lang
    ])

317302it [34:54, 151.49it/s]


In [89]:
df_fix = pl.DataFrame(cleaned_data, schema=['id','title','authors','abstract','keywords','venue','year','names','orgs','num_authors','t_lang','a_lang','k_lang', 'v_lang'])
df_fix.write_parquet('../test_data/cleaned_pid_to_info_all_v6.parquet')
df_fix.head()

id,title,authors,abstract,keywords,venue,year,names,orgs,num_authors,t_lang,a_lang,k_lang,v_lang
str,list[str],list[list[str]],list[str],list[str],list[str],i64,list[str],list[str],i64,str,str,str,str
"""6IsfnuWU""","[""probabilist"", ""skylin"", … ""window""]","[[""wenjiezhang"", ""sydney,unsw""], [""linxuemin"", ""sydney,unsw""], … [""jeffreyxuyu"", ""hong,kong""]]","[""skylin"", ""comput"", … ""time""]","[""continu"", ""skylin"", … ""window""]","[""icd"", ""proceed"", … ""engin""]",2009,"[""yingzhang"", ""linxuemin"", … ""wangwei""]","[""sydney"", ""hong"", … ""kong""]",47,"""en""","""en""","""en""","""en"""
"""8B8GhlnI""","[""editori"", ""knowledg"", … ""environ""]","[[""chenliming"", ""NULL""], [""chrisnugent"", ""comput""], … [""yuzhiwen"", ""NULL""]]",[],"[""activ"", ""recognit""]","[""period""]",2011,"[""yuzhiwen"", ""chenliming"", … ""cookdiane""]","[""comput"", ""NULL""]",38,"""en""","""Nothing""","""en""","""en"""
"""4dZKGwVR""","[""subscrib"", ""assign"", … ""subscrib""]","[[""albertyu"", ""sci,comp,usa,durham,nc,duke""], [""agarwalkpankaj"", ""sci,comp,usa,durham,nc,duke""], [""junyang"", ""sci,comp,usa,durham,nc,duke""]]","[""studi"", ""problem"", … ""subscrib""]","[""mont"", ""carlo"", … ""assign""]","[""icd"", ""proceed"", … ""engin""]",2011,"[""agarwalkpankaj"", ""junyang"", ""albertyu""]","[""sci"", ""comp"", … ""duke""]",29,"""en""","""en""","""en""","""en"""
"""V1JgT3OM""","[""tree"", ""mine"", … ""meet""]","[[""yuzhiwen"", ""northwestern,xi,polytechn""], [""yuzhiyong"", ""fuzhou""], … [""nakamurayuichi"", ""kyoto""]]","[""abstractdiscov"", ""semant"", … ""interact""]","[""discov"", ""pattern"", … ""analysi""]","[""period""]",2012,"[""beckerchristian"", ""nakamurayuichi"", … ""xingshezhou""]","[""mannheim"", ""northwestern"", … ""kyoto""]",57,"""en""","""en""","""en""","""en"""
"""HMvrPr2W""","[""protein"", ""function"", … ""classif""]","[[""guoxianyu"", ""technolog,south,guangzhou,southwest,beibei""], [""huzefarangwala"", ""georg,fairfax,mason""], … [""yuzhiwen"", ""south,technolog,guangzhou""]]","[""abstracthigh"", ""throughput"", … ""kernel""]","[""heterogen"", ""proteom"", … ""vector""]","[""ieee"", ""acm"", … ""bioinformat""]",2013,"[""carlottadomeniconi"", ""guojizhang"", … ""yuzhiwen""]","[""technolog"", ""south"", … ""beibei""]",59,"""en""","""en""","""en""","""en"""


# W2V

In [90]:
df_fix = pd.read_parquet('../test_data/cleaned_pid_to_info_all_v6.parquet')
print(df_fix.shape)
df_fix.head(2)

(317302, 14)


Unnamed: 0,id,title,authors,abstract,keywords,venue,year,names,orgs,num_authors,t_lang,a_lang,k_lang,v_lang
0,6IsfnuWU,"[probabilist, skylin, oper, slide, window]","[[wenjiezhang, sydney,unsw], [linxuemin, sydne...","[skylin, comput, mani, applic, includ, multi, ...","[continu, skylin, queri, probabilist, skylin, ...","[icd, proceed, ieee, intern, confer, data, engin]",2009.0,"[yingzhang, linxuemin, jeffreyxuyu, wenjiezhan...","[sydney, hong, unsw, kong]",47,en,en,en,en
1,8B8GhlnI,"[editori, knowledg, driven, activ, recognit, i...","[[chenliming, NULL], [chrisnugent, comput], [c...",[],"[activ, recognit]",[period],2011.0,"[yuzhiwen, chenliming, chrisnugent, cookdiane]","[comput, NULL]",38,en,Nothing,en,en


In [94]:
corpus = []
for index, row in tqdm.tqdm(df_fix.iterrows()):
    temp_corpus = []
    temp_corpus.extend(row['title'])
    temp_corpus.extend(row['abstract'])
    temp_corpus.extend(row['keywords'])
    if row['venue'] is not None:
        temp_corpus.extend(row['venue'])
    if row['year'] > 0: 
        temp_corpus.extend([str(int(row['year']))])
    if len(temp_corpus) > 1:
        corpus.append(temp_corpus)

317302it [00:42, 7437.87it/s]


In [33]:
class LossLogger(CallbackAny2Vec):
    '''Callback to log loss after each epoch.'''
    def __init__(self):
        self.epoch = 0
        self.loss_previous_step = 0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        loss_now = loss - self.loss_previous_step
        self.loss_previous_step = loss
        self.epoch += 1
        print(f"Loss after epoch {self.epoch}: {loss_now}")

model = word2vec.Word2Vec(corpus, vector_size=128, min_count=2, window=10, negative=5, epochs=30, sg=0, 
                          compute_loss=True,callbacks=[LossLogger()]
                         )
model.save('../test_data/w2v_concat_cbow_128dim_min2_window10_neg5_epoch30_v6.bin')
model.wv.most_similar('network')

Loss after epoch 1: 14223800.0
Loss after epoch 2: 10850790.0
Loss after epoch 3: 9550474.0
Loss after epoch 4: 6575896.0
Loss after epoch 5: 6562656.0
Loss after epoch 6: 6587584.0
Loss after epoch 7: 6593000.0
Loss after epoch 8: 6210856.0
Loss after epoch 9: 1746176.0
Loss after epoch 10: 1714200.0
Loss after epoch 11: 1691856.0
Loss after epoch 12: 1674488.0
Loss after epoch 13: 1641664.0
Loss after epoch 14: 1624768.0
Loss after epoch 15: 1576440.0
Loss after epoch 16: 1582600.0
Loss after epoch 17: 1549176.0
Loss after epoch 18: 1520848.0
Loss after epoch 19: 1484792.0
Loss after epoch 20: 1443536.0
Loss after epoch 21: 1413440.0
Loss after epoch 22: 1341088.0
Loss after epoch 23: 1303208.0
Loss after epoch 24: 1266968.0
Loss after epoch 25: 1211408.0
Loss after epoch 26: 1145200.0
Loss after epoch 27: 1066416.0
Loss after epoch 28: 981016.0
Loss after epoch 29: 888112.0
Loss after epoch 30: 814536.0


[('graph', 0.5929703116416931),
 ('commun', 0.5862277150154114),
 ('connect', 0.5703359246253967),
 ('architectur', 0.5659001469612122),
 ('traffic', 0.5612089037895203),
 ('node', 0.5335066318511963),
 ('system', 0.5230070948600769),
 ('internet', 0.5210995078086853),
 ('link', 0.517693281173706),
 ('rout', 0.4998023509979248)]

In [97]:
author_corpus = []
for index, row in tqdm.tqdm(df_fix.iterrows()):
    for r in row['authors']:
        orgs = r[1].split(',')
        if orgs != ['NULL'] and len(orgs)>1:
            author_corpus.append(orgs)

317302it [00:27, 11620.45it/s]


In [98]:
o_model = word2vec.Word2Vec(author_corpus, vector_size=128, min_count=5, window=5, negative=5, epochs=30, sg=0)
o_model.save('../test_data/w2v_org_cbow_128dim_window5_min5_neg5_epoch30_v6.bin')
o_model.wv.most_similar('univ')

[('siti', 0.7422759532928467),
 ('wuahan', 0.6284927129745483),
 ('acadami', 0.3536508083343506),
 ('haidian', 0.34424933791160583),
 ('neural', 0.33910518884658813),
 ('hung', 0.3367649018764496),
 ('eal', 0.3324764668941498),
 ('safti', 0.31024035811424255),
 ('hongshan', 0.30843672156333923),
 ('centenari', 0.28941377997398376)]

# train_author.json/ind_valid_author.json

In [3]:
df_train_org = pd.read_json('../raw/train_author.json')
df_train_org = df_train_org.T
df_train_org = df_train_org.reset_index()
df_train_org.columns = ['id', 'name', 'normal_data', 'outliers']

df_test_org = pd.read_json('../raw/ind_test_author_filter_public.json')
df_test_org = df_test_org.T
df_test_org = df_test_org.reset_index()
df_test_org.columns = ['id', 'name', 'papers']
print(df_train_org.shape)
display(df_train_org.head())
print(df_test_org.shape)
display(df_test_org.head())

(779, 4)


Unnamed: 0,id,name,normal_data,outliers
0,Iki037dt,atsushi ochiai,"[YzOCpPTO, AblgcGjH, B5aouLse, u1G7wBEv, W7w6P...","[XL3wd3CP, BTKTiJp2, JxSjl5xc, 0jyMLgRt, uHWx8..."
1,ZihzMro7,mingwu yang,"[C58t0yYu, sWIRnfR3, HJW8h2mo, 0Ptx4O5n, fU4vB...","[qK8llKzD, I0eTdaAG, nFebDDiR, 903CyaNQ, Q45WM..."
2,WXMYBk3c,jianzhao huang,"[lJAIOXO4, fYJcce0K, ZaeOFAcI, kg9xDSXm, T37S3...","[HwaUxOes, nvELwvhl, 6Z6SRTQh, R1yeZqOY, qnwco..."
3,WrCODHhe,xuebiao yao,"[3fYoJb1W, wjt8Y8ho, pPx6o7KZ, xgRarLPn, 9w9yz...","[OtmIuFFb, wnP8OmXf, IZ1qVc9S, YccNQrlZ, sLO7c..."
4,k3uSCGEE,shunlin tang,"[gTeQer76, mVk2vmmN, TLKSll8D, Eg5NcmZ2, kM5Ip...","[xPmu4CGB, buwfccml, fBPzgpof, HgjM9QKW, rPk5S..."


(515, 3)


Unnamed: 0,id,name,papers
0,Fkb16wn7,jitendra malik,"[0DchSY2n, 0Gw6iDes, 0PgoDSAP, 0S7g2B2l, 0YJjx..."
1,KKiBE172,hongbo xin,"[0ezX6FSp, 3iqa5cb0, 4N47MJgM, 4oIu3mlO, 4tXA4..."
2,grbM72Lg,junjie mao,"[0fE5PVDQ, 13lxiUDV, 1K5HU0Iy, 20BOekkh, 2d0LD..."
3,Xrp9GO54,xiaoxia wan,"[2YBSYv1q, 3IOmTqw5, 60oLEMcv, 7VGBEWQB, 7xeeg..."
4,9Gs8Wj3Y,tie gang,"[05R8WTSY, 06xOWmCZ, 0Ac7puFw, 0HUElSew, 0HpAa..."


In [4]:
df = pd.read_parquet('../test_data/cleaned_pid_to_info_all_v6.parquet')
df.head()

Unnamed: 0,id,title,authors,abstract,keywords,venue,year,names,orgs,num_authors,t_lang,a_lang,k_lang,v_lang
0,6IsfnuWU,"[probabilist, skylin, oper, slide, window]","[[wenjiezhang, sydney,unsw], [linxuemin, sydne...","[skylin, comput, mani, applic, includ, multi, ...","[continu, skylin, queri, probabilist, skylin, ...","[icd, proceed, ieee, intern, confer, data, engin]",2009.0,"[yingzhang, linxuemin, jeffreyxuyu, wenjiezhan...","[sydney, hong, unsw, kong]",47,en,en,en,en
1,8B8GhlnI,"[editori, knowledg, driven, activ, recognit, i...","[[chenliming, NULL], [chrisnugent, comput], [c...",[],"[activ, recognit]",[period],2011.0,"[yuzhiwen, chenliming, chrisnugent, cookdiane]","[comput, NULL]",38,en,Nothing,en,en
2,4dZKGwVR,"[subscrib, assign, wide, area, content, publis...","[[albertyu, sci,comp,usa,durham,nc,duke], [aga...","[studi, problem, assign, subscrib, broker, wid...","[mont, carlo, approxim, algorithm, futur, eval...","[icd, proceed, ieee, 0th, intern, confer, data...",2011.0,"[agarwalkpankaj, junyang, albertyu]","[sci, comp, usa, durham, nc, duke]",29,en,en,en,en
3,V1JgT3OM,"[tree, mine, discov, pattern, human, interact,...","[[yuzhiwen, northwestern,xi,polytechn], [yuzhi...","[abstractdiscov, semant, knowledg, signific, u...","[discov, pattern, interact, flow, pattern, tre...",[period],2012.0,"[beckerchristian, nakamurayuichi, yuzhiyong, y...","[mannheim, northwestern, xi, polytechn, fuzhou...",57,en,en,en,en
4,HMvrPr2W,"[protein, function, predict, multi, label, ens...","[[guoxianyu, technolog,south,guangzhou,southwe...","[abstracthigh, throughput, experiment, techniq...","[heterogen, proteom, data, set, multilabel, en...","[ieee, acm, transact, comput, biolog, bioinfor...",2013.0,"[carlottadomeniconi, guojizhang, guoxianyu, hu...","[technolog, south, guangzhou, mason, southwest...",59,en,en,en,en


In [8]:
def clean_name(name):
    name = clean_text(name)
    name = name.split(' ')
    name = [n for n in name if n.isdigit() != True]
    if len(name) == 2:
        name = [
            name[0] + name[1],
            # name[0][0] + '_' + name[1],
            name[1] + name[0],
            # name[1][0] + '_' + name[0]
        ]
        name.sort()
        return name[0]
    elif len(name) > 2:
        name = [
            name[0] + name[1] + name[2],
            # name[0][0] + '_' + name[1][0] + '_' + name[2],
            name[2] + name[1] + name[0],
            # name[2][0] + '_' + name[1][0] + '_' + name[0],
            # name[0][0] + '_' + name[1][0] + name[2],
            # name[2][0] + '_' + name[1][0] + name[0],
            # name[0] + '_'  + name[2],
            # name[0][0] + '_'  + name[2],
            # name[2] + '_' + name[0],
            # name[2][0] + '_' + name[0],
        ]
        name.sort()
        return name[0]
    elif len(name) == 1 and name != ['']:
        return name[0]
    else:
        return 'NULL'
train_list,test_list = [],[]
for index, row in df_train_org.iterrows():
    row['name'] = clean_name(row['name'])
    train_list.append(row)
for index, row in df_test_org.iterrows():
    row['name'] = clean_name(row['name'])
    test_list.append(row)

In [9]:
train = pl.from_pandas(pd.DataFrame(train_list))
train.write_parquet('../test_data/train_author.parquet')
test = pl.from_pandas(pd.DataFrame(test_list))
test.write_parquet('../test_data/ind_test_author_filter_public.parquet')