In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scopus_handler import ScopusHandler
%matplotlib inline

In [3]:
def preprocess_authors(df):
    _column_name = "Author(s) ID"
    df = df.astype({_column_name: str})
    # nan, "[No author id available]"
    df = df[df[_column_name].notna()]
    df = df[df[_column_name] != "[No author id available]"]
    # Split by ;
    df = __set_num_authors(df)
    df = __set_first_author(df)
    df = __set_last_author(df)
    df = __set_list_authors(df)
    return df

def __set_num_authors(df):
    df["num_authors"] = df["Author(s) ID"].apply(lambda _text: len(_text.split(";")))
    return df

def __set_first_author(df):
    df["first_author"] = df["Author(s) ID"].apply(lambda _text: _text.split(";")[0])
    return df

def __set_last_author(df):
    df["last_author"] = df["Author(s) ID"].apply(lambda _text: _text.split(";")[-1])
    return df

def __set_list_authors(df):
    df["list_authors"] = df["Author(s) ID"].apply(lambda _text: _text.split(";"))
    return df

def preprocess_affiliations(df):
    _column_name = "Affiliations"
    df = df.astype({_column_name: str})
    # nan
    df = df[df[_column_name].notna()]
    # Split by ;
    df["num_affiliations"] = df[_column_name].apply(lambda _text: len(_text.split(";")))
    return df

def preprocess_funding(df):
    import math
    _column_name = "Funding Details"
    df["is_funded"] = df[_column_name].fillna(False)
    df["is_funded"] = df["is_funded"].apply(lambda _text: True if _text != False else _text)
#     df["is_funded"] = df[_column_name].apply(lambda _text: False if math.isnan(_text) else True)
    return df

def preprocess_access_type(df):
    _column_name = "Access Type"
    df["is_open"] = df[_column_name].fillna(False)
    df["is_open"] = df["is_open"].apply(lambda _text: True if _text != False else _text)
#     df["is_funded"] = df[_column_name].apply(lambda _text: False if math.isnan(_text) else True)
    return df

def preprocess_df(df):
    # 저자 수, 기관 수: 같은 수
    # 국제 집필 여부, 펀드 유무, 공개 논문 여부: 같은 여부
    # 동일 저널, 동일 문서 타입, 동일 토픽: 같은 분류
    df = df.drop_duplicates(subset=["DOI"])
    df = preprocess_authors(df)
    df = preprocess_affiliations(df)
    df = preprocess_funding(df)
    df = preprocess_access_type(df)
    return df

In [2]:
df3 = pd.read_csv("/home/hweem/git/mastersdegree/ytcrawl/customs/scopus/scopus_2014_comp.csv")

In [4]:
df3 = preprocess_df(df3)

In [17]:
df3_targets = df3.sample(int(len(df3) * 0.01))
df3_counters = df3.drop(df3_targets.index)

_cnt_found = 0
for _i, (_idx, _row) in enumerate(df3_targets.iterrows()):
    print(f"[+]{_i + 1} of {len(df3_targets)}...")
    if df3_counters["list_authors"].apply(lambda _list: _row["first_author"] in _list).any():
        print("\tFound!")
        _cnt_found += 1
print(f"{_cnt_found} out of {len(df3_targets)} found.")

[+]1 of 93...
[+]2 of 93...
	Found!
[+]3 of 93...
[+]4 of 93...
[+]5 of 93...
[+]6 of 93...
[+]7 of 93...
[+]8 of 93...
[+]9 of 93...
[+]10 of 93...
	Found!
[+]11 of 93...
	Found!
[+]12 of 93...
	Found!
[+]13 of 93...
	Found!
[+]14 of 93...
[+]15 of 93...
	Found!
[+]16 of 93...
[+]17 of 93...
[+]18 of 93...
[+]19 of 93...
[+]20 of 93...
[+]21 of 93...
[+]22 of 93...
[+]23 of 93...
[+]24 of 93...
[+]25 of 93...
[+]26 of 93...
[+]27 of 93...
[+]28 of 93...
[+]29 of 93...
	Found!
[+]30 of 93...
[+]31 of 93...
[+]32 of 93...
[+]33 of 93...
	Found!
[+]34 of 93...
[+]35 of 93...
	Found!
[+]36 of 93...
[+]37 of 93...
[+]38 of 93...
[+]39 of 93...
[+]40 of 93...
[+]41 of 93...
	Found!
[+]42 of 93...
[+]43 of 93...
[+]44 of 93...
[+]45 of 93...
[+]46 of 93...
[+]47 of 93...
[+]48 of 93...
[+]49 of 93...
[+]50 of 93...
[+]51 of 93...
	Found!
[+]52 of 93...
[+]53 of 93...
	Found!
[+]54 of 93...
[+]55 of 93...
[+]56 of 93...
[+]57 of 93...
[+]58 of 93...
[+]59 of 93...
[+]60 of 93...
[+]61 of 93..