In [1]:
# Survey the research fields covered by the institution "Institution_Name" under
# two classifications: ASJC (All Science Journal Classification) and the subject
# classification used by the QS ranking (which is itself defined via ASJC).
# One paper can be tagged with several ASJC entries, stored "; "-separated in a
# single column, so that column is split and de-duplicated into an array.
# For this example institution the array has 136 elements (fewer than half of
# all ASJC codes).
import pandas as pd
import numpy as np
institute = 'Institution_Name'
directory = '2019Jan_2013_2017/paper/'
file_in = directory + institute + '.csv'
# skiprows=11: the header row is expected on line 12 of the export
# (presumably preceded by export metadata -- confirm against the file).
asjc = pd.read_csv(file_in, skiprows=11, encoding = 'UTF-8')
# Work on a string copy of the ASJC column; missing cells become the literal 'nan'.
asjc['asjc_incl'] = asjc['All Science Journal Classification (ASJC)'].astype(str)
asjc_code = asjc['asjc_incl'].map(lambda x: x.split('; '))
# Flatten the per-paper lists into one Series (also works when there are no rows).
ser = pd.Series([code for codes in asjc_code for code in codes], dtype=object)
unique_asjc = ser.str.strip().unique()
unique_asjc.sort()
# Remove the 'nan' placeholder by value instead of by a hard-coded position:
# the old index (136) was only valid for this one institution and would delete
# a real code, or fail, for any input with a different number of codes.
# Empty tokens are dropped as well because they are not codes either.
asjc_array = np.array([code for code in unique_asjc if code not in ('nan', '')], dtype=object)
len(asjc_array)

136

In [2]:
# Extract the rows for each ASJC entry from the original paper list, then concatenate them.
def filter_df_by_asjc(df, asjc_code):
    """Return a copy of the rows of ``df`` that are tagged with ``asjc_code``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``'asjc_incl'`` column whose cells hold one or more
        entries separated by ``'; '``.
    asjc_code : str
        The entry to look for. It must equal one whole (whitespace-stripped)
        entry of the cell; a partial/substring hit does not count.

    Returns
    -------
    pandas.DataFrame
        A copy of the matching rows in which ``'asjc_incl'`` is overwritten
        with ``asjc_code``. ``df`` itself is not modified.
    """
    def has_code(cell):
        # Compare against whole entries: a plain ``asjc_code in cell`` is a
        # substring test and would, e.g., let '12-Foo' match '112-Foo'.
        return asjc_code in [token.strip() for token in str(cell).split('; ')]

    mask = df['asjc_incl'].map(has_code).astype(bool)
    asjc_df = df.loc[mask].copy()
    asjc_df['asjc_incl'] = asjc_code
    return asjc_df

# One sub-frame per ASJC entry, stacked vertically; a paper tagged with several
# entries therefore appears once per entry. reset_index() keeps the original
# row label of each paper in an 'index' column.
asjc_df_list = list(map(lambda code: filter_df_by_asjc(asjc, code), asjc_array))
asjc_vert = pd.concat(asjc_df_list).reset_index()
# Split each entry at its first '-' only: left part -> Code_tmp, rest -> Description.
new = asjc_vert['asjc_incl'].str.split('-', n=1, expand=True)
asjc_vert = (
    asjc_vert
    .assign(Code_tmp=new[0], Description=new[1])
    .drop(columns=['asjc_incl'])
)

In [3]:
# Merge with the table that defines the QS classification (its mapping to ASJC).
# ASJC: aggregate per code and sort by paper count, descending; how quickly the
# counts fall off is used to judge the diversity of research fields.
# QS: different ASJC codes can map to the same Subject, so duplicate
# (paper, Subject) rows are dropped before aggregating.
# The QS table is meant for per-field scatter plots, so it is not sorted by count.
file_out_asjc = directory + institute + '_asjc.csv'
file_out_qs = directory + institute + '_qs.csv'
# Definition of the QS classification
qs = pd.read_csv('ASJC_Subject.csv', encoding = 'UTF-8')
qs.columns = ['ASJC_0', 'Area_QS', 'Subject_QS', 'Num Rec', 'ASJC_1', 'Code', 'Class Name']
# Apply the QS classification to the paper list (left join on the integer ASJC code)
asjc_vert['Code'] = asjc_vert['Code_tmp'].astype(int)
asjc_qs = pd.merge(asjc_vert, qs, on='Code', how='left')
asjc_qs['Institution'] = institute
# --- Per-ASJC statistics, largest paper count first ---
diversity_asjc = asjc_qs.groupby(['Institution', 'Code', 'Description']).agg({'Citations': 'describe', 'Field-Weighted Citation Impact':'mean'})
diversity_asjc = diversity_asjc.sort_values([('Citations', 'count')], ascending = False).reset_index()
# Flatten the (column, statistic) header into plain names.
diversity_asjc.columns = ['Institution', 'Code', 'Description', 'Count', 'Citations_mean', 'Citations_std', 'Citations_min', 'Citations_25%', 'Citations_50%', 'Citations_75%', 'Citations_max', 'FWCI_mean']
# NOTE(review): mode='a' with header=True appends another header plus a full copy
# of the rows each time this cell is re-run; use mode='w' if that is not intended.
diversity_asjc.round(2).to_csv(file_out_asjc, mode = 'a', header = True, columns = ['Institution', 'Code', 'Description','Count', 'Citations_mean', 'FWCI_mean'])
# --- Per-QS-subject statistics ---
# drop_duplicates() returns a new frame; its result was previously discarded, so
# a paper whose ASJC codes map to the same Subject was counted more than once.
# NOTE(review): rows lacking a DOI may be treated as duplicates of one another
# here -- confirm that every record carries a DOI.
asjc_qs_unique = asjc_qs.drop_duplicates(subset = ['DOI', 'Subject_QS'])
diversity_qs = asjc_qs_unique.groupby(['Institution', 'Area_QS', 'Subject_QS']).agg({'Citations':'describe', 'Field-Weighted Citation Impact':'mean'})
diversity_qs.columns = ['Count', 'Citation_mean', 'Citation_std', 'Citation_min', 'Citation_25%', 'Citation_50%', 'Citation_75%', 'Citation_max', 'FWCI_mean']
diversity_qs.round(2).to_csv(file_out_qs, mode = 'a', header = True, columns = ['Count', 'Citation_mean', 'FWCI_mean'])