In [43]:
import pandas as pd
import copy

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# --- Papers: one row per paper, author fields are ';'-delimited strings ---
data = pd.read_csv("../data/ai_research_papers.csv")

# Overwrite the author list of the row labelled 222653 before splitting.
# NOTE(review): presumably the raw value does not split cleanly on ';' — confirm.
data.at[222653, 'author_names'] = 'Dudík, Miroslav;Haghtalab, Nika;Luo, Haipeng;Schapire, Robert E.;Syrgkanis, Vasilis;Vaughan, Jennifer Wortman'

# Split the three parallel author columns and explode them together so each
# row describes one (paper, author) pair; author_rank is the 1-based position
# of that row within its paper (eid).
data_exploded = (
    data
    .assign(
        author_afids=data['author_afids'].str.split(';'),
        author_ids=data['author_ids'].str.split(';'),
        author_names=data['author_names'].str.split(';'),
    )
    .explode(['author_afids', 'author_names', 'author_ids'])
    .assign(author_rank=lambda frame: frame.groupby('eid').cumcount() + 1)
)
data_exploded = data_exploded[[
    'eid', 'year', 'origin_ref', 'author_count',
    'author_names', 'author_ids', 'author_afids',
    'author_rank', 'citedby_count',
]]

# --- Affiliations: two files stacked, keeping one row per afid with a country ---
aff_ids_1 = pd.read_csv("../data/ai_papers_affiliations.csv")
aff_ids_2 = pd.read_csv("../data/ai_papers_affiliations_append.csv")
aff_ids_2 = aff_ids_2[
    ['afid', 'affiliation_name', 'org_type', 'city', 'state', 'country']
].drop_duplicates()
aff_ids = pd.concat([aff_ids_1, aff_ids_2])
aff_ids = aff_ids[aff_ids['country'].notna()].drop_duplicates(subset='afid')

# --- Join authors to affiliations ---
# Inner join: author rows whose afid has no (country-bearing) affiliation row
# are dropped here.
data_fin = (
    data_exploded
    .merge(aff_ids, how='inner', left_on='author_afids', right_on='afid')
    .sort_values(["year", "eid", "author_rank"])
)

# Mean citation count per venue (origin_ref), highest first.
venue_mean = (
    data
    .groupby("origin_ref")["citedby_count"]
    .mean()
    .reset_index("origin_ref")
    .sort_values("citedby_count", ascending=False)
)

def _build_paper_dict(papers, venue_means):
    """Collapse the per-author rows of `papers` into one record per eid.

    Parameters
    ----------
    papers : DataFrame with columns eid, year, origin_ref, author_count,
        citedby_count and country (one row per matched author).
    venue_means : DataFrame with columns origin_ref and citedby_count
        (one row per venue).

    Returns
    -------
    dict mapping eid -> {'year', 'origin_ref', 'author_count',
    'citedby_count', 'venue_mean_citation', 'aff_country'}. The paper-level
    fields come from the first row seen for that eid; 'aff_country' lists the
    country of every row of that eid, in row order.

    Raises
    ------
    KeyError if a paper's origin_ref has no entry in `venue_means`.
    """
    # Build the venue lookup once: a hash lookup per paper instead of a
    # boolean-mask scan over the whole venue table for every new eid.
    venue_mean_by_ref = venue_means.set_index('origin_ref')['citedby_count'].to_dict()

    columns = ['eid', 'year', 'origin_ref', 'author_count', 'citedby_count', 'country']
    paper_dict = {}
    # Plain tuples (name=None) avoid building a Series per row as iterrows does.
    for eid, year, origin_ref, author_count, citedby_count, country in (
        papers[columns].itertuples(index=False, name=None)
    ):
        paper = paper_dict.get(eid)
        if paper is None:
            paper = paper_dict[eid] = {
                'year': year,
                'origin_ref': origin_ref,
                'author_count': author_count,
                'citedby_count': citedby_count,
                'venue_mean_citation': venue_mean_by_ref[origin_ref],
                'aff_country': [],
            }
        paper['aff_country'].append(country)
    return paper_dict


data_dict = _build_paper_dict(data_fin, venue_mean)


  exec(code_obj, self.user_global_ns, self.user_ns)


## Overall, each of these measures likely needs to be normalized or standardized.

##### CACWI (Co-Author Citations based Mutual Influence)

In [64]:
# result_dict_cite[country]["self"]    -> summed citedby_count of every paper
#                                         that lists `country` at least once.
# result_dict_cite[country][partner]   -> summed citedby_count of the papers
#                                         that list both `country` and `partner`.
# Each paper counts once per country, however many of its authors share it.
result_dict_cite = {}
for eid in data_dict:
    paper = data_dict[eid]
    paper_citations = paper['citedby_count']

    # Deduplicate once per paper, keeping first-appearance order, so the work
    # is proportional to the number of distinct countries rather than to the
    # number of author rows, and key insertion order is deterministic.
    countries = list(dict.fromkeys(paper['aff_country']))

    for country in countries:
        totals = result_dict_cite.setdefault(country, {"self": 0})
        totals["self"] += paper_citations
        for partner in countries:
            if partner != country:
                totals[partner] = totals.get(partner, 0) + paper_citations

# cacwi[country][partner] = citations on papers shared with `partner`
#                           divided by the country's own citation total.
# max_cacwi tracks the largest ratio that is not exactly 1; min_cacwi tracks
# the smallest ratio that is not exactly 0. Placeholder values written for
# zero-citation countries are not included in either bound.
cacwi = {}
max_cacwi = 0
min_cacwi = 1
for country, totals in result_dict_cite.items():
    own_citations = totals["self"]
    ratios = cacwi[country] = {}
    for partner, shared_citations in totals.items():
        if partner == "self":
            continue
        if own_citations == 0:
            # No citations at all for this country: use a fixed placeholder
            # instead of dividing by zero.
            ratios[partner] = .0001
            continue

        ratio = shared_citations / own_citations
        ratios[partner] = ratio

        if ratio != 1 and ratio > max_cacwi:
            max_cacwi = ratio
        if ratio != 0 and ratio < min_cacwi:
            min_cacwi = ratio



In [66]:
# Display the smallest non-zero CACWI ratio found by the cell that builds `cacwi`.
min_cacwi

1.29534702281974e-07

##### CAOWI (Co-Author Order based Mutual Influence)

In [61]:
# For every country:
#   "total"   -> number of papers that list the country
#   "self"    -> number of those papers where the country is at position 0
#                of the paper's aff_country list
#   <partner> -> number of shared papers where `partner` is at position 0
# NOTE(review): position 0 is presumably the first author's country — confirm
# that aff_country keeps the true author order for every paper.
result_dict_auth_rank = {}
for paper in data_dict.values():
    countries = paper['aff_country']

    # 1 for the country at position 0, 0 for every other distinct country.
    lead_flags = dict.fromkeys(countries, 0)
    if countries:
        lead_flags[countries[0]] += 1

    for country, own_flag in lead_flags.items():
        if country not in result_dict_auth_rank:
            result_dict_auth_rank[country] = {"total": 0, "self": 0}
        tally = result_dict_auth_rank[country]

        tally["total"] += 1
        tally["self"] += own_flag
        for partner, partner_flag in lead_flags.items():
            if partner == country:
                continue
            tally[partner] = tally.get(partner, 0) + partner_flag

# caowi[country][partner] =
#     (shared papers led by partner + papers led by country)
#     / (papers listing country + papers led by country)
# max_caowi / min_caowi record the extreme values over all pairs.
caowi = {}
max_caowi = 0
min_caowi = 1
for country, tally in result_dict_auth_rank.items():
    papers_total = tally["total"]
    papers_led = tally["self"]
    scores = caowi[country] = {}
    for partner, partner_led in tally.items():
        # "total" and "self" are bookkeeping keys, not partner countries.
        if partner in ("total", "self"):
            continue
        score = (partner_led + papers_led) / (papers_total + papers_led)
        scores[partner] = score

        max_caowi = max(max_caowi, score)
        min_caowi = min(min_caowi, score)
    

##### CAVWI (Co-Author Venue’s Citations Based Mutual Influence)

In [47]:
# For every country:
#   "self"    -> sum of venue_mean_citation over the papers listing the country
#   <partner> -> sum of venue_mean_citation over the papers listing both
# Each paper contributes once per distinct country.
result_dict_venue = {}
for paper in data_dict.values():
    # Every distinct country on the paper gets the paper's venue mean.
    venue_scores = dict.fromkeys(paper['aff_country'], paper['venue_mean_citation'])

    for country, own_score in venue_scores.items():
        if country not in result_dict_venue:
            result_dict_venue[country] = {"self": 0}
        tally = result_dict_venue[country]

        tally["self"] += own_score
        for partner, partner_score in venue_scores.items():
            if partner == country:
                continue
            tally[partner] = tally.get(partner, 0) + partner_score


# cavwi[country][partner] = venue-mean total on shared papers divided by the
# country's own venue-mean total; 1 when the country's own total is zero.
cavwi = {}
for country, tally in result_dict_venue.items():
    own_total = tally["self"]
    ratios = cavwi[country] = {}
    for partner, shared_total in tally.items():
        if partner == "self":
            continue
        if own_total == 0:
            ratios[partner] = 1
        else:
            ratios[partner] = shared_total / own_total


In [52]:
# France -> United States under each co-author measure, one value per line
# (CACWI, CAOWI, CAVWI).
for measure in (cacwi, caowi, cavwi):
    print(measure['France']['United States'])


0.2885680737703109
0.45166993167375413
0.18342109867625858


In [53]:

# United States -> France under each co-author measure, one value per line
# (CACWI, CAOWI, CAVWI).
for measure in (cacwi, caowi, cavwi):
    print(measure['United States']['France'])

0.01693601464985669
0.4556740587748271
0.016451720384888464


##### AOWI (Author Order Weight based Mutual Influence)

In [69]:
# For every country:
#   "self"    -> sum over its papers of the country's position weight, where a
#                row at 0-based position i in aff_country contributes 1/(i+1)
#   <partner> -> sum over shared papers of (partner's weight + country's weight)
# NOTE: this rebinds result_dict_auth_rank, the same name the CAOWI cell uses.
# NOTE(review): positions come from the aff_country list — presumably author
# order; confirm the list keeps true author ranks.
result_dict_auth_rank = {}
for paper in data_dict.values():
    weights = {}
    for rank, country in enumerate(paper['aff_country'], start=1):
        weights[country] = weights.get(country, 0) + 1 / rank

    for country, own_weight in weights.items():
        if country not in result_dict_auth_rank:
            result_dict_auth_rank[country] = {"self": 0}
        tally = result_dict_auth_rank[country]

        tally["self"] += own_weight
        for partner, partner_weight in weights.items():
            if partner == country:
                continue
            # Partner's weight is added first, then the country's own weight.
            tally[partner] = tally.get(partner, 0) + partner_weight + own_weight
    
# aowi[country][partner] = accumulated pair weight divided by the country's
# own accumulated weight; a fixed 0.0000001 when the own weight is zero.
aowi = {}
for country, tally in result_dict_auth_rank.items():
    own_weight = tally["self"]
    ratios = aowi[country] = {}
    for partner, pair_weight in tally.items():
        if partner == "self":
            continue
        if own_weight == 0:
            ratios[partner] = 0.0000001
        else:
            ratios[partner] = pair_weight / own_weight
    

In [73]:
# AOWI value stored for the pair France -> United States.
print(aowi['France']['United States'])

0.21976180630565245


In [74]:
# AOWI value stored for the pair United States -> France.
print(aowi['United States']['France'])

0.017787650542302382
