In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime → "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

from psutil import virtual_memory
# Total memory of the runtime as reported by psutil, in decimal gigabytes
# (bytes / 1e9).
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this cell uses to call a runtime "high-RAM".
if ram_gb >= 20:
  print('You are using a high-RAM runtime!')
else:
  print('\n'.join([
      'To enable a high-RAM runtime, select the Runtime → "Change runtime type"',
      'menu, and then select High-RAM in the Runtime shape dropdown. Then, ',
      're-execute this cell.',
  ]))

/bin/bash: line 1: nvidia-smi: command not found
Your runtime has 13.6 gigabytes of available RAM

To enable a high-RAM runtime, select the Runtime → "Change runtime type"
menu, and then select High-RAM in the Runtime shape dropdown. Then, 
re-execute this cell.


In [None]:
# Colab-only setup: mount Google Drive at /content/drive (the dataset path used
# further down lives under it) and run Colab's user authentication.
from google.colab import auth, drive

drive.mount('/content/drive')
auth.authenticate_user()
print('Authenticated')

In [None]:
# @title Loading the datasets
import pandas as pd

# Semicolon-separated dataset stored on the mounted Google Drive.
PATH = '/content/drive/My Drive/revisions_natcomms/data/DATASET.csv'
data = pd.read_csv(PATH, sep=';')

# Rows belonging to the field analysed here; computed once and shared by the
# three subsets below.
in_biomed = data['LR_main_field_id'] == 'Biomedical and health sciences'

# One subset per flag column (rows where the flag equals 1), restricted to the field.
NEW = data.loc[(data['Newcomer'] == 1) & in_biomed]
OUT = data.loc[(data['Outgoer'] == 1) & in_biomed]
NAT = data.loc[(data['Native'] == 1) & in_biomed]

In [None]:
#%% FUNCTIONS TO CREATE AUTHOR-INSTITUTION TOPICS' MATRIX
import numpy as np
import pandas as pd
import networkx as nx
from networkx.algorithms import bipartite

def create_df(path):
    """Read a headerless, semicolon-separated file into an (aut, ins, sk) frame.

    The column at position 3 is discarded; the remaining columns are labelled
    ``aut``, ``ins`` and ``sk`` in file order.

    Parameters
    ----------
    path : str
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Frame with exactly the columns ``aut``, ``ins``, ``sk``.
    """
    raw = pd.read_csv(path, sep=';', header=None)
    # With header=None the columns are numbered 0..n-1, so 3 is the fourth one.
    trimmed = raw.drop(columns=[3])
    trimmed.columns = ['aut', 'ins', 'sk']
    return trimmed

def create_matrix(df):
    """Build the (author-institution x skill) biadjacency matrix.

    Every row of ``df`` links one author at one institution to one skill.
    Author and institution are joined into a single ``'<aut>__<ins>'`` label,
    so the same author at two institutions produces two distinct matrix rows.

    The bipartite graph is assembled in memory. Earlier versions wrote the
    edges to a space-delimited ``edge_list.edgelist`` file and parsed it back,
    which broke any label containing whitespace (the label no longer matched
    ``row_order``, giving silently wrong rows) and left a stray file in the
    working directory.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose columns ``aut``, ``ins`` and ``sk``. ``aut`` and ``ins``
        must hold strings because they are concatenated with ``'__'``.

    Returns
    -------
    M_as : sparse matrix
        Result of ``bipartite.biadjacency_matrix``; rows follow the sorted
        unique author-institution labels, columns follow ``unique_sk``.
    unique_sk : list of str
        Skill labels (as strings) in column order.
    nodes_ins : numpy.ndarray
        Institution of each matrix row, aligned with the rows of ``M_as``.
    """
    aut_list = list(df.aut)
    ins_list = list(df.ins)
    sk_list = list(df.sk)

    # Row labels: one per (author, institution) pair, sorted and de-duplicated.
    aut_ins_list = [aut + '__' + ins for aut, ins in zip(aut_list, ins_list)]
    unique_aut_ins = np.unique(aut_ins_list).tolist()

    # Column labels keep the ordering of the raw skill values but are stored as
    # strings, matching the string labels used for the skill nodes below.
    unique_sk = [str(sk) for sk in np.unique(sk_list).tolist()]
    sk_labels = [str(sk) for sk in sk_list]

    # Build the bipartite graph directly instead of round-tripping via a file.
    G = nx.Graph()
    G.add_nodes_from(unique_aut_ins, bipartite=0)
    G.add_nodes_from(unique_sk, bipartite=1)
    G.add_edges_from(zip(aut_ins_list, sk_labels))
    M_as = bipartite.biadjacency_matrix(G, row_order=unique_aut_ins, column_order=unique_sk)
    G.clear()  # drop the graph's contents now that the matrix is extracted

    # Look the institution up from the original pairs rather than re-parsing the
    # joined label, so an author id that itself contains '__' cannot shift the split.
    ins_of_label = dict(zip(aut_ins_list, ins_list))
    nodes_ins = np.array([ins_of_label[label] for label in unique_aut_ins])
    return M_as, unique_sk, nodes_ins

# 1. Biomedical and health sciences

In [None]:
#%%GET AUTHOR-INSTITUTION SKILLS MATRIX FOR RESIDENTS AND OUTGOERS

# NOTE(review): this is an absolute local Windows path, while the earlier cells
# mount Google Drive and read from /content/drive. Confirm where the shuffle
# files live before running this on Colab; only this constant needs changing.
SHUFFLE_DIR = 'C:/Documents/sci_mobility/homophily_adaptation/datasets/shuffle'

df = create_df(SHUFFLE_DIR + '/residents.csv')
M_res_as, unique_sk_res, nodes_ins_res = create_matrix(df)

df = create_df(SHUFFLE_DIR + '/outgoers.csv')
M_out_as, unique_sk_out, nodes_ins_out = create_matrix(df)

#%%GET MAPPINGS FROM NODES TO UNIQUE_INS

def _rows_by_institution(nodes_ins, institutions):
    """Map each institution to the indices of ``nodes_ins`` equal to it.

    An institution that does not occur in ``nodes_ins`` maps to an empty
    index array.
    """
    return {ins: np.where(nodes_ins == ins)[0] for ins in institutions}

# The institution list is taken from the outgoers only; the residents are
# indexed against that same list.
unique_ins = np.unique(nodes_ins_out)

ins2nodes_res = _rows_by_institution(nodes_ins_res, unique_ins)
ins2nodes_out = _rows_by_institution(nodes_ins_out, unique_ins)