In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime → "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('To enable a high-RAM runtime, select the Runtime → "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')
else:
  print('You are using a high-RAM runtime!')

Sun Jan 14 20:57:45 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   29C    P0              45W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
# Colab-only helpers: Drive access and Google account authentication.
from google.colab import auth, drive

# Make Google Drive available under /content/drive; later cells read and
# write their data files below that mount point.
drive.mount('/content/drive')

# Authenticate the current user, then confirm so the cell has visible output.
auth.authenticate_user()
print('Authenticated')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Authenticated


# Preprocessing

In [20]:
# @title Loading the datasets
import zipfile

import pandas as pd

# The CSV is read with header=None, so the column labels are supplied here,
# in file order.
column_names = [
    'researcher_id', 'grid_id', 'pub_year', 'pubs', 'cluster_id1',
    'LR_main_field_id', 'LR_main_field',
    'native', 'outgoer', 'newcomer',
]

PATH = '/content/drive/My Drive/revisions_natcomms/data/raw_vector_df.zip'

# Stream the single CSV member straight out of the archive instead of
# extracting it to disk first. Fields are semicolon-separated.
with zipfile.ZipFile(PATH, 'r') as archive, archive.open('raw_vector_df.csv') as csv_member:
    df = pd.read_csv(csv_member, sep=';', header=None, names=column_names)

# Quick sanity check: show the first parsed row.
print(df.head(1))

KeyboardInterrupt: 

In [None]:
# Preview the first ten rows of the loaded frame (rendered as the cell output).
df.head(10)

In [None]:
# List the distinct values of the main-field column.
print(df['LR_main_field'].unique())

# Report how many distinct values each key column holds, one line per column.
for label, column in (
    ("Number of unique publication years:", 'pub_year'),
    ("Number of unique clusters:", 'cluster_id1'),
    ("Number of unique researchers:", 'researcher_id'),
    ("Number of unique institutions:", 'grid_id'),
):
    print(label, df[column].nunique())

In [None]:
# Create separate DataFrames for each category in 'LR_main_field'
import os

dir_path = "/content/drive/My Drive/revisions_natcomms/data/"

# Select the rows of each main field once; the frame names are kept because
# later cells refer to them directly.
main_field = df['LR_main_field']
df_biomedical = df[main_field == 'Biomedical and health sciences']
df_physical = df[main_field == 'Physical sciences and engineering']
df_life_earth = df[main_field == 'Life and earth sciences']
df_social = df[main_field == 'Social sciences and humanities']
df_math_cs = df[main_field == 'Mathematics and computer science']

# Persist every per-field frame next to the raw data, without the index column.
for field_frame, csv_name in (
    (df_biomedical, 'biomedical_and_health_sciences.csv'),
    (df_physical, 'physical_sciences_and_engineering.csv'),
    (df_life_earth, 'life_and_earth_sciences.csv'),
    (df_social, 'social_sciences_and_humanities.csv'),
    (df_math_cs, 'mathematics_and_computer_science.csv'),
):
    field_frame.to_csv(dir_path + csv_name, index=False)

In [None]:
# Define the columns to keep and their new names
columns_to_keep = {'researcher_id': 'aut', 'grid_id': 'ins', 'cluster_id1': 'sk', 'pubs': 'w'}

def split_and_save(df, field_name, dir_path):
    """Write one CSV per mobility category for a single research field.

    For each of the flag columns 'native', 'newcomer' and 'outgoer', the rows
    whose flag equals 1 are selected, reduced to the columns listed in
    ``columns_to_keep`` (renamed to their short names) and written to
    ``df_<field_name>_<category>.csv`` inside ``dir_path``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three flag columns and every key of
        ``columns_to_keep``.
    field_name : str
        Short label embedded in the output file names.
    dir_path : str
        Target directory, with or without a trailing path separator.

    Returns
    -------
    None
    """
    import os  # local import so the function does not rely on another cell

    source_columns = list(columns_to_keep)
    for category in ['native', 'newcomer', 'outgoer']:
        df_filtered = df.loc[df[category] == 1, source_columns].rename(columns=columns_to_keep)
        file_name = f'df_{field_name}_{category}.csv'
        # os.path.join keeps the file inside dir_path even when the caller
        # omits the trailing slash (plain concatenation would not).
        df_filtered.to_csv(os.path.join(dir_path, file_name), index=False)


# Export the native / newcomer / outgoer subsets of every field, in the same
# order as the per-field frames were created.
for field_frame, field_label in (
    (df_biomedical, 'biomedical'),
    (df_physical, 'physical'),
    (df_life_earth, 'life_earth'),
    (df_social, 'social'),
    (df_math_cs, 'math_cs'),
):
    split_and_save(field_frame, field_label, dir_path)

# Social sciences and humanities

In [18]:
# @title nat vs out
# GET AUTHOR-INSTITUTION COSINE FOR NATIVES AND OUTGOERS
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

dir_path = "/content/drive/My Drive/revisions_natcomms/data/"

# NOTE(review): `load_df` is not defined in the visible cells — confirm it is
# defined earlier in the notebook, otherwise this fails on a fresh kernel.
df_social_native = load_df(dir_path + 'df_social_native.csv')
df_social_outgoer = load_df(dir_path + 'df_social_outgoer.csv')

# Skill vocabulary shared by both groups (union of the distinct 'sk' values).
unique_skills_native = df_social_native['sk'].unique()
unique_skills_outgoer = df_social_outgoer['sk'].unique()
combined_skills = set(unique_skills_native) | set(unique_skills_outgoer)
global_unique_skills = list(combined_skills)

# One row per institution, holding the list of skills observed there.
grouped_native = df_social_native.groupby('ins')['sk'].apply(list).reset_index()
grouped_outgoer = df_social_outgoer.groupby('ins')['sk'].apply(list).reset_index()

# institution -> list of skills, for quick lookup in the comparison loop.
native_skill_dict = dict(zip(grouped_native['ins'], grouped_native['sk']))
outgoer_skill_dict = dict(zip(grouped_outgoer['ins'], grouped_outgoer['sk']))

def compute_cosine_similarity(skill_vector1, skill_vector2):
    """Cosine similarity of two skill collections seen as binary indicator vectors.

    Each collection is reduced to the set of skills it contains (repeats do
    not count twice, exactly as when setting an indicator entry to 1 more
    than once). For 0/1 vectors the cosine has the closed form
    ``|A ∩ B| / sqrt(|A| * |B|)``, so no dense vector over the whole skill
    vocabulary has to be allocated and no per-skill linear search is needed.

    Parameters
    ----------
    skill_vector1, skill_vector2 : iterable of hashable
        Skill identifiers of the two groups being compared.

    Returns
    -------
    numpy.float64
        Similarity in [0, 1]. It is 0.0 when either collection is empty,
        matching the value scikit-learn's ``cosine_similarity`` reports for
        an all-zero vector.
    """
    skills_a = set(skill_vector1)
    skills_b = set(skill_vector2)

    # An empty side corresponds to a zero vector: define the similarity as 0
    # rather than dividing by zero.
    if not skills_a or not skills_b:
        return np.float64(0.0)

    shared = len(skills_a & skills_b)
    return shared / np.sqrt(len(skills_a) * len(skills_b))

# Similarity between the native and outgoer skill sets of every institution
# that has natives; an institution without outgoers is compared against an
# empty skill list.
cosine_similarities = {
    institution: compute_cosine_similarity(
        native_skills,
        outgoer_skill_dict.get(institution, []),
    )
    for institution, native_skills in native_skill_dict.items()
}

In [None]:
# @title not run
#%%GET AUTHOR-INSTITUTION SKILLS MATRIX FOR NATIVES AND OUTGOERS

# NOTE(review): `data`, `create_df` and `create_matrix` are not defined in the
# visible cells — confirm where they come from before running this cell.
# NOTE(review): the institution column is renamed to 'inst' here, while the
# export cell uses 'ins' — confirm which name `create_df` expects.
short_names = {'researcher_id': 'aut', 'grid_id': 'inst', 'cluster_id1': 'sk'}
is_biomedical = data['LR_main_field'] == 'Biomedical and health sciences'

# Outgoers of the biomedical field.
out = create_df(data.loc[(data['outgoer'] == 1) & is_biomedical].rename(columns=short_names))
M_out_as, unique_sk_out, nodes_ins_out = create_matrix(out)

# Natives of the biomedical field.
nat = create_df(data.loc[(data['native'] == 1) & is_biomedical].rename(columns=short_names))
M_res_as, unique_sk_res, nodes_ins_res = create_matrix(nat)


#df = create_df('C:/Documents/sci_mobility/homophily_adaptation/datasets/shuffle/outgoers.csv')
#M_out_as, unique_sk_out, nodes_ins_out = create_matrix(df)

#%%GET MAPPINGS FROM NODES TO UNIQUE_INS
# Institutions are taken from the outgoer nodes only, so both mappings below
# are keyed by the institutions that appear in `nodes_ins_out`.
unique_ins = np.unique(nodes_ins_out)


def _positions_by_institution(node_institutions, institutions):
    """Map each institution to the positions in `node_institutions` equal to it.

    An institution that never occurs maps to an empty index array.
    """
    return {ins: np.where(node_institutions == ins)[0] for ins in institutions}


# The per-iteration counter prints were dropped: they only flooded the cell
# output with one line per institution.
ins2nodes_res = _positions_by_institution(nodes_ins_res, unique_ins)
ins2nodes_out = _positions_by_institution(nodes_ins_out, unique_ins)

#%%PUT THE TWO MATRICES IN THE SAME FORMAT
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.sparse` is loaded on every SciPy version.
import scipy.sparse

unique_sk_res = [int(x) for x in unique_sk_res]
unique_sk_out = [int(x) for x in unique_sk_out]

# LIL format is used for the target because it is filled column by column.
M_out_as_clean = scipy.sparse.lil_matrix((M_out_as.shape[0], len(unique_sk_res)))
M_res_as = M_res_as.tolil()

# Copy column i of the outgoer matrix into column `idx` of the new matrix.
# NOTE(review): the skill id itself is used as the target column index while
# the target has len(unique_sk_res) columns — this presumes skill ids are
# valid 0-based column positions of the resident matrix; confirm.
for i, idx in enumerate(unique_sk_out):
    M_out_as_clean[:, idx] = M_out_as[:, i]

#%% GET ORIGINAL M_IS FOR RESIDENTS AND OUTGOERS
# CSR is used here because the aggregation below slices whole rows.
M_out_as = M_out_as_clean.tocsr()
M_res_as = M_res_as.tocsr()


def _sum_rows_by_institution(author_skill, ins2nodes, institutions):
    """Sum the author rows of each institution into one dense row per institution.

    Row k of the result is the column-wise sum of the rows of `author_skill`
    listed in `ins2nodes[institutions[k]]`.
    """
    totals = np.zeros((len(institutions), author_skill.shape[1]))
    for row, ins in enumerate(institutions):
        totals[row, :] = author_skill[ins2nodes[ins], :].sum(axis=0)
    return totals


# Institution x skill matrices, rows ordered like `unique_ins`.
M_res_is = _sum_rows_by_institution(M_res_as, ins2nodes_res, unique_ins)
M_out_is = _sum_rows_by_institution(M_out_as, ins2nodes_out, unique_ins)

#%% COMPUTE COSINES OF UNSHUFFLED CASE
from numpy import dot
from numpy.linalg import norm

# Cosine between the outgoer and resident skill rows of each institution:
# dot(a, b) / (|a| * |b|). The denominator is the product of the two norms
# (previously one norm call wrapped the other, which hid the intent).
# An institution whose row is all zeros on either side yields 0/0 = nan.
cs = [
    dot(M_out_is[i, :], M_res_is[i, :]) / (norm(M_out_is[i, :]) * norm(M_res_is[i, :]))
    for i in range(len(unique_ins))
]