In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime → "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('To enable a high-RAM runtime, select the Runtime → "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')
else:
  print('You are using a high-RAM runtime!')

Sun Jan 14 21:19:49 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   30C    P0              45W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
from google.colab import drive
drive.mount('/content/drive')

from google.colab import auth
auth.authenticate_user()
print('Authenticated')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Authenticated


# Preprocessing

In [None]:
# @title Loading the datasets
import pandas as pd
import zipfile

column_names = [
    'researcher_id',
    'grid_id',
    'pub_year',
    'pubs',
    'cluster_id1',
    'LR_main_field_id',
    'LR_main_field',
    'native',
    'outgoer',
    'newcomer'
]

PATH = '/content/drive/My Drive/revisions_natcomms/data/raw_vector_df.zip'

with zipfile.ZipFile(PATH, 'r') as z:
    with z.open('raw_vector_df.csv') as f:
        df = pd.read_csv(f, sep = ';', header=None, names=column_names)

print(df.head(1))

print(df['LR_main_field'].unique())
print("Number of unique publication years:", df['pub_year'].nunique())
print("Number of unique clusters:", df['cluster_id1'].nunique())
print("Number of unique researchers:", df['researcher_id'].nunique())
print("Number of unique institutions:", df['grid_id'].nunique())

In [None]:
# Create separate DataFrames for each category in 'LR_main_field'
import os

df_biomedical = df[df['LR_main_field'] == 'Biomedical and health sciences']
df_physical = df[df['LR_main_field'] == 'Physical sciences and engineering']
df_life_earth = df[df['LR_main_field'] == 'Life and earth sciences']
df_social = df[df['LR_main_field'] == 'Social sciences and humanities']
df_math_cs = df[df['LR_main_field'] == 'Mathematics and computer science']

dir_path = "/content/drive/My Drive/revisions_natcomms/data/"

df_biomedical.to_csv(dir_path + 'biomedical_and_health_sciences.csv', index=False)
df_physical.to_csv(dir_path + 'physical_sciences_and_engineering.csv', index=False)
df_life_earth.to_csv(dir_path + 'life_and_earth_sciences.csv', index=False)
df_social.to_csv(dir_path + 'social_sciences_and_humanities.csv', index=False)
df_math_cs.to_csv(dir_path + 'mathematics_and_computer_science.csv', index=False)

In [4]:
# Read the data back
import pandas as pd

dir_path = "/content/drive/My Drive/revisions_natcomms/data/"

biomedical_csv_path = dir_path + 'biomedical_and_health_sciences.csv'
physical_csv_path = dir_path + 'physical_sciences_and_engineering.csv'
life_earth_csv_path = dir_path + 'life_and_earth_sciences.csv'
social_csv_path = dir_path + 'social_sciences_and_humanities.csv'
math_cs_csv_path = dir_path + 'mathematics_and_computer_science.csv'

df_biomedical = pd.read_csv(biomedical_csv_path)
df_physical = pd.read_csv(physical_csv_path)
df_life_earth = pd.read_csv(life_earth_csv_path)
df_social = pd.read_csv(social_csv_path)
df_math_cs = pd.read_csv(math_cs_csv_path)

In [6]:
df_social.head(10)

Unnamed: 0,researcher_id,grid_id,pub_year,pubs,cluster_id1,LR_main_field_id,LR_main_field,native,outgoer,newcomer
0,ur.010000000761.51,grid.412304.0,2014,1,567,1,Social sciences and humanities,1,0,0
1,ur.01000000145.32,grid.411638.9,2020,1,1980,1,Social sciences and humanities,0,0,1
2,ur.010000002003.53,grid.12380.38,2013,1,737,1,Social sciences and humanities,1,0,0
3,ur.01000000410.68,grid.5841.8,2008,4,424,1,Social sciences and humanities,0,1,0
4,ur.01000000410.68,grid.5841.8,2009,1,424,1,Social sciences and humanities,0,1,0
5,ur.01000000410.68,grid.5841.8,2010,1,547,1,Social sciences and humanities,0,1,0
6,ur.01000000410.68,grid.5841.8,2011,1,424,1,Social sciences and humanities,0,1,0
7,ur.01000000410.68,grid.150338.c,2012,2,424,1,Social sciences and humanities,0,0,1
8,ur.01000000410.68,grid.150338.c,2013,2,424,1,Social sciences and humanities,0,0,1
9,ur.01000000410.68,grid.5841.8,2013,2,424,1,Social sciences and humanities,0,1,0


In [5]:
# Define the columns to keep and their new names
columns_to_keep = {'researcher_id': 'aut', 'grid_id': 'ins', 'cluster_id1': 'sk', 'pubs': 'w'}

def split_and_save(df, field_name, dir_path):
    for category in ['native', 'newcomer', 'outgoer']:
        df_filtered = df[df[category] == 1][list(columns_to_keep.keys())].rename(columns=columns_to_keep)
        file_name = f'df_{field_name}_{category}.csv'
        df_filtered.to_csv(dir_path + file_name, index=False)

split_and_save(df_biomedical, 'biomedical', dir_path)
split_and_save(df_physical, 'physical', dir_path)
split_and_save(df_life_earth, 'life_earth', dir_path)
split_and_save(df_social, 'social', dir_path)
split_and_save(df_math_cs, 'math_cs', dir_path)

# Social sciences and humanities

In [21]:
# @title nat vs out
# GET AUTHOR-INSTITUTION COSINE FOR NATIVES AND OUTGOERS
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def load_df(file_path):
    df = pd.read_csv(file_path)
    return df

dir_path = "/content/drive/My Drive/revisions_natcomms/data/"

df_social_native = load_df(dir_path + 'df_social_native.csv')
df_social_outgoer = load_df(dir_path + 'df_social_outgoer.csv')

unique_skills_native = df_social_native['sk'].unique()
unique_skills_outgoer = df_social_outgoer['sk'].unique()
combined_skills = set(unique_skills_native) | set(unique_skills_outgoer)
global_unique_skills = list(combined_skills)

grouped_native = df_social_native.groupby('ins')['sk'].apply(list).reset_index()
grouped_outgoer = df_social_outgoer.groupby('ins')['sk'].apply(list).reset_index()

native_skill_dict = {row['ins']: row['sk'] for _, row in grouped_native.iterrows()}
outgoer_skill_dict = {row['ins']: row['sk'] for _, row in grouped_outgoer.iterrows()}

def compute_cosine_similarity(skill_vector1, skill_vector2):
    vector1 = np.zeros(len(global_unique_skills))
    vector2 = np.zeros(len(global_unique_skills))

    for skill_id in skill_vector1:
        idx = global_unique_skills.index(skill_id)
        vector1[idx] = 1

    for skill_id in skill_vector2:
        idx = global_unique_skills.index(skill_id)
        vector2[idx] = 1

    similarity = cosine_similarity([vector1], [vector2])[0][0]
    return similarity

cosine_similarities = {}

for institution in native_skill_dict.keys():
    native_skills = native_skill_dict[institution]
    outgoer_skills = outgoer_skill_dict.get(institution, [])

    similarity_score = compute_cosine_similarity(native_skills, outgoer_skills)
    cosine_similarities[institution] = similarity_score

In [None]:
# @title nat vs out with weighted skills
# GET AUTHOR-INSTITUTION COSINE FOR NATIVES AND OUTGOERS WITH WEIGHTED SKILLS
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def load_df(file_path):
    df = pd.read_csv(file_path)
    return df

dir_path = "/content/drive/My Drive/revisions_natcomms/data/"

df_social_native = load_df(dir_path + 'df_social_native.csv')
df_social_outgoer = load_df(dir_path + 'df_social_outgoer.csv')

# Assuming you have the 'w' column in your CSV files for the weights
# Modify the column name if needed
native_weights = df_social_native['w']
outgoer_weights = df_social_outgoer['w']

unique_skills_native = df_social_native['sk'].unique()
unique_skills_outgoer = df_social_outgoer['sk'].unique()
combined_skills = set(unique_skills_native) | set(unique_skills_outgoer)
global_unique_skills = list(combined_skills)

grouped_native = df_social_native.groupby('ins').agg({'sk': list, 'w': list}).reset_index()
grouped_outgoer = df_social_outgoer.groupby('ins').agg({'sk': list, 'w': list}).reset_index()

native_skill_dict = {row['ins']: {'skills': row['sk'], 'weights': row['w']} for _, row in grouped_native.iterrows()}
outgoer_skill_dict = {row['ins']: {'skills': row['sk'], 'weights': row['w']} for _, row in grouped_outgoer.iterrows()}

def compute_weighted_cosine_similarity(skill_data1, skill_data2):
    skills1 = skill_data1['skills']
    weights1 = skill_data1['weights']
    skills2 = skill_data2['skills']
    weights2 = skill_data2['weights']

    vector1 = np.zeros(len(global_unique_skills))
    vector2 = np.zeros(len(global_unique_skills))

    for skill_id, weight in zip(skills1, weights1):
        idx = global_unique_skills.index(skill_id)
        vector1[idx] = weight

    for skill_id, weight in zip(skills2, weights2):
        idx = global_unique_skills.index(skill_id)
        vector2[idx] = weight

    similarity = cosine_similarity([vector1], [vector2])[0][0]
    return similarity

weighted_cosine_similarities = {}

for institution in native_skill_dict.keys():
    native_data = native_skill_dict[institution]
    outgoer_data = outgoer_skill_dict.get(institution, {'skills': [], 'weights': []})

    similarity_score = compute_weighted_cosine_similarity(native_data, outgoer_data)
    weighted_cosine_similarities[institution] = similarity_score


In [18]:
weighted_cosine_similarities

{'grid.1001.0': 0.663986350424814,
 'grid.1002.3': 0.713375974828835,
 'grid.10025.36': 0.5400819976828961,
 'grid.1003.2': 0.7152497616180231,
 'grid.1004.5': 0.6518699457940272,
 'grid.10041.34': 0.5211789575097057,
 'grid.10049.3c': 0.6411868146636929,
 'grid.1005.4': 0.62969226244732,
 'grid.1006.7': 0.6336134391824302,
 'grid.10067.30': 0.4482085641669645,
 'grid.1007.6': 0.6585063412109973,
 'grid.1008.9': 0.7335205229660391,
 'grid.1009.8': 0.6424330500384571,
 'grid.1010.0': 0.6967874446952339,
 'grid.1011.1': 0.6445718106976,
 'grid.1012.2': 0.643534516968048,
 'grid.1013.3': 0.6648325089397329,
 'grid.1014.4': 0.6193250866892168,
 'grid.1016.6': 0.5344913190588354,
 'grid.1017.7': 0.5604440710951024,
 'grid.1018.8': 0.519092099997924,
 'grid.1019.9': 0.540061311597422,
 'grid.1020.3': 0.5104239604648815,
 'grid.1021.2': 0.6492271855869498,
 'grid.10214.36': 0.2611164839335468,
 'grid.10215.37': 0.5805772257994255,
 'grid.1022.1': 0.673729782544478,
 'grid.10223.32': 0.5908659

In [None]:
cosine_similarities