<a href="https://colab.research.google.com/github/MarciaFG/scimobility/blob/main/transformation_index_for_2007_2022.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Transformative Flows Project (2007-2022)**
**Author:** Marcia R. Ferreira (Complexity Science Hub Vienna & TU Wien)
- **Inputs:** 

1.   CWTS SQL Server [dimensions_2022jun]:


      *   Exported File:
      *   Exported File:


2.   CWTS Publication-level classification system: Meso-fields level [dimensions_2022jun_classification]
3.   Dimension reduction-based clustering: Laplacian matrix contructed from meso-field level topic matrix and second eigenvector of the matrix
4.   Dimensions database on BigQuery


- **Outputs:**

### Initialization and drivers

In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime → "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('To enable a high-RAM runtime, select the Runtime → "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')
else:
  print('You are using a high-RAM runtime!')

Thu Apr 13 07:21:25 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   30C    P0    43W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
#!pip install psutil
#!pip install humanize
#!pip install pynput
#pip install plotly==5.4.0
#!pip install patool

# main libraries
import psutil
import humanize
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import requests
import torch
import nltk
import GPUtil as GPU

# plotting
import plotly.graph_objs as go
import plotly.io as pio
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

from google.cloud import bigquery
from google.colab import files
%load_ext google.colab.data_table
%load_ext google.cloud.bigquery

from google.colab import drive
drive.mount('/content/drive')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Provide your credentials to the runtime
from google.colab import auth
auth.authenticate_user()
print('Authenticated')

# declare your project 
project_id = "cshdimensionstest"

Authenticated


### Data imports

In [None]:
""" NOT RUN
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

id = '1wCFzWEAwBqH47qGQG1_-G6wPgrrs03A6'
print(id) # Verify that you have everything

downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('second_eigenvector_clustering.csv')  
clusters = pd.read_csv('second_eigenvector_clustering.csv', sep=",", index_col=0) # Dataset is now stored in a Pandas Dataframe

print(clusters.head(10))
print("The data types are as follows:\n", clusters.dtypes)
print("The type of object is:\n", type(clusters))
"""

In [None]:
""" NOT RUN
# unzip the files exported from SQL Server
#!unzip "/content/drive/My Drive/TRANSFORMATION/data_export.zip"
#!unzip "/content/drive/My Drive/TRANSFORMATION/data_export.zip" > /dev/null
"""

In [4]:
import patoolib
import tempfile
import os

# Path of the zip file in Google Drive
zip_path = "/content/drive/My Drive/TRANSFORMATION/data_export.zip"

# Name of the CSV file(s) inside the zip
csv_file_names = [  "spectral_meso_clusters.csv"
                  , "for_division_labels.csv"
                  , "grid_ranks.csv"
                  , "trajectories_au_fourfive_skill.csv"
                  , "trajectories_au_morethanfive_skill.csv"
                  , "trajectories_au_single_skill.csv"
                  , "trajectories_au_twothree_skill.csv"]

# Separator character to use in the CSV files
separator = ";"

# Extract the zip file to a temporary directory
with tempfile.TemporaryDirectory() as tmpdir:
    patoolib.extract_archive(zip_path, outdir=tmpdir)
    
    # Load each CSV file into its own dataframe
    dfs = []
    for csv_file_name in csv_file_names:
        csv_file_path = os.path.join(tmpdir, csv_file_name)
        try:
            df = pd.read_csv(csv_file_path, sep=separator, encoding='utf-8', header= None, decimal=".")
            dfs.append(df)
        except pd.errors.ParserError:
            print(f"Error loading {csv_file_name}: Skipping...")

# Print the first few rows of each dataframe
for i, df in enumerate(dfs):
    print(f"Dataframe {i}:")
    print(df.head(2))

patool: Extracting /content/drive/My Drive/TRANSFORMATION/data_export.zip ...
patool: running /usr/bin/7z x -o/tmp/tmpjfna3ml7 -- "/content/drive/My Drive/TRANSFORMATION/data_export.zip"
patool: ... /content/drive/My Drive/TRANSFORMATION/data_export.zip extracted to `/tmp/tmpjfna3ml7'.
Dataframe 0:
          0                   1                 2         3         4   \
0  row_index  second_eigenvector  original_indices  cluster2  cluster3   
1          0  -0,657980785697483               128         0         0   

         5         6          7            8       9   \
0  cluster4  cluster5  cluster10  cluster_id2  n_pubs   
1         0         0          0          128   99353   

                                                  10  \
0                                             labels   
1  inhaler - dry powder inhaler - inhaler devices...   

                                                  11  
0                                            sources  
1  International Journal o

In [5]:
# extract the datasets and store them into a pandas dataframe
spectral_meso_clusters = dfs[0]
for_division_labels = dfs[1]
grid_ranks = dfs[2]
trajectories_au_fourfive_skill = dfs[3]
trajectories_au_morethanfive_skill = dfs[4]
trajectories_au_single_skill = dfs[5]
trajectories_au_twothree_skill = dfs[6]

print(type(for_division_labels))

<class 'pandas.core.frame.DataFrame'>


## Preprocessing

In [6]:
# Use the first row as the header
spectral_meso_clusters.columns = spectral_meso_clusters.iloc[0]
for_division_labels.columns = for_division_labels.iloc[0]
grid_ranks.columns = grid_ranks.iloc[0]

# Remove the first row (which is now the header)
spectral_meso_clusters = spectral_meso_clusters[1:]
for_division_labels = for_division_labels[1:]
grid_ranks = grid_ranks[1:]

print(spectral_meso_clusters.head())
print(for_division_labels.head())
print(grid_ranks.head())

0 row_index   second_eigenvector original_indices cluster2 cluster3 cluster4  \
1         0   -0,657980785697483              128        0        0        0   
2         9  -0,0796790037139393              109        4        3        2   
3         6  -0,0866583191228655              146        3        2        1   
4         7  -0,0835526247765448              120        3        2        1   
5         8  -0,0832388670665863              247        4        2        2   

0 cluster5 cluster10 cluster_id2  n_pubs  \
1        0         0         128   99353   
2        1         0         109  106502   
3        1         0         146   91599   
4        1         0         120  102555   
5        1         0         247   65569   

0                                             labels  \
1  inhaler - dry powder inhaler - inhaler devices...   
2  CRT response - CRT device - CRT implantation -...   
3  chiral selector - electrochromatography - plat...   
4  household air pollution - c

In [7]:
def convert_to_float(val):
    if isinstance(val, str) and val.replace('.', '', 1).isdigit():
        return float(val.replace(',', '.'))
    return val

# Apply the function to all elements of the dataframe
grid_ranks = grid_ranks.applymap(convert_to_float)
spectral_meso_clusters = spectral_meso_clusters.applymap(convert_to_float)

print(grid_ranks.dtypes)
print(spectral_meso_clusters.dtypes)

0
institution          object
institution_name     object
p                   float64
p_top1               object
tncs                 object
mncs                 object
pp_top_prop1         object
tncs_rnk            float64
pp_top_prop1_rnk    float64
dtype: object
0
row_index             float64
second_eigenvector     object
original_indices      float64
cluster2              float64
cluster3              float64
cluster4              float64
cluster5              float64
cluster10             float64
cluster_id2           float64
n_pubs                float64
labels                 object
sources                object
dtype: object


In [8]:
from pandas.core.dtypes.dtypes import dtypes
from numpy.core.multiarray import dtype
headers = ['researcher_id', 'grid_id', 'start', 'end', 'Lenght', 'for_division_id', 'meso_field', 'spectral_cluster_id', 'concatenated_fields', 'year', 'n_pubs']

# set the new column names using the list
trajectories_au_morethanfive_skill.columns = headers
trajectories_au_fourfive_skill.columns = headers
trajectories_au_single_skill.columns = headers
trajectories_au_twothree_skill.columns = headers

# print the updated column names
print(trajectories_au_morethanfive_skill.columns)
print(trajectories_au_morethanfive_skill.dtypes)

Index(['researcher_id', 'grid_id', 'start', 'end', 'Lenght', 'for_division_id',
       'meso_field', 'spectral_cluster_id', 'concatenated_fields', 'year',
       'n_pubs'],
      dtype='object')
researcher_id          object
grid_id                object
start                   int64
end                     int64
Lenght                  int64
for_division_id         int64
meso_field              int64
spectral_cluster_id     int64
concatenated_fields    object
year                    int64
n_pubs                  int64
dtype: object


In [9]:
trajectories_au_morethanfive_skill.describe()

Unnamed: 0,start,end,Lenght,for_division_id,meso_field,spectral_cluster_id,year,n_pubs
count,59980980.0,59980980.0,59980980.0,59980980.0,59980980.0,59980980.0,59980980.0,59980980.0
mean,2009.957,2019.942,9.985488,8.731703,199.8764,42.12178,2015.399,1.398404
std,3.504475,2.249119,3.923751,3.7173,163.3995,33.1916,4.048403,1.558997
min,2007.0,2009.0,2.0,1.0,0.0,0.0,2007.0,1.0
25%,2007.0,2020.0,7.0,6.0,66.0,8.0,2012.0,1.0
50%,2008.0,2021.0,11.0,10.0,157.0,38.0,2016.0,1.0
75%,2012.0,2021.0,14.0,11.0,297.0,76.0,2019.0,1.0
max,2019.0,2021.0,14.0,22.0,864.0,86.0,2021.0,3719.0


In [10]:
#print(trajectories_au_single_skill.describe())
print(trajectories_au_single_skill.head())

        researcher_id        grid_id  start   end  Lenght  for_division_id  \
0  ur.010000000127.95  grid.452405.2   2007  2012       5                6   
1  ur.010000000127.95  grid.452405.2   2007  2012       5                6   
2  ur.010000000127.95  grid.452405.2   2007  2012       5                6   
3  ur.010000000127.95  grid.452405.2   2007  2012       5                6   
4  ur.010000000643.41  grid.39381.30   2019  2021       2                6   

   meso_field  spectral_cluster_id concatenated_fields  year  n_pubs  
0           8                    7          06 - 8 - 7  2007       1  
1           8                    7          06 - 8 - 7  2009       1  
2           8                    7          06 - 8 - 7  2010       1  
3           8                    7          06 - 8 - 7  2012       2  
4         208                   83       06 - 208 - 83  2019       1  


In [19]:
def calculate_org_sequence(df):
    # select the desired columns and drop duplicates
    df = df[['researcher_id', 'grid_id', 'start', 'end']].drop_duplicates().reset_index(drop=True)
    
    # calculate the org_sequence using rank()
    df['org_sequence'] = df.groupby('researcher_id')['start'].rank(method='dense')
    
    return df

# calculate org_sequence for each dataframe
sq_1_skill_df = calculate_org_sequence(trajectories_au_single_skill)
sq_2_3_skill_df = calculate_org_sequence(trajectories_au_twothree_skill)
sq_4_5_skill_df = calculate_org_sequence(trajectories_au_fourfive_skill)
sq_5_or_more_skill_df = calculate_org_sequence(trajectories_au_morethanfive_skill)

print(sq_1_skill_df.head(10))

# select all rows that have org_sequence > 1
sq_1_skill_df_filtered = sq_1_skill_df[sq_1_skill_df['org_sequence'] > 1]
print(sq_1_skill_df_filtered.head(10))

# select all rows that have researcher_id = 'ur.01000012260.80'
sq_1_skill_df_filtered_au = sq_1_skill_df[sq_1_skill_df['researcher_id'] == 'ur.01000012260.80']
print(sq_1_skill_df_filtered_au.head(10))

        researcher_id        grid_id  start   end  org_sequence
0  ur.010000000127.95  grid.452405.2   2007  2012           1.0
1  ur.010000000643.41  grid.39381.30   2019  2021           1.0
2  ur.010000001602.92  grid.15866.3c   2015  2017           1.0
3  ur.010000007232.43    grid.5596.f   2018  2021           1.0
4  ur.010000010373.10    grid.4886.2   2008  2017           1.0
5  ur.010000011751.09  grid.135519.a   2013  2015           1.0
6   ur.01000001217.11  grid.32224.35   2012  2014           1.0
7   ur.01000001415.60  grid.427788.6   2015  2017           1.0
8  ur.010000020707.55  grid.252119.c   2011  2013           1.0
9  ur.010000022373.01  grid.411154.4   2016  2021           1.0
          researcher_id        grid_id  start   end  org_sequence
52    ur.01000012260.80  grid.462949.4   2013  2017           2.0
53    ur.01000012260.80  grid.503253.2   2015  2021           3.0
101  ur.010000222435.51  grid.47716.33   2019  2021           2.0
105   ur.01000022536.36  grid.41

**Looks good!**