In [1]:
import pandas as pd
import networkx as nx
from node2vec import Node2Vec
from sklearn.cluster import KMeans
import json

import matplotlib.pyplot as plt
import seaborn as sns

import requests
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the faculty table (program, institution, professor and OpenAlex author
# columns) and preview the first rows.
faculty_csv_path = '/media/work/icarovasconcelos/mono/authors-ppg7-6.csv'
ppg_data = pd.read_csv(faculty_csv_path)
ppg_data.head()

Unnamed: 0,ano_calendario,ppg_codigo,ppg_nome,ppg_nota,institution_id,ies_sigla,nome_docente,doutorado_ano,regime_trabalho,carga_horaria,link_do_lattes,author_id,bolsista_produtividade,extrato_bolsa_produtividade,doutorado_institution_id,doutorado_institution_name,doutorado_ppg_codigo,doutorado_supervisor_id,doutorado_supervisor_name
0,2022,42005019016P8,CC,7,I45643870,PUC/RS,TIAGO COELHO FERRETO,2010,Integral,40,http://lattes.cnpq.br/8685431534934812,A5009859711,VERDADEIRO,DT2,I45643870,Pontifícia Universidade Católica do Rio Grande...,31005012004P9,A5071130875,César Augusto Fonticielha De Rose
1,2022,42005019016P8,CC,7,I45643870,PUC/RS,SORAIA RAUPP MUSSE,2000,Integral,40,http://lattes.cnpq.br/2302314954133011,A5059434669,VERDADEIRO,PQ1C,I5124864,École polytechnique fédérale de Lausanne,,A5005709068,Dr Daniel Thalmann
2,2022,42005019016P8,CC,7,I45643870,PUC/RS,SABRINA DOS SANTOS MARCZAK,2011,Integral,40,http://lattes.cnpq.br/9458496222461501,A5014651524,VERDADEIRO,PQ2,I212119943,University of Victoria,,A5007049054,Daniela Damian
3,2022,42005019016P8,CC,7,I45643870,PUC/RS,RODRIGO COELHO BARROS,2013,Integral,20,http://lattes.cnpq.br/8172124241767828,A5039629929,VERDADEIRO,PQ2,I17974374,Universidade de São Paulo,33002045004P1,A5079499583,André Carlos Ponce de Leon Ferreira de Carvalho
4,2022,42005019016P8,CC,7,I45643870,PUC/RS,RAFAEL PRIKLADNICKI,2009,Integral,40,http://lattes.cnpq.br/2007065934836962,A5024645888,VERDADEIRO,PQ1D,I45643870,Pontifícia Universidade Católica do Rio Grande...,31005012004P9,A5022404709,Jorge Luis Nicolas Audy


In [3]:
# Inspect column dtypes and non-null counts of the freshly loaded table.
ppg_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 504 entries, 0 to 503
Data columns (total 19 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   ano_calendario               504 non-null    int64 
 1   ppg_codigo                   504 non-null    object
 2   ppg_nome                     504 non-null    object
 3   ppg_nota                     504 non-null    int64 
 4   institution_id               504 non-null    object
 5   ies_sigla                    504 non-null    object
 6   nome_docente                 504 non-null    object
 7   doutorado_ano                504 non-null    int64 
 8   regime_trabalho              504 non-null    object
 9   carga_horaria                504 non-null    int64 
 10  link_do_lattes               504 non-null    object
 11  author_id                    498 non-null    object
 12  bolsista_produtividade       504 non-null    object
 13  extrato_bolsa_produtividade  277 no

In [4]:
# Keep only the rows that carry an author_id; the ids are needed later to
# query the OpenAlex API per author.
ppg_data = ppg_data.loc[ppg_data['author_id'].notna()]


In [5]:
# Re-check row and non-null counts after dropping rows without an author_id.
ppg_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 498 entries, 0 to 503
Data columns (total 19 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   ano_calendario               498 non-null    int64 
 1   ppg_codigo                   498 non-null    object
 2   ppg_nome                     498 non-null    object
 3   ppg_nota                     498 non-null    int64 
 4   institution_id               498 non-null    object
 5   ies_sigla                    498 non-null    object
 6   nome_docente                 498 non-null    object
 7   doutorado_ano                498 non-null    int64 
 8   regime_trabalho              498 non-null    object
 9   carga_horaria                498 non-null    int64 
 10  link_do_lattes               498 non-null    object
 11  author_id                    498 non-null    object
 12  bolsista_produtividade       498 non-null    object
 13  extrato_bolsa_produtividade  276 non-nul

In [6]:
# Replace every remaining missing value with the literal string "null"
# (a text placeholder, not a real missing-value marker).
ppg_data = ppg_data.fillna(value="null")


In [7]:
# Re-check non-null counts after filling missing values with the "null" placeholder.
ppg_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 498 entries, 0 to 503
Data columns (total 19 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   ano_calendario               498 non-null    int64 
 1   ppg_codigo                   498 non-null    object
 2   ppg_nome                     498 non-null    object
 3   ppg_nota                     498 non-null    int64 
 4   institution_id               498 non-null    object
 5   ies_sigla                    498 non-null    object
 6   nome_docente                 498 non-null    object
 7   doutorado_ano                498 non-null    int64 
 8   regime_trabalho              498 non-null    object
 9   carga_horaria                498 non-null    int64 
 10  link_do_lattes               498 non-null    object
 11  author_id                    498 non-null    object
 12  bolsista_produtividade       498 non-null    object
 13  extrato_bolsa_produtividade  498 non-nul

In [8]:
# Keep a single row per author_id (the first occurrence wins).
ppg_data = ppg_data.drop_duplicates(subset=['author_id'], keep='first')

In [9]:
# Re-check the row count after de-duplicating on author_id.
ppg_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 491 entries, 0 to 503
Data columns (total 19 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   ano_calendario               491 non-null    int64 
 1   ppg_codigo                   491 non-null    object
 2   ppg_nome                     491 non-null    object
 3   ppg_nota                     491 non-null    int64 
 4   institution_id               491 non-null    object
 5   ies_sigla                    491 non-null    object
 6   nome_docente                 491 non-null    object
 7   doutorado_ano                491 non-null    int64 
 8   regime_trabalho              491 non-null    object
 9   carga_horaria                491 non-null    int64 
 10  link_do_lattes               491 non-null    object
 11  author_id                    491 non-null    object
 12  bolsista_produtividade       491 non-null    object
 13  extrato_bolsa_produtividade  491 non-nul

In [10]:
# Disabled code kept as a string literal: nothing below is executed, the cell
# merely evaluates to (and displays) this string. The text inside would request
# each author's works from OpenAlex and sum the reported `meta.count` values.
'''works_by_author_dict = {}

for author_id in ppg_data['author_id']:
    url = f'https://api.openalex.org/works?filter=author.id:{author_id},from_publication_date:2004-01-01'
    response = requests.get(url)
    data = response.json()
    works_by_author_dict[author_id] = data

n = 0
for w in works_by_author_dict.keys():
    print(f'Author: {w} | Works: {works_by_author_dict[w]["meta"]["count"]}')
    n += works_by_author_dict[w]["meta"]["count"]

print(f'Total works: {n}')'''

'works_by_author_dict = {}\n\nfor author_id in ppg_data[\'author_id\']:\n    url = f\'https://api.openalex.org/works?filter=author.id:{author_id},from_publication_date:2004-01-01\'\n    response = requests.get(url)\n    data = response.json()\n    works_by_author_dict[author_id] = data\n\nn = 0\nfor w in works_by_author_dict.keys():\n    print(f\'Author: {w} | Works: {works_by_author_dict[w]["meta"]["count"]}\')\n    n += works_by_author_dict[w]["meta"]["count"]\n\nprint(f\'Total works: {n}\')'

In [11]:
# Exclude one specific author from the table.
# NOTE(review): the reason for removing this id is not recorded here — TODO document it.
ppg_data = ppg_data.loc[ppg_data['author_id'].ne('A5012278873')]

In [12]:
# Re-check the row count after excluding author A5012278873.
ppg_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 490 entries, 0 to 503
Data columns (total 19 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   ano_calendario               490 non-null    int64 
 1   ppg_codigo                   490 non-null    object
 2   ppg_nome                     490 non-null    object
 3   ppg_nota                     490 non-null    int64 
 4   institution_id               490 non-null    object
 5   ies_sigla                    490 non-null    object
 6   nome_docente                 490 non-null    object
 7   doutorado_ano                490 non-null    int64 
 8   regime_trabalho              490 non-null    object
 9   carga_horaria                490 non-null    int64 
 10  link_do_lattes               490 non-null    object
 11  author_id                    490 non-null    object
 12  bolsista_produtividade       490 non-null    object
 13  extrato_bolsa_produtividade  490 non-nul

In [13]:
# Download, for every author in ppg_data, all OpenAlex works published since
# 2004-01-01 (following cursor pagination) and dump the raw records to JSON.
#
# The run is best-effort: if any request fails, the loop stops, the error is
# reported, and whatever was collected so far is still written to disk.

# Work fields requested from the API; everything else is left out of the response.
select = ",".join((
    'id',
    'ids',
    'title',
    'display_name',
    'publication_year',
    'publication_date',
    'primary_location',
    'open_access',
    'authorships',
    'cited_by_count',
    'is_retracted',
    'is_paratext',
    'updated_date',
    'created_date',
    'concepts',

))

# Seconds to wait for the server before giving up on a single request;
# without a timeout a stalled connection would block the notebook forever.
REQUEST_TIMEOUT_S = 60

n_works = 0   # number of work records collected so far
calls = 0     # number of API requests completed so far
works = []    # raw work records, in the order they were fetched
# NOTE(review): a work co-authored by several authors in ppg_data is returned
# once per author, so `works` may contain the same work more than once —
# confirm whether downstream steps de-duplicate.
try:
    # One session for all requests so the HTTP connection is reused.
    with requests.Session() as session:
        for author_id in ppg_data['author_id']:
            url = f'https://api.openalex.org/works?filter=author.id:{author_id},from_publication_date:2004-01-01'
            # loop through pages; '*' asks for the first page, and the loop
            # ends when the API returns a falsy next_cursor
            cursor = '*'
            while cursor:
                # set cursor value and request page from OpenAlex
                url_1 = f'{url}&select={select}&cursor={cursor}'
                response = session.get(url_1, timeout=REQUEST_TIMEOUT_S)
                # Surface HTTP errors directly instead of failing later with
                # an opaque KeyError on the missing 'results' key.
                response.raise_for_status()
                page_with_results = response.json()

                results = page_with_results['results']
                works.extend(results)
                # Count only the records of this page (not the whole list).
                n_works += len(results)
                # update cursor to meta.next_cursor
                cursor = page_with_results['meta']['next_cursor']
                calls += 1
                if calls in (5, 10, 20, 50, 100) or calls % 500 == 0:
                    print(f'{calls} api requests made so far')

    print(f'done. made {calls} api requests. collected {len(works)} works')

except Exception as e:
    print(f'An exception occurred: {str(e)}')
    # Make it explicit that the file written below is incomplete.
    print(f'stopped early after {calls} api requests; only the {n_works} works collected so far will be saved')

with open('works_since_2004.json', 'w') as f:
    json.dump(works, f)

5 api requests made so far


10 api requests made so far


20 api requests made so far


50 api requests made so far


100 api requests made so far


500 api requests made so far


1000 api requests made so far


1500 api requests made so far


2000 api requests made so far


2500 api requests made so far


3000 api requests made so far


done. made 3098 api requests. collected 340819 works


In [None]:
def _authorship_rows(work):
    """Yield one flat record per (authorship, institution) pair of a work.

    Empty authorship or institution entries are skipped, and an authorship
    whose institution list is empty yields no record at all. The concept
    column holds the display name of the work's first listed concept, or
    None when the work has no concepts.
    """
    for authorship in work['authorships']:
        if not authorship:
            continue
        author = authorship['author']
        if author:
            author_id = author['id'].split('/')[-1]
            author_name = author['display_name']
        else:
            author_id = None
            author_name = None
        author_position = authorship['author_position']
        for institution in authorship['institutions']:
            if not institution:
                continue
            # Ids arrive as URLs; keep only the trailing identifier.
            institution_id = institution['id'].split('/')[-1]
            institution_name = institution['display_name']
            institution_country_code = institution['country_code']
            if 'concepts' in work and work['concepts']:
                concept_name = work['concepts'][0]['display_name']
            else:
                concept_name = None
            yield {
                'work_id': work['id'].split('/')[-1],
                'work_title': work['title'],
                'work_display_name': work['display_name'],
                'work_publication_year': work['publication_year'],
                'work_publication_date': work['publication_date'],
                'author_id': author_id,
                'author_name': author_name,
                'author_position': author_position,
                'institution_id': institution_id,
                'institution_name': institution_name,
                'institution_country_code': institution_country_code,
                'concept_name': concept_name,
            }


# Flatten every fetched work into rows and persist them as a CSV table.
data = [row for work in works for row in _authorship_rows(work)]

df_works = pd.DataFrame(data)
df_works.to_csv('7&6ppg_works_since_2004.csv', index=False)

In [None]:
def _author_entry(authorship):
    """Return the {'author_id', 'author_name'} record for one authorship.

    Both values are None when the authorship has no author attached.
    """
    author = authorship['author']
    if not author:
        return {'author_id': None, 'author_name': None}
    return {
        # Ids arrive as URLs; keep only the trailing identifier.
        'author_id': author['id'].split('/')[-1],
        'author_name': author['display_name'],
    }


# Map each work's short id to the list of its authors (empty authorship
# entries are skipped), then save the mapping as JSON. If the same work
# appears more than once in `works`, the last occurrence wins.
works_and_authors = {}
for work in works:
    coauthors = [_author_entry(entry) for entry in work['authorships'] if entry]
    works_and_authors[work['id'].split('/')[-1]] = coauthors

with open('7&6ppg_works_and_authors_since_2004.json', 'w') as out_file:
    json.dump(works_and_authors, out_file)

In [3]:
# Reload the flattened works table from disk and preview it.
# NOTE(review): the saved preview below shows full-URL ids, while the cell that
# writes this file keeps only the trailing identifier — the output may come
# from an older version of the file; confirm by re-running.
works_csv_path = '7&6ppg_works_since_2004.csv'
df = pd.read_csv(works_csv_path)
df.head()

Unnamed: 0,work_id,work_title,work_display_name,work_publication_year,work_publication_date,author_id,author_name,author_position,institution_id,institution_name,institution_country_code,concept_name
0,https://openalex.org/W1984712701,Performance Evaluation of Container-Based Virt...,Performance Evaluation of Container-Based Virt...,2013,2013-02-01,https://openalex.org/A5065379079,Miguel G. Xavier,first,https://openalex.org/I45643870,Pontifícia Universidade Católica do Rio Grande...,BR,Virtualization
1,https://openalex.org/W1984712701,Performance Evaluation of Container-Based Virt...,Performance Evaluation of Container-Based Virt...,2013,2013-02-01,https://openalex.org/A5062060864,Marcelo Veiga Neves,middle,https://openalex.org/I45643870,Pontifícia Universidade Católica do Rio Grande...,BR,Virtualization
2,https://openalex.org/W1984712701,Performance Evaluation of Container-Based Virt...,Performance Evaluation of Container-Based Virt...,2013,2013-02-01,https://openalex.org/A5075787478,Fábio Diniz Rossi,middle,https://openalex.org/I45643870,Pontifícia Universidade Católica do Rio Grande...,BR,Virtualization
3,https://openalex.org/W1984712701,Performance Evaluation of Container-Based Virt...,Performance Evaluation of Container-Based Virt...,2013,2013-02-01,https://openalex.org/A5009859711,Tiago Ferreto,middle,https://openalex.org/I45643870,Pontifícia Universidade Católica do Rio Grande...,BR,Virtualization
4,https://openalex.org/W1984712701,Performance Evaluation of Container-Based Virt...,Performance Evaluation of Container-Based Virt...,2013,2013-02-01,https://openalex.org/A5018576433,Timoteo Alberto Peters Lange,middle,https://openalex.org/I45643870,Pontifícia Universidade Católica do Rio Grande...,BR,Virtualization
