In [5]:
import itertools
import random
import json
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path 
import os
import sys
from IPython.display import clear_output
import logging
import threading
import time

In [2]:
# Year labels as strings, 1990 through 2018 inclusive.
YEARS = list(map(str, range(1990, 2019)))

# Computer science authors in France: load the collaboration table and
# display it as the cell output.
collaboration_df = pd.read_csv("myDATA/00-collaboration_df.csv")
collaboration_df

Unnamed: 0,ID,1990,1991,1992,1993,1994,1995,1996,1997,1998,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
12908,8958327900,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,6508297663,0,0,0,0,0,0,0,0,0,...,4,7,7,8,8,8,8,8,8,8
103257,34571759000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,5,5,5,5
25815,7004267341,0,0,0,0,0,0,0,0,0,...,10,10,10,16,16,16,16,16,16,16
64536,8642393600,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,7,7,7,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12222,6507630481,0,0,0,0,0,0,0,0,0,...,18,18,18,18,18,29,29,29,29,29
193310,24577815500,0,0,0,0,0,0,0,0,0,...,4,4,4,4,6,13,16,16,16,70
36974,57195243976,0,0,0,0,0,0,0,0,0,...,0,3,3,3,3,3,3,3,8,8
12243,35328962100,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,2,2,2,3


In [3]:
## Retrieve the publication data (not the collaborations)
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
file = '/home/leonardo/Desktop/PFE/Data/requested_data_on_authors.json'
# Open through a context manager so the file handle is closed as soon as the
# JSON has been parsed (a bare open() inside json.load() is never closed).
with open(Path(file)) as json_file:
    d = json.load(json_file)

In [4]:
# Build a pandas DataFrame from the loaded JSON: with orient="index" every
# top-level key of `d` becomes one row label.
publications_df = pd.DataFrame.from_dict(data=d, orient="index")

In [5]:
# Format it like collaboration_df: the row labels become an "ID" column.
# The guard keeps the cell re-runnable: once "ID" exists, resetting the index
# again would insert a second, bogus "ID" column.
if "ID" not in publications_df.columns:
    publications_df = publications_df.reset_index().rename(columns={"index": "ID"})

# Explicit 64-bit width: plain `int` can resolve to a 32-bit type on some
# platforms, which is too narrow for the 11-digit author IDs.
publications_df = publications_df.astype({"ID": "int64"})

# Keep only authors present in the collaborations dataframe
collab_IDs = collaboration_df["ID"]
publications_df = publications_df[publications_df["ID"].isin(collab_IDs)]
publications_df

Unnamed: 0,ID,1990,1991,1992,1993,1994,1995,1996,1997,1998,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
5,7003355588,2,2,2,1,4,0,5,5,0,...,7,4,4,15,11,7,11,9,8,6
29,56522848500,3,0,1,0,2,0,6,1,3,...,3,5,6,1,0,0,1,1,1,4
49,7004165433,5,1,1,2,10,5,6,2,6,...,4,3,11,7,6,10,6,3,3,4
79,6603870889,1,0,2,0,1,2,6,4,2,...,8,10,7,20,16,12,9,10,15,16
96,7005944861,10,10,3,7,8,8,4,15,9,...,9,8,12,10,20,19,17,12,7,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2207020,57200496797,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2207039,15137130100,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2207045,57196721826,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2207048,57196401698,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [6]:
## Find the starting collaboration year for each author
def get_start_Y(df, id):
    """Return the label of the first non-zero yearly column for one author.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an "ID" column; every column after the first one is
        scanned in column order.
    id :
        Value matched against ``df["ID"]``; the first matching row is used.

    Returns
    -------
    The column label of the first entry that is not 0, or the integer ``0``
    when every scanned column holds 0.
    """
    author_rows = df.loc[df["ID"] == id]
    for label in author_rows.columns[1:]:
        # First non-zero entry wins; later columns cannot change the answer.
        if author_rows[label].values[0] != 0:
            return label
    return 0

In [7]:
# Filter inactive authors: no holes of given length in publication history
def _first_active_year(row, year_labels, position_of):
    """Return the first year (as int) whose value in `row` is non-zero, else None."""
    for label in year_labels:
        if row[position_of[label]] != 0:
            return int(label)
    return None


def _has_hole(row, position_of, first_year, hole_lenght, last_year):
    """Tell whether any examined window holds at least `hole_lenght - 1` zero years.

    Windows are `hole_lenght` consecutive years wide, start at `first_year`
    and slide forward one year at a time; only windows whose final year is
    strictly before `last_year` are examined.
    """
    window_start = first_year
    while window_start + hole_lenght - 1 < last_year:
        zero_years = sum(
            1
            for offset in range(hole_lenght)
            if row[position_of[str(window_start + offset)]] == 0
        )
        if zero_years >= hole_lenght - 1:
            return True
        window_start += 1
    return False


def filter_inactives(df, hole_lenght, last_year=2018, verbose=True):
    """Return `df` without the authors considered inactive.

    An author is dropped when either
    * none of the yearly columns holds a non-zero value, or
    * some window of `hole_lenght` consecutive years, starting at or after
      the author's first non-zero year and ending strictly before
      `last_year`, contains at least `hole_lenght - 1` zero years.

    NOTE(review): the rule counts zero years inside a window, so the zeros do
    not have to be adjacent, and the `last_year` column itself is never part
    of a window. Both behaviours are kept from the previous implementation;
    confirm that they are intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an "ID" column. Every column after the first one is
        treated as a year and must be labelled with the year as a string.
    hole_lenght : int
        Width, in years, of the sliding window.
    last_year : int, optional
        Upper bound for the last year of a window (exclusive). Defaults to
        2018, the value that used to be hard-coded.
    verbose : bool, optional
        When True (default) the IPython cell output is cleared and a progress
        line is printed after every author.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` whose ID was not flagged; `df` itself is not
        modified.

    Raises
    ------
    KeyError
        If "ID" or a year column required by a window is missing.
    """
    columns = list(df.columns)
    position_of = {label: position for position, label in enumerate(columns)}
    id_position = position_of["ID"]
    # Same convention as get_start_Y: everything after the first column is a year.
    year_labels = columns[1:]
    total = len(df)
    inactive_ids = set()

    # One pass over plain tuples: no per-author rescan or copy of the frame.
    for done, row in enumerate(df.itertuples(index=False, name=None)):
        first_year = _first_active_year(row, year_labels, position_of)
        # Authors that never published have no start year: flag them directly
        # instead of sliding a window from year 0 (which has no column).
        if first_year is None or _has_hole(
            row, position_of, first_year, hole_lenght, last_year
        ):
            inactive_ids.add(row[id_position])

        if verbose:
            clear_output(wait=True)
            print(done, "/", total, "for hole_lenght", hole_lenght - 1)

    return df[~df["ID"].isin(inactive_ids)]

In [11]:
# Largest window width (in years) handed to filter_inactives by the sweep below.
hole_lenght = 28
# Exclusive lower bound of that sweep over window widths.
min_hole_l = 0

In [None]:
## Filter out all inactive authors from the collaboration dataset, for different hole lengths
# Make sure the target directory exists before the first to_csv call.
os.makedirs('myDATA/05-filtered_by_hole_size', exist_ok=True)

tmp_pub_df = publications_df
# Sweep with a local loop variable instead of decrementing the global
# hole_lenght: the cell can then be re-run without resetting the constant.
# Widths go from hole_lenght down to min_hole_l + 1, and each pass filters the
# result of the previous (wider) pass.
for window in range(hole_lenght, min_hole_l, -1):

    tmp_pub_df = filter_inactives(tmp_pub_df, window)
    active_IDs = tmp_pub_df["ID"]

    # Keep the collaboration rows of the authors that survived the filter;
    # the file is named after window - 1.
    df = collaboration_df[collaboration_df.ID.isin(active_IDs)]
    df.to_csv('myDATA/05-filtered_by_hole_size/filtered_by_hole_size_'+str(window-1)+'.csv', index=False)

In [11]:
# Remove empty authors
min_size = 0
max_size = 28
YEARS = [str(year) for year in range(1991,2019)]

# Rewrite each per-hole-size file without the authors whose '2018' value is 0.
for size in range(min_size, max_size+1):
    path = 'myDATA/05-filtered_by_hole_size/filtered_by_hole_size_'+str(size)+'.csv'
    # Same guard as the reporting loop below: a size that was never generated
    # is skipped instead of aborting the cell with FileNotFoundError.
    if not os.path.exists(path):
        continue
    df = pd.read_csv(path)
    df = df[df['2018'] != 0]
    df.to_csv(path, index=False)

# Report how many authors remain for each hole length
for size in range(min_size, max_size+1):
    path = 'myDATA/05-filtered_by_hole_size/filtered_by_hole_size_'+str(size)+'.csv'
    if os.path.exists(path):
        df = pd.read_csv(path)
        print("hole size", size, "-> # authors", len(df))

hole size 0 -> # authors 11404
hole size 1 -> # authors 35889
hole size 2 -> # authors 62438
hole size 3 -> # authors 86085
hole size 4 -> # authors 105735
hole size 5 -> # authors 122318
hole size 6 -> # authors 136654
hole size 7 -> # authors 148926
hole size 8 -> # authors 159758
hole size 9 -> # authors 169539
hole size 10 -> # authors 178506
hole size 11 -> # authors 186114
hole size 12 -> # authors 192935
hole size 13 -> # authors 198503
hole size 14 -> # authors 202980
hole size 15 -> # authors 206973
hole size 16 -> # authors 210453
hole size 17 -> # authors 213440
hole size 18 -> # authors 216175
hole size 19 -> # authors 218790
hole size 20 -> # authors 221257
hole size 21 -> # authors 223305
hole size 22 -> # authors 225063
hole size 23 -> # authors 226452
hole size 24 -> # authors 227553
hole size 25 -> # authors 228547
hole size 26 -> # authors 229230
hole size 27 -> # authors 229824
hole size 28 -> # authors 230234
