# Finding the edges: collaborative cryptocurrencies

In [1]:
# Libraries...
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.patheffects as pe
import sys
import imp
%matplotlib inline

from notebooks_functions import *

In [2]:
# ###### 0.0 load github push/accepted-pull data
tmp = pd.read_pickle('export/git_data_pushpull_v4(vol1e5).pkl')
tmp1 =tmp.copy()
tmp1 = tmp1[tmp1.accepted]
tmp1 = tmp1[(tmp1.actor_login.notna())]
tmp1 = tmp1.reset_index(drop=True)

###### Generates EDGES dataframe
edges_list = []
group_tmp = tmp1.groupby('actor_login')
for actor in tmp1.drop_duplicates('actor_login').actor_login:
    tmp_g = group_tmp.get_group(actor)
    if len(tmp_g)>1:
        tmp_g = tmp_g.groupby(['sym', 'date']).count().reset_index()
        tmp_count = tmp_g.groupby(['sym']).count().reset_index().sort_values('sym')[['sym','id']]
        tmp_g = tmp_g.sort_values('date').drop_duplicates('sym').sort_values('sym')
        list1 = list(tmp_g.sym)
        list2 = list(tmp_g.sym)
        tmp_g = tmp_g.set_index('sym')
        tmp_count = tmp_count.set_index('sym')
        for org1 in list1:
            list2.remove(org1)
            for org2 in list2:
                if org1 != org2:
                    edges_list.append([org1,org2,tmp_count.loc[org1]['id'],tmp_count.loc[org2]['id'],
                                       actor,tmp_g.loc[[org1,org2]]['date'].max(),
                                       tmp_g.loc[[org1]]['date'].max(),tmp_g.loc[[org2]]['date'].max()])
df_edges = pd.DataFrame({'org1':[edges_list[i][0] for i in range(len(edges_list))],
                         'org2':[edges_list[i][1] for i in range(len(edges_list))],
                         'events1':[edges_list[i][2] for i in range(len(edges_list))],
                         'events2':[edges_list[i][3] for i in range(len(edges_list))],
                         'actors':[edges_list[i][4] for i in range(len(edges_list))],
                         'collaboration_begins':[edges_list[i][5] for i in range(len(edges_list))],
                         'org1_begins':[edges_list[i][6] for i in range(len(edges_list))],
                         'org2_begins':[edges_list[i][7] for i in range(len(edges_list))]})
actors_per_sym = tmp1.groupby(['sym','actor_login']).count().reset_index().groupby('sym').count()[['actor_login']].reset_index()

# Duplicate frame to count editors, actions, and events
df_edges_tmp1 = df_edges.groupby(['org1','org2']).count().reset_index()[['actors','org1','org2']].reset_index(drop=True)
df_edges_tmp2 = df_edges.groupby(['org1','org2']).sum().reset_index()[['org1','org2','events1','events2']].reset_index(drop=True)
df_edges_collaboration = df_edges.sort_values('collaboration_begins').copy()
df_edges_collaboration = df_edges_collaboration.drop_duplicates(['org1','org2'])[['org1','org2','collaboration_begins','org1_begins','org2_begins']].reset_index(drop=True)
### Sort the dataframes
df_edges.sort_values(['org1','org2','collaboration_begins'], inplace=True)
df_edges_tmp1.sort_values(['org1','org2'], inplace=True)
df_edges_tmp2.sort_values(['org1','org2'], inplace=True)
df_edges_collaboration.sort_values(['org1','org2'], inplace=True)
### Merge the 3 dataframe 
df_edges = df_edges.drop_duplicates(['org1','org2'],keep='first').reset_index(drop=True)
df_edges['index'] = df_edges_tmp1['actors'].reset_index(drop=True)
df_edges['coll_begins'] = df_edges_collaboration['collaboration_begins'].reset_index(drop=True)
df_edges['org1_begins'] = df_edges_collaboration['org1_begins'].reset_index(drop=True)
df_edges['org2_begins'] = df_edges_collaboration['org2_begins'].reset_index(drop=True)

df_edges['weight_events'] = (df_edges['events1']+df_edges['events2'])/2
df_edges.rename(columns={'index':'weights'}, inplace=True)
df_edges['weights'] = df_edges['weights']/df_edges['weights'].sum()
df_nodes = tmp.drop_duplicates(['actor_login','sym']).groupby('sym').count().sort_index().reset_index()[['sym','id']]
df_nodes = df_nodes.rename(columns={'sym':'org1','id':'weights'})
df_nodes.reset_index(inplace=True)
df_edges = df_nodes[['index','org1']].merge(df_edges, on='org1', how='inner')\
                                     .rename(columns={'index':'i1'})
df_edges = df_edges.merge(df_nodes[['index','org1']], left_on='org2', 
                                            right_on='org1', how='inner', suffixes=('','_y'))\
                                     .rename(columns={'index':'i2'})
df_edges = df_edges.sort_values(['i1','org1']).drop('org1_y', axis=1)
df_edges.reset_index(drop=True, inplace=True)




crypto_first_action = tmp1.sort_values('date').drop_duplicates('sym')[['sym','date']].reset_index(drop=True).rename(columns={'sym':'org','date':'first_action'})
df_edges = df_edges.merge(crypto_first_action, left_on='org1',right_on='org',how='left',suffixes=('','_org1'))
df_edges = df_edges.merge(crypto_first_action, left_on='org2',right_on='org',how='left',suffixes=('','_org2'))
## First_action_org1 is the first event date made by connecting editor on crypto1
df_edges = df_edges.rename(columns={'first_action':'first_action_org1'}).drop(['collaboration_begins','org','org_org2'],axis=1)

tmp = pd.read_pickle('export/git_data_v4(vol1e5).pkl')
tmp = tmp.sort_values('created_at')
coll_action_latter = []
org_latter = []
### first action (not necessarily push/pull) on second crypto
### which one is older (from a market perspective)
coll_action_former = []
org_former = []
for i,couple in df_edges.iterrows():
    tmp2 = tmp[(tmp.actor_login == couple.actors)&((tmp.sym==couple.org1)|(tmp.sym==couple.org2))]
    tmp2 = tmp2.drop_duplicates('sym')
    org_former.append(tmp2.iloc[0]['sym'])
    org_latter.append(tmp2.iloc[1]['sym'])
    coll_action_former.append(tmp2.iloc[0]['date'])
    coll_action_latter.append(tmp2.iloc[1]['date'])
    print(i,end='\r')
df_edges['org_former'] = org_former
df_edges['org_latter'] = org_latter
df_edges['coll_action_former'] = coll_action_former
df_edges['coll_action_latter'] = coll_action_latter

tmp = pd.read_pickle('export/git_data_pushpull_v4(vol1e5).pkl')
tmp = tmp.sort_values('created_at')
coll_action_latter = []
org_latter = []

### first push/pull on second crypto
coll_action_former = []
org_former = []
for i,couple in df_edges.iterrows():
    tmp2 = tmp[(tmp.actor_login == couple.actors)&((tmp.sym==couple.org1)|(tmp.sym==couple.org2))]
    tmp2 = tmp2.drop_duplicates('sym')
    org_former.append(tmp2.iloc[0]['sym'])
    org_latter.append(tmp2.iloc[1]['sym'])
    print(i,end='\r')
df_edges['org_formerPP'] = org_former
df_edges['org_latterPP'] = org_latter

df_edges.to_pickle('export/df_edges_all_length_Series_tmp.pkl')
df_nodes.to_pickle('export/df_nodes.pkl')


206

In [3]:
### Processing CrypoCompare data
# latest time at which data were downloaded
now = '20191101' ### insert present day
##### 0.1 preparing the yearly correlation between cryptos
##### 0.1.1- Process data market timeseries
### a crypto is considered to be active during a specific year if there are less than "threshold" Nan in its serie
threshold = 10
### 3 Options: first_action = based on first git-action of a crypto
###            org_begins   = based on first edit by connecting user
###            coin_age     = based on market time serie length
### Columns with which we want to sort the org1=PASSIVE and org2 =ACTIVE
### coll_begins  = first to edit is PASSIVE, second (at coll_begin) ACTIVE; 
### coin_age     = longhest serie is PASSIVE, shortest is ACTIVE
### first_action = oldest crypto with git-event is PASSIVE, the youngest is ACTIVE

# let pandas treat infinities as NaN
pd.options.mode.use_inf_as_na = True
### Load data... 
### consdider only time series for which we have market data (uncomment if wanted)
series = pd.read_pickle(f't_series_{now}.pkl')
series['time'] = pd.to_datetime(series.date)
series.rename(columns={'Close**':'price'}, inplace=True)
series.rename(columns={'Market Cap':'market cap'}, inplace=True)
series.rename(columns={'Volume':'volume'}, inplace=True)
series.sort_values('time',inplace=True)
series.loc[series.Low<0,'Low'] = np.nan
series.loc[series.High<0,'High'] = np.nan

# Evaluate the volatility per day as (max-min)/(max+min) del prezzo nella giornata
series['volatility'] = np.log(series['High']/series['Low'])
series.reset_index(drop=True,inplace=True)
series_keep = series.groupby('sym').count().reset_index()
coin_grouped_series = series.groupby('sym')

# Initialize the dataframe that will contain all the crypto timeseries
df_price = series.drop_duplicates('date')[['date']].reset_index(drop=True).rename(columns={'date':'time'})
df_mrcap = df_price.copy()
df_volum = df_price.copy()
df_volat = df_price.copy()
# Initialize the stable index to merge with
df_price_index = df_price.copy()
df_mrcap_index = df_price.copy()
df_volum_index = df_price.copy()
df_volat_index = df_price.copy()

# Cicle over all the grouped cryptocurrency timeseries
i=0
for group1 in coin_grouped_series:
    df_price[group1[0]] = df_price_index.merge(coin_grouped_series.get_group(group1[0])[['price','time']],left_on='time',right_on='time',how='left')['price']
    df_mrcap[group1[0]] = df_mrcap_index.merge(coin_grouped_series.get_group(group1[0])[['market cap','time']],left_on='time',right_on='time', how='left')['market cap']
    df_volum[group1[0]] = df_volum_index.merge(coin_grouped_series.get_group(group1[0])[['volume','time']],left_on='time',right_on='time', how='left')['volume']
    df_volat[group1[0]] = df_volat_index.merge(coin_grouped_series.get_group(group1[0])[['volatility','time']],left_on='time',right_on='time', how='left')['volatility']
    i+=1
    print(round(i/len(coin_grouped_series)*100,2),'%\t\t',end='\r')

###### Filtering cryptos with volume > 1e5 (already removed in notebook 1)
tmp_vol_filter = (df_volum.mean(axis=0)>1e5)
df_volum = df_volum.set_index('time')
df_volum = df_volum[[i for i in df_volum.columns if tmp_vol_filter.loc[i]==True]]
df_price = df_price.set_index('time')[df_volum.columns]
df_mrcap = df_mrcap.set_index('time')[df_volum.columns]
df_volat = df_volat.set_index('time')[df_volum.columns]


df_price['year'] = df_price.index.year
df_mrcap['year'] = df_mrcap.index.year
df_volum['year'] = df_volum.index.year
df_volat['year'] = df_volat.index.year
df_retur = df_price.copy()
df_mrret = df_mrcap.copy()
df_volumret = df_volum.copy()
df_volatret = df_volat.copy()
df_mrret = df_mrcap.copy()
df_retur = ((df_retur - df_retur.shift())/df_retur.shift())
df_mrret = ((df_mrret - df_mrret.shift())/df_mrret.shift())
df_volumret = ((df_volumret - df_volumret.shift())/df_volumret.shift())
df_volatret = ((df_volatret - df_volatret.shift())/df_volatret.shift())
df_retur['year'] = df_price.index.year
df_mrret['year'] = df_mrcap.index.year
df_volumret['year'] = df_volumret.index.year
df_volatret['year'] = df_volatret.index.year


df_price.to_pickle('export/df_price.pkl')
df_mrcap.to_pickle('export/df_mrcap.pkl')
df_volum.to_pickle('export/df_volum.pkl')
df_volat.to_pickle('export/df_volat.pkl')

df_retur.to_pickle('export/df_retur.pkl')
df_mrret.to_pickle('export/df_mrret.pkl')
df_volumret.to_pickle('export/df_volumret.pkl')
df_volatret.to_pickle('export/df_volatret.pkl')

df_edges = pd.read_pickle('export/df_edges_all_length_Series_tmp.pkl')
df_edges = df_edges[(df_edges.org1.isin(df_price.columns))&(df_edges.org2.isin(df_price.columns))]
df_edges.to_pickle('export/df_edges_all_length_Series.pkl')


100.0 %		

In [4]:
# ### Processing CoinGecko data
# # let pandas treat infinities as NaN
# pd.options.mode.use_inf_as_na = True
# ### Load data... 
# series = pd.read_pickle(f't_series_{now}_CG.pkl')
# series['time'] = pd.to_datetime(series.date)
# series.rename(columns={'Close**':'price'}, inplace=True)
# series.rename(columns={'Market Cap':'market cap'}, inplace=True)
# series.rename(columns={'Volume':'volume'}, inplace=True)
# series.sort_values('time',inplace=True)
# series.loc[series.Low<0,'Low'] = np.nan
# series.loc[series.High<0,'High'] = np.nan

# # Evaluate the volatility per day as (max-min)/(max+min) del prezzo nella giornata
# series['volatility'] = np.log(series['High']/series['Low'])
# series.reset_index(drop=True,inplace=True)
# series_keep = series.groupby('sym').count().reset_index()
# coin_grouped_series = series.groupby('sym')

# # Initialize the dataframe that will contain all the crypto timeseries
# df_price = series.drop_duplicates('date')[['date']].reset_index(drop=True).rename(columns={'date':'time'})
# df_mrcap = df_price.copy()
# df_volum = df_price.copy()
# df_volat = df_price.copy()
# # Initialize the stable index to merge with
# df_price_index = df_price.copy()
# df_mrcap_index = df_price.copy()
# df_volum_index = df_price.copy()
# df_volat_index = df_price.copy()

# # Cicle over all the grouped cryptocurrency timeseries
# i=0
# for group1 in coin_grouped_series:
#     df_price[group1[0]] = df_price_index.merge(coin_grouped_series.get_group(group1[0])[['price','time']],left_on='time',right_on='time',how='left')['price']
#     df_mrcap[group1[0]] = df_mrcap_index.merge(coin_grouped_series.get_group(group1[0])[['market cap','time']],left_on='time',right_on='time', how='left')['market cap']
#     df_volum[group1[0]] = df_volum_index.merge(coin_grouped_series.get_group(group1[0])[['volume','time']],left_on='time',right_on='time', how='left')['volume']
#     df_volat[group1[0]] = df_volat_index.merge(coin_grouped_series.get_group(group1[0])[['volatility','time']],left_on='time',right_on='time', how='left')['volatility']
#     i+=1
#     print(round(i/len(coin_grouped_series)*100,2),'%\t\t',end='\r')

# ###### Filtering cryptos with volume > 1e5 (already removed in notebook 1)
# tmp_vol_filter = (df_volum.mean(axis=0)>1e5)
# df_volum = df_volum.set_index('time')
# df_volum = df_volum[[i for i in df_volum.columns if tmp_vol_filter.loc[i]==True]]
# df_price = df_price.set_index('time')[df_volum.columns]
# df_mrcap = df_mrcap.set_index('time')[df_volum.columns]
# df_volat = df_volat.set_index('time')[df_volum.columns]

# # If volume greater than 1e5 times the gratest capitalization (i.e. BTC maximum mrcap value) then remove value
# df_volum = pd.DataFrame(data=np.where(df_volum<1e5*df_mrcap.max().max(),df_volum,np.NaN),columns=df_volum.columns,index=df_volum.index)
# # If mrcap greater than the BTC max mrcap then remove value
# df_mrcap = pd.DataFrame(data=np.where(df_mrcap<1.1*df_mrcap['BTC'].max(),df_mrcap,np.NaN),columns=df_mrcap.columns,index=df_mrcap.index)
# # ######

# df_price['year'] = df_price.index.year
# df_mrcap['year'] = df_mrcap.index.year
# df_volum['year'] = df_volum.index.year
# df_volat['year'] = df_volat.index.year
# df_retur = df_price.copy()
# df_mrret = df_mrcap.copy()
# df_volumret = df_volum.copy()
# df_volatret = df_volat.copy()
# df_mrret = df_mrcap.copy()
# df_retur = ((df_retur - df_retur.shift())/df_retur.shift())
# df_mrret = ((df_mrret - df_mrret.shift())/df_mrret.shift())
# df_volumret = ((df_volumret - df_volumret.shift())/df_volumret.shift())
# df_volatret = ((df_volatret - df_volatret.shift())/df_volatret.shift())
# df_retur['year'] = df_price.index.year
# df_mrret['year'] = df_mrcap.index.year
# df_volumret['year'] = df_volumret.index.year
# df_volatret['year'] = df_volatret.index.year


# df_price.to_pickle('export/df_price_CG.pkl')
# df_mrcap.to_pickle('export/df_mrcap_CG.pkl')
# df_volum.to_pickle('export/df_volum_CG.pkl')
# df_volat.to_pickle('export/df_volat_CG.pkl')

# df_retur.to_pickle('export/df_retur_CG.pkl')
# df_mrret.to_pickle('export/df_mrret_CG.pkl')
# df_volumret.to_pickle('export/df_volumret_CG.pkl')
# df_volatret.to_pickle('export/df_volatret_CG.pkl')



100.0 %				%		%		

In [5]:
##### Select collaborative cryptos
df_edges = pd.read_pickle('export/df_edges_all_length_Series.pkl')
df_price_all = pd.read_pickle('export/df_price.pkl')
df_mrcap_all = pd.read_pickle('export/df_mrcap.pkl')
df_trvol_all = pd.read_pickle('export/df_volum.pkl')

### Sort df_edges columns "org1" and "org2" based on different option values org1 -> active org2 -> passive
### Possible OPTIONS: coin_age, mr_cap, tr_volum, first_action, coll_begins
# ungroup_by = 'coin_age'
# df_edges = df_edges.apply(lambda x: decoupling_df_edges(x,ungroup_by),axis=1)

length = df_price_all.count().reset_index().rename(columns={'index':'org1',0:'len1'})
length = length[length.org1!='year']
df_edges = df_edges.merge(length, left_on='org1',right_on='org1')
length = df_price_all.count().reset_index().rename(columns={'index':'org2',0:'len2'})
df_edges = df_edges.merge(length, left_on='org2',right_on='org2')
df_edges['coin_age1'] = [df_price_all[df_edges.org1[i]][df_price_all[df_edges.org1[i]].notna()].index[0] for i in range(len(df_edges))]
df_edges['coin_age2'] = [df_price_all[df_edges.org2[i]][df_price_all[df_edges.org2[i]].notna()].index[0] for i in range(len(df_edges))]
df_edges['mean_mr_share1'] = df_mrcap_all[df_edges.org1].divide(df_mrcap_all.sum(axis=1),axis=0).mean().values
df_edges['mean_mr_share2'] = df_mrcap_all[df_edges.org2].divide(df_mrcap_all.sum(axis=1),axis=0).mean().values
df_edges['mean_tr_volum1'] = df_trvol_all[df_edges.org1].divide(df_trvol_all.sum(axis=1),axis=0).mean().values
df_edges['mean_tr_volum2'] = df_trvol_all[df_edges.org2].divide(df_trvol_all.sum(axis=1),axis=0).mean().values

### Average market share and volume during the connection 
### 'mean_width' is the size in days of the connection
mean_width = 7
mr_cap_ecology = df_mrcap_all.loc[df_edges.coll_begins[1]-pd.Timedelta(mean_width,'D'):df_edges.coll_begins[1]+pd.Timedelta(mean_width,'D')].mean().sum()
tr_vol_ecology = df_trvol_all.loc[df_edges.coll_begins[1]-pd.Timedelta(mean_width,'D'):df_edges.coll_begins[1]+pd.Timedelta(mean_width,'D')].mean().sum()
df_edges['mr_share1'] = [df_mrcap_all[df_edges.org1[i]].loc[df_edges.coll_begins[i]-pd.Timedelta(mean_width,'D'):df_edges.coll_begins[i]+pd.Timedelta(mean_width,'D')].mean()/mr_cap_ecology for i in range(len(df_edges))]
df_edges['mr_share2'] = [df_mrcap_all[df_edges.org2[i]].loc[df_edges.coll_begins[i]-pd.Timedelta(mean_width,'D'):df_edges.coll_begins[i]+pd.Timedelta(mean_width,'D')].mean()/mr_cap_ecology for i in range(len(df_edges))]
df_edges['tr_volum1'] = [df_trvol_all[df_edges.org1[i]].loc[df_edges.coll_begins[i]-pd.Timedelta(mean_width,'D'):df_edges.coll_begins[i]+pd.Timedelta(mean_width,'D')].mean()/tr_vol_ecology for i in range(len(df_edges))]
df_edges['tr_volum2'] = [df_trvol_all[df_edges.org2[i]].loc[df_edges.coll_begins[i]-pd.Timedelta(mean_width,'D'):df_edges.coll_begins[i]+pd.Timedelta(mean_width,'D')].mean()/tr_vol_ecology for i in range(len(df_edges))]
df_edges.fillna(0,inplace=True)
df_edges.to_pickle('export/df_edges_all_length_Series.pkl')


In [6]:
df_edges

Unnamed: 0,i1,org1,org2,events1,events2,actors,org1_begins,org2_begins,weights,coll_begins,...,coin_age1,coin_age2,mean_mr_share1,mean_mr_share2,mean_tr_volum1,mean_tr_volum2,mr_share1,mr_share2,tr_volum1,tr_volum2
0,2,ADA,DCR,12,1,jcmincke,2017-12-12,2017-09-21,0.003268,2017-12-12,...,2017-10-18,2016-02-10,0.012780,0.001062,0.000519,0.001216,0.899730,0.068825,0.257254,0.125234
1,31,BTC,DCR,1,359,jrick,2014-09-01,2016-02-09,0.016340,2016-02-09,...,2013-04-28,2016-02-10,0.741392,0.001062,0.598979,0.001216,0.869272,0.000017,0.642754,0.000738
2,5,AE,BTC,99,1,zack-bitcoin,2016-11-30,2014-04-20,0.003268,2016-11-30,...,2017-11-08,2013-04-28,0.000975,0.741392,0.000203,0.598979,0.000000,1.786991,0.000000,0.689184
3,8,AMB,BTC,13,4,robertsdotpm,2017-11-26,2014-04-17,0.003268,2017-11-26,...,2017-10-28,2013-04-28,0.000142,0.741392,0.000071,0.598979,0.004397,24.019760,0.003965,25.179334
4,18,BCH,BTC,1,7,justusranvier,2017-10-27,2013-06-05,0.003268,2017-10-27,...,2017-08-02,2013-04-28,0.040076,0.741392,0.051801,0.598979,1.033901,15.517184,2.899567,18.843176
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202,166,NEO,ONT,174,1,Celia18305,2017-10-23,2019-02-21,0.003268,2019-02-21,...,2016-09-09,2018-03-22,0.005631,0.003337,0.009368,0.001633,0.085009,0.070771,0.221416,0.057824
203,183,PART,TIX,128,30,rynomster,2017-04-18,2017-01-25,0.003268,2017-04-18,...,2017-09-16,2017-12-08,0.000206,0.000034,0.000235,0.000074,0.000000,0.000000,0.000000,0.000000
204,197,PPC,XPM,4,7,sunnyking,2017-02-28,2018-07-09,0.003268,2018-07-09,...,2014-07-07,2014-11-23,0.001762,0.000205,0.000614,0.000116,0.005332,0.004530,0.003639,0.001488
205,226,SNT,WPR,1,15,Janther,2017-06-11,2017-07-13,0.003268,2017-07-13,...,2017-07-29,2018-02-13,0.000757,0.000061,0.000555,0.000140,0.000000,0.000000,0.000000,0.000000


### Random edges generation -> generating pairs for 3 different baseline

In [1]:
### Edges generator
import numpy as np
import pandas as pd
import itertools

measure_serie = 'retur'
### Generates 3 datasets with edges:
df_measure = pd.read_pickle('export/df_{}.pkl'.format(measure_serie))
df_measure = df_measure.drop('year',axis=1)
conn_couples = pd.read_pickle('export/df_edges_all_length_Series.pkl')
conn_couples = conn_couples[conn_couples.coll_begins>df_measure.first_valid_index()]
### 1- connected/collaborative couples ('conn_couples.pkl')
conn_couples.to_pickle('conn_couples.pkl')
conn_couples.to_pickle('conn_couples_safe.pkl')

### 2- N random couples with connection time distributed as the connection_time distribution ('rand_conn_time_couples.pkl')
# Select the desired time period from market data
N = 1e5
Rcouples = conn_couples[['org1','org2','coll_begins']].sample(int(N),replace=True).reset_index(drop=True)
Rcouples = Rcouples.rename(columns={'org1':'original_org1','org2':'original_org2'})
Rcouples['org1'],Rcouples['org2'] = np.transpose([list(np.random.choice(df_measure.columns,2,replace=False)) for i in range(int(N))])
Rcouples['coll_action_latter'] = conn_couples['coll_action_latter'].sample(int(N),replace=True).reset_index(drop=True)
Rcouples = Rcouples[(Rcouples.org1!=Rcouples.original_org1)&(Rcouples.org1!=Rcouples.original_org2)&\
                    (Rcouples.org2!=Rcouples.original_org1)&(Rcouples.org2!=Rcouples.original_org2)].reset_index(drop=True)
Rcouples.to_pickle('rand_conn_time_couples.pkl')
Rcouples.to_pickle('rand_conn_time_couples_safe.pkl')


### 3- N random coupling as in 2 plus age condition ('rand_conn_time_and_age_couples.pkl')
###    age condition: sampled cryptos must have the same age of the coupled one (in a "tolerance" range)
con_dates = conn_couples.coll_begins
tolerance = 7 # difference in days from age to which accept  the other candidates
### List the ages of the connected cryptos (for each row we have 2 ages)
ages_latter = []
ages_former = []
crypto_latter = []
crypto_former = []
first_indexes = [df_measure[col].first_valid_index() for col in df_measure.columns]
columns = list(df_measure.columns)
v_index = pd.DataFrame(index=columns,data={'indexes':first_indexes})
old = []
new = []
nold = []
nnew = []
nageold = []
nagenew = []


for i, couple in conn_couples.iterrows():
    if couple.coin_age1>couple.coin_age2:
        old_crypto = couple.org2
        old_enters = couple.coin_age2
        new_crypto = couple.org1
        new_enters = couple.coin_age1
    else:
        old_crypto = couple.org1
        old_enters = couple.coin_age1
        new_crypto = couple.org2
        new_enters = couple.coin_age2
    nold.append(old_crypto)
    nnew.append(new_crypto)
    nageold.append(old_enters)
    nagenew.append(new_enters)
    org_market_entrance = v_index.drop([couple.org1,couple.org2])
    tmp = org_market_entrance[(org_market_entrance.indexes<couple.coin_age1+pd.Timedelta(tolerance,'days'))&(org_market_entrance.indexes>couple.coin_age1-pd.Timedelta(tolerance,'days'))&(org_market_entrance.indexes<couple.coin_age2+pd.Timedelta(tolerance,'days'))&(org_market_entrance.indexes>couple.coin_age2-pd.Timedelta(tolerance,'days'))]
    ## If a crypto is age-compatible with both old and new we randomly assign it to the old or new group
    if len(tmp)>0:
        old_new_num = round(np.random.uniform(0,len(tmp)))
        old_rand = list(np.random.choice(list(tmp.index), old_new_num))
        new_rand = [cpt for cpt in tmp.index if not cpt in old_rand]
    else:
        old_rand = []
        new_rand = []
    org_market_entrance = org_market_entrance.drop(tmp.index)
    old_row = list(org_market_entrance[(org_market_entrance.indexes<couple.coin_age1+pd.Timedelta(tolerance,'days'))&(org_market_entrance.indexes>couple.coin_age1-pd.Timedelta(tolerance,'days'))].index)
    new_row = list(org_market_entrance[(org_market_entrance.indexes<couple.coin_age2+pd.Timedelta(tolerance,'days'))&(org_market_entrance.indexes>couple.coin_age2-pd.Timedelta(tolerance,'days'))].index)
    old_row = old_row+old_rand
    new_row = new_row+new_rand
    if (len(old_row)>0)&(len(new_row)>0):
        old.append(old_row)
        new.append(new_row)
    else:
        old.append([])
        new.append([])

## in this case org1 is also the oldest one (using the market entrance date)
new_conn_couples = pd.DataFrame({'org_old':nold,'org_new':nnew,'age_old':nageold,'age_new':nagenew,'coll_begins':conn_couples.coll_begins,'coll_action_latter':conn_couples.coll_action_latter})
org_old = []
org_new = []
age_old = []
age_new = []
coll_be = []
coll_lat = []
org1 = []
org2 = []

for i in range(len(old)):
        s_old = old[i]
        s_new = new[i]
        s_con = conn_couples.iloc[i].coll_begins
        s_col = conn_couples.iloc[i].coll_action_latter
        if (len(s_old)>0)&(len(s_new)>0):
            for old1, new1 in itertools.product(s_old,s_new):
                if (old1!=new_conn_couples.iloc[i].org_old)&(new1!=new_conn_couples.iloc[i].org_new):
                    org_old.append(old1)
                    org_new.append(new1)
                    coll_be.append(s_con)
                    coll_lat.append(s_col)
                    age_old.append(v_index.loc[old1][0])
                    age_new.append(v_index.loc[new1][0])
                    org1.append(new_conn_couples.iloc[i].org_old)
                    org2.append(new_conn_couples.iloc[i].org_new)
RATcouples = pd.DataFrame({'org1':org_old,'org2':org_new,'age_old':age_old,'age_new':age_new,'coll_begins':coll_be,'coll_action_latter':coll_lat,'original_org1':org1,'original_org2':org2})
RATcouples = RATcouples.sample(int(N),replace=True).reset_index(drop=True)
RATcouples = RATcouples[(RATcouples.org1!=RATcouples.original_org1)&(RATcouples.org1!=RATcouples.original_org2)&\
                    (RATcouples.org2!=RATcouples.original_org1)&(RATcouples.org2!=RATcouples.original_org2)].reset_index(drop=True)
RATcouples.to_pickle('rand_conn_time_and_age_couples.pkl')
RATcouples.to_pickle('rand_conn_time_and_age_couples_safe.pkl')