In [87]:
import pandas as pd
import numpy as np
import json
import networkx as nx
import plotly.express as px

# Data

In [88]:
# Read file: selected participants
input_arg = json.loads(open('../input/participants.json').read())
input_args = json.loads(open('../input/simulation.json').read())
selected_participants = input_arg['participants']

In [89]:
file = "../data/Fitbit_Imputation_FINAL.csv"
df_pal = pd.read_csv(file, sep=';', header=0, encoding='latin-1')
df_pal = df_pal[df_pal['Class'].isin(input_args['classes'])]

df_pal = df_pal.groupby(['Child', 'Class', 'Wave']).mean()['Steps'].reset_index()
# normalize the number of steps: divided by 10,000
df_pal.Steps = df_pal.Steps * 0.0001
df_pal


Unnamed: 0,Child,Class,Wave,Steps
0,1605,81,1,0.659440
1,1605,81,2,0.536140
2,1605,81,3,0.411220
3,1605,81,4,0.645267
4,1605,81,5,0.674440
...,...,...,...,...
1809,6090,303,6,0.278633
1810,6090,303,7,0.777800
1811,6186,272,5,0.596733
1812,6186,272,6,1.012667


In [90]:

df_pal = df_pal.pivot(index='Child', columns='Wave')['Steps']
df_pal


Wave,1,2,3,4,5,6,7
Child,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1605,0.659440,0.536140,0.411220,0.645267,0.674440,0.839840,0.779367
1606,0.685767,0.783220,0.717233,0.414233,0.864100,0.892600,1.071067
1607,0.996140,1.188520,0.771050,0.631375,1.042980,0.172825,1.245550
1608,1.143800,1.130540,0.813220,1.056350,0.782900,0.832433,1.001500
1609,0.555740,0.416067,0.593700,0.581880,0.627525,0.778840,0.413933
...,...,...,...,...,...,...,...
6086,,,,,0.837500,0.836200,0.787200
6088,,,,,0.721320,0.818840,0.850733
6089,,,,,0.765500,0.702440,0.834733
6090,,,,,1.038680,0.278633,0.777800


#### Peer-nomination network

In [91]:
# Read file
nom_data = '../data/W5_Sociometric_long.csv'
df_nom = pd.read_csv(nom_data, sep=';', header=0)

df_nom['Child'].unique()


array([1401, 1402, 1406, 1408, 1409, 1410, 1414, 1420, 1421, 1422, 1423,
       1424, 1428, 1605, 1606, 1607, 1608, 1609, 1610, 1611, 1612, 1613,
       1614, 1615, 1616, 1617, 1619, 1620, 1621, 1622, 1623, 1625, 1626,
       1627, 1628, 1630, 1631, 1632, 1633, 1634, 1635, 1636, 1637, 1639,
       1640, 1641, 1642, 1644, 1645, 1648, 1649, 1650, 1651, 1652, 1653,
       1654, 1655, 1656, 1659, 1660, 1664, 1665, 1666, 1668, 1672, 1675,
       1717, 1719, 1723, 1730, 1733, 1736, 1739, 1807, 1809, 1810, 1812,
       1814, 1815, 1817, 1818, 1820, 1821, 1823, 2262, 2263, 2265, 2266,
       2267, 2268, 2270, 2271, 2272, 2275, 2276, 2277, 2278, 2280, 2281,
       2282, 2283, 2337, 2339, 2341, 2342, 2343, 2344, 2345, 2346, 2347,
       2350, 2351, 2352, 2858, 2859, 2860, 2862, 2863, 2865, 2866, 2868,
       2869, 2872, 2873, 2875, 2876, 2877, 2879, 2880, 2882, 2883, 2885,
       2886, 2887, 2890, 2891, 2892, 2894, 2897, 2898, 2899, 2900, 2901,
       2902, 2903, 2904, 2905, 2906, 2908, 3145, 31

In [92]:
# Only include selected participants (see selection_participants.ipynb)
df_nom = df_nom[df_nom.Child.isin(selected_participants)]

# Only include relevant questions
questions = ['GEN_Advice', 'GEN_Leader', 'GEN_Social_Facilitation','GEN_Want2B']
df_nom = df_nom[df_nom.Variable.isin(questions)]



#### Online communication network

In [93]:
# Read file
primary_school = '../data/Buzz_W5_primary-schools.csv'
secondary_school = '../data/Buzz_W5_secondary-schools.csv'
df_com = pd.read_csv(primary_school, sep=';', header=0, encoding='latin-1')
df_com = df_com.append(pd.read_csv(secondary_school, sep=';', header=0, encoding='latin-1'))
df_com = df_com[df_com.Type=='my']

#
df_com['Participant'].unique()


Columns (22) have mixed types.Specify dtype option on import or set low_memory=False.



array(['3410302337', '3410302341', '3410302342', '3410302343',
       '3410302344', '3410302345', '3410302346', '3410302351',
       '3410302352', '3410304875', '3410304929', '3425004905',
       '3425004906', '3425004907', '3425004908', '3425004909',
       '3425004911', '3425004913', '3425104915', '3425104916',
       '3425104918', '3425104919', '3425104920', '3425104921',
       '3425104922', '3425104923', '3425104924', '3425104925',
       '3425104926', '3425104927', '5629605863', '5629605864',
       '5629605865', '5629605867', '5629605868', '5629605869',
       '5629605871', '5629605872', '5629605874', '5629605875',
       '5629605878', '5629605879', '5629605880', '5629605882',
       '5629605885', '5629605886', '5629605887', '5629705893',
       '5629705894', '5629705897', '5629705898', '5629705901',
       '5629705905', '5629705906', '5629705907', '5629705911',
       '5629705914', '5629705915', '5629705916', '5629705918',
       '5629705919', '5629705920', '5629705922', '30082

In [94]:
# Only include selected participants (see selection_participants.ipynb)
df_com = df_com[df_com.Child.isin(selected_participants)]

df_com

Unnamed: 0,Participant,School,Class,Child,Gender,Age,PI,Date,Time,UMID,...,ShareSource,SharedVia,Number_likes,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29
9508,5629605863,56.0,296.0,5863.0,1.0,10.0,0.0,27-2-2018,15:14,59677,...,,,0,,,,,,,
9509,5629605863,56.0,296.0,5863.0,1.0,10.0,0.0,27-2-2018,15:15,59689,...,,,0,,,,,,,
9510,5629605863,56.0,296.0,5863.0,1.0,10.0,0.0,27-2-2018,15:15,59693,...,,,0,,,,,,,
9511,5629605863,56.0,296.0,5863.0,1.0,10.0,0.0,27-2-2018,15:15,59695,...,,,0,,,,,,,
9514,5629605863,56.0,296.0,5863.0,1.0,10.0,0.0,27-2-2018,15:32,59909,...,,,0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7879,5425805099,54.0,258.0,5099.0,1.0,12.0,0.0,8-3-2018,16:41,98196,...,,,0,,,,,,,
7880,5425805099,54.0,258.0,5099.0,1.0,12.0,0.0,8-3-2018,16:41,98200,...,,,0,,,,,,,
7881,5425805099,54.0,258.0,5099.0,1.0,12.0,0.0,9-3-2018,7:10,100592,...,,,0,,,,,,,
7882,5425805099,54.0,258.0,5099.0,1.0,12.0,0.0,9-3-2018,7:11,100596,...,,,0,,,,,,,


#### Age

In [95]:
# Mean age
class_df_com = df_com[['Child', 'Class', 'Age']].drop_duplicates()
class_df_com.groupby(['Class'])['Age'].describe()

#class_df_com['Age'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
78.0,18.0,11.555556,0.51131,11.0,11.0,12.0,12.0,12.0
81.0,19.0,11.473684,0.772328,11.0,11.0,11.0,12.0,14.0
100.0,19.0,11.473684,0.512989,11.0,11.0,11.0,12.0,12.0
124.0,17.0,11.294118,0.469668,11.0,11.0,11.0,12.0,12.0
125.0,16.0,11.5625,0.629153,11.0,11.0,11.5,12.0,13.0
256.0,22.0,12.181818,0.394771,12.0,12.0,12.0,12.0,13.0
258.0,16.0,12.4375,0.629153,11.0,12.0,12.5,13.0,13.0
261.0,17.0,10.294118,0.469668,10.0,10.0,10.0,11.0,11.0
262.0,15.0,10.6,0.632456,10.0,10.0,11.0,11.0,12.0
263.0,21.0,10.47619,0.511766,10.0,10.0,10.0,11.0,11.0


#### Sex

In [96]:
# Count Female
class_df_com = df_com[['Child', 'Class', 'Gender']].drop_duplicates()
class_df_com.groupby(['Class']).sum()['Gender'].reset_index()



Unnamed: 0,Class,Gender
0,78.0,10.0
1,81.0,10.0
2,100.0,7.0
3,124.0,11.0
4,125.0,9.0
5,256.0,12.0
6,258.0,10.0
7,261.0,6.0
8,262.0,7.0
9,263.0,13.0



#### Physical activity level (PAL)



In [97]:
# Read file: PAL
pal_file = "../data/Fitbit_Imputation_FINAL.csv"

df_pal = pd.read_csv(pal_file, sep=';', header=0, encoding='latin-1')

# select wave 5
df_pal2 = df_pal[df_pal['Wave']==5]

# calculate mean steps
df_pal2 = df_pal2.groupby(['Child', 'Wave']).mean()['Steps'].reset_index()
df_pal2['Steps_scaled'] = df_pal2.Steps * 0.0001 #0.000153

# select participants
df_pal3 = df_pal2[df_pal2.Child.isin(selected_participants)]

# merge with peer-nomination network and online communication data
df_nom2 = pd.merge(df_nom, df_pal2, on='Child')
df_com2 = pd.merge(df_com, df_pal2, on='Child')

df_pal3.pivot(index='Child', columns='Wave')['Steps_scaled']

# Mean PAL by class: use df_com3, not df_nom3, because 2 participants in df_nom3 have no contacts.
class_df_com = df_com2[['Child', 'Class','Steps_scaled']].drop_duplicates()
class_df_com.groupby(['Class'])['Steps_scaled'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
78.0,18.0,0.87016,0.286393,0.507625,0.630245,0.821665,1.036875,1.413375
81.0,19.0,0.794184,0.212885,0.185367,0.686087,0.79814,0.8889,1.1432
100.0,19.0,0.877027,0.340317,0.2094,0.6992,0.94292,1.06757,1.43036
124.0,17.0,0.990502,0.299465,0.52276,0.76878,1.058133,1.2123,1.54082
125.0,16.0,0.98172,0.246594,0.531433,0.805988,0.98827,1.17862,1.4047
256.0,22.0,0.785191,0.322836,0.34905,0.570335,0.696653,0.963208,1.86738
258.0,16.0,1.01516,0.294962,0.55075,0.80043,1.06916,1.201885,1.47884
261.0,17.0,1.041116,0.257415,0.59844,0.84146,1.06765,1.214633,1.421267
262.0,15.0,0.928219,0.306408,0.54215,0.669265,0.887325,1.119133,1.56176
263.0,20.0,0.80507,0.175668,0.468967,0.705521,0.812998,0.917613,1.115


In [98]:
# Overall Mean PAL score
class_df_com['Steps_scaled'].describe()

count    408.000000
mean       0.919938
std        0.313788
min        0.122750
25%        0.704670
50%        0.911460
75%        1.114670
max        1.867380
Name: Steps_scaled, dtype: float64

#### Family Affluence Score (FAS)

In [99]:
# Read file: environmental score
env_file = "../data/W6_Main_Questions.csv"
df_env = pd.read_csv(env_file, sep=';', header=0, encoding='latin-1')

# select FAS questions
df_env = df_env[['Child','GEN_FAS_computer_A01' ,'GEN_FAS_car_A01', 'GEN_FAS_vacation_A01', 'GEN_FAS_ownroom_A01']]

# calculate FAS score
df_env['FAS_score'] = df_env['GEN_FAS_computer_A01'] + df_env['GEN_FAS_vacation_A01'] + df_env['GEN_FAS_car_A01']*1.5 + df_env['GEN_FAS_ownroom_A01']*3

# to keep the values between 0 and 2.
df_env['FAS_scaled'] = abs((df_env.FAS_score/6) - 2)

# select participants
df_env2 = df_env[df_env.Child.isin(selected_participants)]

print("Number of participants with missing FAS score:", np.isnan(df_env2['FAS_scaled']).sum())

# missing will be replaced by overall average value
mean_FAS_scaled = df_env2['FAS_scaled'].mean()
print("Mean FAS_scaled:",mean_FAS_scaled) # different than original because 518 participants instead of 408
mean_FAS_score = df_env2['FAS_score'].mean()
print("Mean FAS_score:",mean_FAS_score) # different than original because 518 participants instead of 408
for i in df_env2.index:
    if np.isnan(df_env2.at[i,'FAS_scaled']):
        df_env2.at[i,'FAS_scaled'] = mean_FAS_scaled
    if np.isnan(df_env2.at[i,'FAS_score']):
        df_env2.at[i,'FAS_score'] = mean_FAS_score

df_env3 = df_env2[['Child', 'FAS_scaled', 'FAS_score']]

# merge with peer-nomination network and online communication data
df_nom3 = pd.merge(df_nom2, df_env3, on='Child')
df_com3 = pd.merge(df_com2, df_env3, on='Child')

# Mean FAS by class: use df_com3, not df_nom3, because 2 participants in df_nom3 have no contacts.
class_df_com = df_com3[['Child', 'Class','FAS_scaled', 'FAS_score']].drop_duplicates()
class_df_com.groupby(['Class'])['FAS_scaled', 'FAS_score'].describe()


Columns (319,321,323,377,434,436,439,441,444,446,449,451,454,456,459,461,464,466,469,471,474,476,479,481,484,486,489,491) have mixed types.Specify dtype option on import or set low_memory=False.


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Number of participants with missing FAS score: 41
Mean FAS_scaled: 0.48819255222524977
Mean FAS_score: 9.0708446866485


Unnamed: 0_level_0,FAS_scaled,FAS_scaled,FAS_scaled,FAS_scaled,FAS_scaled,FAS_scaled,FAS_scaled,FAS_scaled,FAS_score,FAS_score,FAS_score,FAS_score,FAS_score,FAS_score,FAS_score,FAS_score
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
78.0,18.0,0.588657,0.363662,0.0,0.434548,0.488193,0.854167,1.5,18.0,8.468059,2.181969,3.0,6.875,9.070845,9.392711,12.0
81.0,19.0,0.478727,0.269947,0.0,0.333333,0.488193,0.583333,1.166667,19.0,9.127635,1.619681,5.0,8.5,9.070845,10.0,12.0
100.0,19.0,0.564547,0.407132,0.0,0.333333,0.5,0.625,1.666667,19.0,8.61272,2.442793,2.0,8.25,9.0,10.0,12.0
124.0,17.0,0.528023,0.344927,0.0,0.333333,0.488193,0.75,1.083333,17.0,8.831864,2.069563,5.5,7.5,9.070845,10.0,12.0
125.0,16.0,0.495572,0.263589,0.0,0.395833,0.488193,0.583333,1.166667,16.0,9.026567,1.581536,5.0,8.5,9.070845,9.625,12.0
256.0,22.0,0.372853,0.174295,0.0,0.270833,0.416667,0.497048,0.583333,22.0,9.762881,1.045771,8.5,9.017711,9.5,10.375,12.0
258.0,16.0,0.321441,0.332346,0.0,0.0,0.333333,0.434548,1.25,16.0,10.071356,1.994078,4.5,9.392711,10.0,12.0,12.0
261.0,17.0,0.460784,0.255355,0.0,0.333333,0.5,0.583333,0.833333,17.0,9.235294,1.532131,7.0,8.5,9.0,10.0,12.0
262.0,15.0,0.470648,0.298125,0.0,0.333333,0.488193,0.541667,1.0,15.0,9.176113,1.788751,6.0,8.75,9.070845,10.0,12.0
263.0,20.0,0.38691,0.25946,0.0,0.333333,0.333333,0.5,1.083333,20.0,9.678542,1.556762,5.5,9.0,10.0,10.0,12.0


In [100]:
# Overall Mean FAS score
class_df_com['FAS_score'].describe()

count    408.000000
mean       9.070845
std        1.988702
min        2.000000
25%        8.000000
50%        9.070845
75%       10.000000
max       12.000000
Name: FAS_score, dtype: float64

In [101]:
# Overall Mean FAS scaled
class_df_com['FAS_scaled'].describe()

count    408.000000
mean       0.488193
std        0.331450
min        0.000000
25%        0.333333
50%        0.488193
75%        0.666667
max        1.666667
Name: FAS_scaled, dtype: float64

# Create peer-nomination network

In [102]:
# Only select alters present in the class of the child (i.e. participant)
df_nom4 = []
for cl in df_nom3['Class'].unique():
    df_temp = df_nom3[df_nom3.Class.isin([cl])]
    children_in_class = df_temp.Child.unique()
    df_temp = df_temp[df_temp.Alter.isin(children_in_class)]
    df_nom4.append(df_temp)

df_nom4 = pd.concat(df_nom4)
df_nom3.Child.unique()

array([1605, 1606, 1607, 1608, 1609, 1610, 1611, 1612, 1614, 1615, 1617,
       1620, 1623, 1625, 1626, 1627, 1630, 1631, 1632, 1633, 1636, 1637,
       1639, 1640, 1641, 1642, 1645, 1651, 1653, 1654, 1656, 1717, 1719,
       1723, 1733, 1736, 2262, 2263, 2265, 2266, 2267, 2268, 2270, 2271,
       2275, 2276, 2277, 2278, 2280, 2281, 2282, 2283, 2858, 2859, 2860,
       2862, 2863, 2865, 2866, 2868, 2869, 2872, 2873, 2875, 2877, 2880,
       2882, 2883, 2885, 2886, 2887, 2890, 2891, 2894, 2897, 2898, 2899,
       2900, 2901, 2902, 2903, 2904, 2905, 2906, 2908, 3317, 4364, 4425,
       4427, 5011, 5013, 5014, 5015, 5016, 5017, 5018, 5020, 5021, 5022,
       5023, 5024, 5027, 5028, 5029, 5031, 5032, 5033, 5034, 5038, 5039,
       5040, 5070, 5071, 5075, 5076, 5078, 5080, 5081, 5083, 5084, 5085,
       5087, 5092, 5094, 5095, 5096, 5099, 5157, 5159, 5161, 5162, 5163,
       5164, 5165, 5166, 5167, 5168, 5169, 5170, 5174, 5176, 5177, 5179,
       5181, 5182, 5184, 5187, 5189, 5190, 5191, 51

In [103]:
# Create the edges in the graph
peer_nominations_network = nx.DiGraph()

# Read questions to use
formula = json.loads(open('../input/connections_gen.json').read())

# Sum of all questions
max_score = sum(formula.values())

# Create a dictionary with the connections and weights
connections_dict = {}
for child in selected_participants:
    connections_dict[child] = {}

# create network based on unique child ids
for cl in df_nom3['Class'].unique():
    children = df_nom3[df_nom3.Class.isin([cl])].Child.unique()
    for key in children:
        peer_nominations_network.add_node(key)

# To avoid repetition of nominations in different waves
nominations_list = []
for line in df_nom4[['Child', 'Alter', 'Variable']].iterrows():
    (child, alter, var) = line[1]
    # Verify if nominated is in the list of participants (pp)
    if alter in selected_participants and (child, alter, var) not in nominations_list:
        # Add value in the key
        connections_dict[child][alter] = connections_dict[child].get(alter, 0) + 1*formula[var]
        nominations_list.append((child, alter, var))

# Make a dataframe and normalize the values for the edges
connections_df = pd.DataFrame(connections_dict).fillna(0)/max_score
connections_dict = connections_df.to_dict()


df_connections_nomination = []
for node in connections_dict.items():
    child = node[0]
    alter = node[1]
    for alt, weight in alter.items():
        if weight > 0:
            peer_nominations_network.add_edge(child, alt, weight=weight)
            df_connections_nomination.append([child, alt, weight])


df_connections_nomination = pd.DataFrame(df_connections_nomination, columns = ["ChildID", "AlterID", "Weight"])

class_df = df_nom3[['Child', 'Class']].drop_duplicates()
class_df.index = class_df['Child']
class_dict = class_df['Class'].to_dict()



nx.set_node_attributes(peer_nominations_network, class_dict, 'Class')

pd.DataFrame(peer_nominations_network.out_degree(), columns = ['ChildID', "Out-degree"])

#peer_nominations_network.number_of_nodes()

#df_connections_nomination

Unnamed: 0,ChildID,Out-degree
0,1605,18
1,1606,13
2,1607,8
3,1608,18
4,1609,7
...,...,...
403,6085,5
404,6086,6
405,6088,10
406,6089,11


# Create Peer Nomination Network (Social Facilitation)

In [104]:
# Create the edges in the graph
peer_nomination_social_network = nx.DiGraph()

# Read questions to use
formula = json.loads(open('../input/connections_gen.json').read())

# Sum of all questions
max_score = sum(formula.values())

# Create a dictionary with the connections and weights
connections_dict = {}
for child in selected_participants:
    connections_dict[child] = {}

# create network based on unique child ids
for cl in df_nom3['Class'].unique():
    children = df_nom3[df_nom3.Class.isin([cl])].Child.unique()
    for key in children:
        peer_nomination_social_network.add_node(key)

# To avoid repetition of nominations in different waves
nominations_list = []
for line in df_nom4[['Child', 'Alter', 'Variable']].iterrows():
    (child, alter, var) = line[1]
    # Verify if nominated is in the list of participants (pp)
    if alter in selected_participants and (child, alter, var) not in nominations_list:
        # Add value in the key
        connections_dict[child][alter] = connections_dict[child].get(alter, 0) + 1*formula[var]
        nominations_list.append((child, alter, var))

connections_df = pd.DataFrame(connections_dict).fillna(0)/max_score
connections_dict = connections_df.to_dict()



In [105]:
df_edges = df_nom4[df_nom4.Variable == 'GEN_Social_Facilitation']

df_connections_nomination_social = []
for node in connections_dict.items():
    child = node[0]
    alter = node[1]
    for alt, weight in alter.items():
        if weight > 0:
            # only if there is an edge
            if(((df_edges['Child'] == child) & (df_edges['Alter'] == alt)).any()):
                peer_nomination_social_network.add_edge(child, alt, weight=weight)
                df_connections_nomination_social.append([child, alt, weight])

df_connections_nomination_social = pd.DataFrame(df_connections_nomination_social, columns = ["ChildID", "AlterID", "Weight"])

class_df = df_nom3[['Child', 'Class']].drop_duplicates()
class_df.index = class_df['Child']
class_dict = class_df['Class'].to_dict()

nx.set_node_attributes(peer_nomination_social_network, class_dict, 'Class')

pd.DataFrame(peer_nomination_social_network.out_degree(), columns = ['ChildID', "Out-degree"])


Unnamed: 0,ChildID,Out-degree
0,1605,2
1,1606,11
2,1607,2
3,1608,18
4,1609,6
...,...,...
403,6085,4
404,6086,5
405,6088,9
406,6089,11


# Create Online Communication Network

In [106]:
# Rename RecipientChild ID
alter = []
for part in df_com3['RecipientChild']:
    alter.append(float(str(part)[-6:]))

df_com3['Alter'] = alter
df_com3[['Class', 'Child', 'Participant', 'RecipientChild', 'Alter']]


# Only select alters present in the class of the child (i.e. participant)
df_com4 = []
for cl in df_com3['Class'].unique():
    df_temp = df_com3[df_com3.Class.isin([cl])]
    children_in_class = df_temp.Child.unique()
    df_temp = df_temp[df_temp.Alter.isin(children_in_class)]
    df_com4.append(df_temp)

df_com4 = pd.concat(df_com4)
df_com3.Child.unique()


array([5863., 5864., 5865., 5867., 5868., 5869., 5871., 5872., 5874.,
       5875., 5878., 5879., 5880., 5882., 5885., 5886., 5887., 5893.,
       5894., 5897., 5898., 5901., 5905., 5906., 5907., 5911., 5914.,
       5915., 5916., 5918., 5919., 5920., 5922., 2262., 2263., 2266.,
       2267., 2268., 2270., 2271., 2275., 2276., 2277., 2278., 2280.,
       2281., 2282., 2283., 3317., 4425., 4427., 6030., 2265., 5972.,
       5973., 5975., 5979., 5980., 5981., 5982., 5983., 5984., 5985.,
       5988., 5991., 5992., 5993., 5994., 5995., 5996., 5997., 5976.,
       5999., 6001., 6002., 6003., 6005., 6006., 6007., 6010., 6011.,
       6012., 6013., 6015., 6016., 6017., 6019., 6020., 6021., 6023.,
       6024., 6025., 6026., 6028., 6029., 2858., 2859., 2860., 2862.,
       2863., 2865., 2866., 2868., 2869., 2872., 2873., 2875., 2877.,
       2880., 2882., 2883., 2885., 5945., 5946., 5949., 5950., 5951.,
       5952., 5954., 5955., 5956., 5957., 5958., 5959., 5960., 5961.,
       5962., 5964.,

In [107]:
# Create Network

online_communication_network = nx.DiGraph()

df_connections_communication = []
#create the connections - edges
for cl in df_com4['Class'].unique():
    # participants in a class
    df_temp = df_com4[df_com4.Class.isin([cl])]

    children = df_com3[df_com3.Class.isin([cl])].Child.unique()
    for key in children:
        online_communication_network.add_node(key)

    # Number of messages sent by Participant to Recipient Child
    df_temp = df_temp.groupby(["Child","Alter"])["UMID"].count().reset_index(name='NumberMessages')

    # Weight: Number of messages per participant divided by total number of messages sent in a class
    df_temp['Weight'] = df_temp['NumberMessages']/max(df_temp['NumberMessages'])

    for index, row in df_temp.iterrows():
        online_communication_network.add_edge(row[0], row[1], weight=row[3])
        df_connections_communication.append([row[0], row[1], row[3]])

# class per participant
class_df = df_com3[['Child', 'Class']].drop_duplicates()
class_df.index = class_df['Child']
class_dict = class_df['Class'].to_dict()

nx.set_node_attributes(online_communication_network, class_dict, 'Class')

df_connections_communication = pd.DataFrame(df_connections_communication, columns = ["ChildID", "AlterID", "Weight"])

df_connections_communication

online_communication_network.number_of_nodes()

408

# Peer-nomination Descriptives

In [108]:
# Number of nominations
df_number_nominations_given = df_nom4.groupby(['Class','Child'])['Variable'].count().reset_index(name="NumNominations")
df_number_nominations_given['NumNominations'].describe()


count    405.000000
mean      14.987654
std        9.400197
min        1.000000
25%        8.000000
50%       13.000000
75%       19.000000
max       55.000000
Name: NumNominations, dtype: float64

In [109]:
# peer-nominations by class
df_number_nominations_given.groupby('Class')['NumNominations'].sum().reset_index(name="TotalNominations")


Unnamed: 0,Class,TotalNominations
0,78,148
1,81,288
2,100,297
3,124,331
4,125,207
5,256,270
6,258,156
7,261,242
8,262,146
9,263,276


In [110]:
# peer-nominations by class describe
df_number_nominations_given.groupby('Class')['NumNominations'].sum().reset_index(name="TotalNominations").describe()

Unnamed: 0,Class,TotalNominations
count,21.0,21.0
mean,240.047619,289.047619
std,81.553955,109.896986
min,78.0,146.0
25%,256.0,241.0
50%,272.0,276.0
75%,298.0,331.0
max,303.0,588.0


In [111]:
# Total peer-nominations
df_number_nominations_given['NumNominations'].sum()

6070

In [112]:
# Mean number of connections per class
#df_number_nominations_given.groupby('Class')['NumNominations'].describe()
df_nom_connections = df_nom4.groupby(['Class', 'Child', 'Alter']).size().reset_index(name="Size")
df_nom_connections2 = df_nom_connections.groupby(['Class', 'Child']).size().reset_index(name="NumConnections")
df_nom_connections2.groupby('Class')['NumConnections'].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
78,17.0,6.705882,3.368321,3.0,5.0,5.0,7.0,16.0
81,19.0,9.684211,5.044828,3.0,6.0,8.0,14.0,18.0
100,19.0,9.0,4.654747,1.0,5.5,9.0,11.0,18.0
124,17.0,10.647059,3.920159,4.0,9.0,10.0,14.0,16.0
125,16.0,8.25,4.464676,2.0,4.0,7.5,11.25,15.0
256,22.0,7.0,4.070802,1.0,4.25,6.0,8.0,21.0
258,15.0,6.8,2.541091,2.0,5.0,8.0,8.5,10.0
261,17.0,8.058824,4.892221,1.0,4.0,9.0,10.0,16.0
262,15.0,5.533333,1.45733,3.0,5.0,5.0,6.0,9.0
263,20.0,8.15,4.05586,3.0,5.75,7.5,10.25,19.0


In [113]:
# Mean number of connections Total
df_nom_connections2['NumConnections'].describe()


count    405.000000
mean       9.283951
std        5.036094
min        1.000000
25%        5.000000
50%        8.000000
75%       12.000000
max       24.000000
Name: NumConnections, dtype: float64

In [114]:
# Weight of connections

class_list = input_args['classes']

class_dictionary = {}
for c in class_list:
    class_dictionary[c] = []

for node, key in peer_nominations_network.nodes.data('Class'):
    if key in class_dictionary:
        class_dictionary[int(key)].append(node)

list_subgraphs_nom = []
for c in class_list:
    subgraph = peer_nominations_network.subgraph(class_dictionary[c]).copy()
    subgraph.graph['Class']=c
    list_subgraphs_nom.append(subgraph)

population_list = []
node_data_list = []
for subgraph in list_subgraphs_nom:
    dict_in_degree = dict(nx.in_degree_centrality(subgraph))
    dict_closeness = dict(nx.closeness_centrality(subgraph))
    dict_betweenness = dict(nx.betweenness_centrality(subgraph))
    total_agents = subgraph.number_of_nodes()

    for nodedata in subgraph.nodes().data():
        #calculating the average weight
        num_edges = len(subgraph.edges(nodedata[0],data=True))
        avg_weight = 0
        if(num_edges > 0):
            for (u, v, wt) in subgraph.edges(nodedata[0],data=True):
                avg_weight = avg_weight +  wt['weight']

            avg_weight = avg_weight/num_edges

        #participant-level data
        node_data_list.append([nodedata[0], nodedata[1]['Class'], num_edges, dict_in_degree[nodedata[0]], dict_closeness[nodedata[0]],dict_betweenness[nodedata[0]],avg_weight])

    #population level data
    population_list.append([nodedata[1]['Class'], subgraph.number_of_edges(), round(nx.density(subgraph),2)])

#create the dataframes
df_class_nom = pd.DataFrame(population_list, columns = ["SchoolClassID", "NumberConnections", "Density"])

df_participant_nom = pd.DataFrame(node_data_list, columns = ["ParticipantID","SchoolClassID", "num_edges", "InDegree", "Closeness", "Betweenness", "Average_Weight"])


In [115]:

df_participant_nom.groupby('SchoolClassID')['num_edges'].sum()

df_participant_nom['num_edges'].sum()



3760

In [116]:
# Weight of connection by class
df_participant_nom.groupby('SchoolClassID')['Average_Weight'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
SchoolClassID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
78,18.0,0.31007,0.095897,0.0,0.287202,0.3,0.3875,0.416667
81,19.0,0.383636,0.06192,0.25,0.346875,0.392857,0.416667,0.513889
100,19.0,0.415977,0.106049,0.25,0.337121,0.375,0.509615,0.625
124,17.0,0.448661,0.090836,0.285714,0.384615,0.45,0.5,0.625
125,16.0,0.377755,0.076806,0.25,0.320617,0.375,0.412202,0.566667
256,22.0,0.439558,0.110731,0.25,0.358766,0.432292,0.51875,0.666667
258,16.0,0.357999,0.128956,0.0,0.334201,0.366071,0.411111,0.583333
261,17.0,0.403617,0.130615,0.25,0.3125,0.375,0.454545,0.765625
262,15.0,0.434021,0.12202,0.25,0.3625,0.4,0.506944,0.708333
263,20.0,0.434598,0.131843,0.25,0.331731,0.420833,0.476562,0.729167


In [117]:
# Weight of connection (total)
df_participant_nom['Average_Weight'].describe()


count    408.000000
mean       0.398581
std        0.108919
min        0.000000
25%        0.326923
50%        0.375000
75%        0.450781
max        0.765625
Name: Average_Weight, dtype: float64

In [118]:
# density by class
df_class_nom.groupby('SchoolClassID')['Density'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
SchoolClassID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
78,1.0,0.37,,0.37,0.37,0.37,0.37,0.37
81,1.0,0.54,,0.54,0.54,0.54,0.54,0.54
100,1.0,0.5,,0.5,0.5,0.5,0.5,0.5
124,1.0,0.67,,0.67,0.67,0.67,0.67,0.67
125,1.0,0.55,,0.55,0.55,0.55,0.55,0.55
256,1.0,0.33,,0.33,0.33,0.33,0.33,0.33
258,1.0,0.42,,0.42,0.42,0.42,0.42,0.42
261,1.0,0.5,,0.5,0.5,0.5,0.5,0.5
262,1.0,0.4,,0.4,0.4,0.4,0.4,0.4
263,1.0,0.43,,0.43,0.43,0.43,0.43,0.43


In [119]:
# density by class (overall)
df_class_nom['Density'].describe()

count    21.000000
mean      0.489524
std       0.087720
min       0.330000
25%       0.420000
50%       0.500000
75%       0.550000
max       0.670000
Name: Density, dtype: float64

In [120]:
# centrality measures
df_participant_nom.groupby('SchoolClassID')['InDegree', 'Closeness', 'Betweenness'].describe()


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0_level_0,InDegree,InDegree,InDegree,InDegree,InDegree,InDegree,InDegree,InDegree,Closeness,Closeness,Closeness,Closeness,Closeness,Betweenness,Betweenness,Betweenness,Betweenness,Betweenness,Betweenness,Betweenness,Betweenness
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
SchoolClassID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
78,18.0,0.372549,0.112337,0.176471,0.308824,0.352941,0.411765,0.588235,18.0,0.572578,...,0.596561,0.684492,18.0,0.039216,0.068398,0.0,0.009624,0.019706,0.036022,0.3021
81,19.0,0.538012,0.108064,0.388889,0.444444,0.555556,0.638889,0.722222,19.0,0.685283,...,0.735,0.782609,19.0,0.02752,0.024832,0.002682,0.008896,0.01481,0.043487,0.080444
100,19.0,0.5,0.159302,0.166667,0.416667,0.5,0.611111,0.722222,19.0,0.647174,...,0.679487,0.782609,19.0,0.033368,0.045729,0.0,0.006401,0.020732,0.037771,0.175
124,17.0,0.665441,0.192563,0.125,0.625,0.6875,0.8125,0.875,17.0,0.762336,...,0.842105,0.888889,17.0,0.022304,0.02482,0.0,0.004282,0.014606,0.031238,0.081713
125,16.0,0.55,0.140897,0.333333,0.466667,0.533333,0.6,0.866667,16.0,0.693391,...,0.714286,0.882353,16.0,0.032738,0.038255,0.002937,0.005813,0.018475,0.044473,0.138764
256,22.0,0.333333,0.08939,0.142857,0.285714,0.333333,0.380952,0.47619,22.0,0.516636,...,0.567568,0.65625,22.0,0.049134,0.063311,0.0,0.004663,0.008869,0.096976,0.169708
258,16.0,0.425,0.23078,0.133333,0.133333,0.566667,0.6,0.733333,16.0,0.427824,...,0.522667,0.789474,16.0,0.039286,0.061616,0.0,0.0,0.007234,0.052449,0.204762
261,17.0,0.503676,0.125916,0.25,0.4375,0.5,0.5625,0.75,17.0,0.596991,...,0.615385,0.761905,17.0,0.045833,0.073774,0.000521,0.007986,0.010243,0.0625,0.288681
262,15.0,0.395238,0.217437,0.142857,0.25,0.357143,0.5,0.928571,15.0,0.548941,...,0.666667,0.933333,15.0,0.076923,0.079108,0.0,0.013965,0.059524,0.116712,0.272253
263,20.0,0.428947,0.134754,0.210526,0.355263,0.421053,0.539474,0.684211,20.0,0.593646,...,0.638793,0.730769,20.0,0.039766,0.045039,0.0,0.007432,0.021273,0.054431,0.169087


In [121]:
df_participant_nom[['InDegree', 'Closeness', 'Betweenness']].describe()

Unnamed: 0,InDegree,Closeness,Betweenness
count,408.0,408.0,408.0
mean,0.48894,0.633578,0.03274
std,0.163082,0.113246,0.043974
min,0.1,0.152381,0.0
25%,0.386905,0.571429,0.00629
50%,0.5,0.641111,0.015657
75%,0.6,0.697244,0.040499
max,0.944444,0.944444,0.3021


In [122]:
fig = px.histogram(df_participant_nom,
                   x='InDegree',
                   nbins=400)
fig.update_layout(
    width=1000,
    height = 600,
    xaxis_title="In-degree centrality", yaxis_title="Number",
    margin = dict(l=60, r=20, t=15, b=40)
)
fig.show()


# Peer-Nomination (Social Facilitation) Descriptives

In [123]:
# Number of nominations
df_number_nomination_social_given = df_edges.groupby(['Class','Child'])['Variable'].count().reset_index(name="NumNominations")
df_number_nomination_social_given['NumNominations'].describe()


count    404.000000
mean       7.581683
std        4.961966
min        1.000000
25%        4.000000
50%        6.000000
75%       10.000000
max       24.000000
Name: NumNominations, dtype: float64

In [124]:
# peer-nominations by class
df_number_nomination_social_given.groupby('Class')['NumNominations'].sum().reset_index(name="TotalNominations")


Unnamed: 0,Class,TotalNominations
0,78,81
1,81,136
2,100,131
3,124,154
4,125,117
5,256,105
6,258,86
7,261,102
8,262,64
9,263,135


In [125]:
# peer-nominations by class
df_number_nomination_social_given.groupby('Class')['NumNominations'].sum().reset_index(name="TotalNominations").describe()


Unnamed: 0,Class,TotalNominations
count,21.0,21.0
mean,240.047619,145.857143
std,81.553955,59.37953
min,78.0,64.0
25%,256.0,109.0
50%,272.0,132.0
75%,298.0,169.0
max,303.0,291.0


In [126]:
# Total peer-nominations
df_number_nomination_social_given['NumNominations'].sum()

3063

In [127]:
# Mean number of connections per class
#df_number_nominations_given.groupby('Class')['NumNominations'].describe()
df_nom_connection_social = df_edges.groupby(['Class', 'Child', 'Alter']).size().reset_index(name="Size")
df_nom_connection_social2 = df_nom_connection_social.groupby(['Class', 'Child']).size().reset_index(name="NumConnections")
df_nom_connection_social2.groupby('Class')['NumConnections'].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
78,17.0,4.764706,3.400908,2.0,3.0,4.0,5.0,16.0
81,19.0,7.157895,5.156045,2.0,3.0,6.0,10.0,18.0
100,18.0,7.277778,3.528132,1.0,4.25,7.5,9.0,15.0
124,17.0,9.058824,3.766142,4.0,6.0,9.0,12.0,16.0
125,16.0,7.3125,4.512483,1.0,3.75,7.0,11.0,15.0
256,22.0,4.772727,1.823963,1.0,3.25,5.0,6.0,9.0
258,15.0,5.733333,2.814926,1.0,3.0,7.0,8.0,9.0
261,17.0,6.0,3.691206,1.0,2.0,7.0,10.0,10.0
262,15.0,4.266667,1.709915,1.0,3.5,4.0,5.5,7.0
263,20.0,6.75,4.314907,1.0,4.5,6.0,8.5,19.0


In [128]:
# Mean number of connections Total
df_nom_connection_social2['NumConnections'].describe()


count    404.000000
mean       7.581683
std        4.961966
min        1.000000
25%        4.000000
50%        6.000000
75%       10.000000
max       24.000000
Name: NumConnections, dtype: float64

In [129]:
# Weight of connections

class_list = input_args['classes']

class_dictionary = {}
for c in class_list:
    class_dictionary[c] = []

for node, key in peer_nomination_social_network.nodes.data('Class'):
    if key in class_dictionary:
        class_dictionary[int(key)].append(node)

list_subgraphs_nom_soc = []
for c in class_list:
    subgraph = peer_nomination_social_network.subgraph(class_dictionary[c]).copy()
    subgraph.graph['Class']=c
    list_subgraphs_nom_soc.append(subgraph)

population_list = []
node_data_list = []
for subgraph in list_subgraphs_nom_soc:
    dict_in_degree = dict(nx.in_degree_centrality(subgraph))
    dict_closeness = dict(nx.closeness_centrality(subgraph))
    dict_betweenness = dict(nx.betweenness_centrality(subgraph))
    total_agents = subgraph.number_of_nodes()

    for nodedata in subgraph.nodes().data():
        #calculating the average weight
        num_edges = len(subgraph.edges(nodedata[0],data=True))
        avg_weight = 0
        if(num_edges > 0):
            for (u, v, wt) in subgraph.edges(nodedata[0],data=True):
                avg_weight = avg_weight +  wt['weight']

            avg_weight = avg_weight/num_edges

        #participant-level data
        node_data_list.append([nodedata[0], nodedata[1]['Class'], num_edges, dict_in_degree[nodedata[0]], dict_closeness[nodedata[0]],dict_betweenness[nodedata[0]],avg_weight])

    #population level data
    population_list.append([nodedata[1]['Class'], subgraph.number_of_edges(), round(nx.density(subgraph),2)])

#create the dataframes
df_class_nom_soc = pd.DataFrame(population_list, columns = ["SchoolClassID", "NumberConnections", "Density"])

df_participant_nom_soc = pd.DataFrame(node_data_list, columns = ["ParticipantID","SchoolClassID", "num_edges", "InDegree", "Closeness", "Betweenness", "Average_Weight"])


In [130]:
df_participant_nom_soc.groupby('SchoolClassID')['num_edges'].sum()

df_participant_nom_soc['num_edges'].sum()


3063

In [131]:
# Weight of connection by class
df_participant_nom_soc.groupby('SchoolClassID')['Average_Weight'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
SchoolClassID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
78,18.0,0.342998,0.122549,0.0,0.292969,0.333333,0.428125,0.5
81,19.0,0.460097,0.137099,0.25,0.380556,0.416667,0.506944,0.75
100,19.0,0.459285,0.225957,0.0,0.338542,0.375,0.556548,1.0
124,17.0,0.483188,0.127106,0.25,0.416667,0.5,0.535714,0.75
125,16.0,0.38972,0.120701,0.25,0.316761,0.37358,0.425,0.75
256,22.0,0.499874,0.126341,0.25,0.385417,0.5,0.583333,0.708333
258,16.0,0.389931,0.144568,0.0,0.338889,0.383929,0.5,0.583333
261,17.0,0.470979,0.199585,0.25,0.3125,0.472222,0.5,0.972222
262,15.0,0.455119,0.121295,0.25,0.3875,0.458333,0.520833,0.708333
263,20.0,0.447522,0.132415,0.25,0.339912,0.45,0.5,0.729167


In [132]:
# Weight of connection (total)
df_participant_nom_soc['Average_Weight'].describe()

count    408.000000
mean       0.434472
std        0.142935
min        0.000000
25%        0.333333
50%        0.416667
75%        0.500000
max        1.000000
Name: Average_Weight, dtype: float64

In [133]:
# density by class
df_class_nom_soc.groupby('SchoolClassID')['Density'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
SchoolClassID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
78,1.0,0.26,,0.26,0.26,0.26,0.26,0.26
81,1.0,0.4,,0.4,0.4,0.4,0.4,0.4
100,1.0,0.38,,0.38,0.38,0.38,0.38,0.38
124,1.0,0.57,,0.57,0.57,0.57,0.57,0.57
125,1.0,0.49,,0.49,0.49,0.49,0.49,0.49
256,1.0,0.23,,0.23,0.23,0.23,0.23,0.23
258,1.0,0.36,,0.36,0.36,0.36,0.36,0.36
261,1.0,0.38,,0.38,0.38,0.38,0.38,0.38
262,1.0,0.3,,0.3,0.3,0.3,0.3,0.3
263,1.0,0.36,,0.36,0.36,0.36,0.36,0.36


In [134]:
# density by class (overall)
df_class_nom_soc['Density'].describe()

count    21.000000
mean      0.398571
std       0.089067
min       0.230000
25%       0.360000
50%       0.400000
75%       0.450000
max       0.570000
Name: Density, dtype: float64

In [135]:
# centrality measures
df_participant_nom_soc.groupby('SchoolClassID')['InDegree', 'Closeness', 'Betweenness'].describe()


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0_level_0,InDegree,InDegree,InDegree,InDegree,InDegree,InDegree,InDegree,InDegree,Closeness,Closeness,Closeness,Closeness,Closeness,Betweenness,Betweenness,Betweenness,Betweenness,Betweenness,Betweenness,Betweenness,Betweenness
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
SchoolClassID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
78,18.0,0.264706,0.105323,0.058824,0.235294,0.235294,0.338235,0.470588,18.0,0.379884,...,0.452973,0.557734,18.0,0.097222,0.117896,0.0,0.014813,0.075827,0.119271,0.395833
81,19.0,0.397661,0.109968,0.166667,0.333333,0.388889,0.444444,0.611111,19.0,0.571878,...,0.62069,0.72,19.0,0.04558,0.067198,0.0,0.004552,0.018127,0.059357,0.289047
100,19.0,0.383041,0.145691,0.0,0.333333,0.388889,0.5,0.555556,19.0,0.530197,...,0.629872,0.642222,19.0,0.038184,0.044808,0.0,0.007441,0.017996,0.061585,0.170882
124,17.0,0.566176,0.173248,0.125,0.5,0.5625,0.6875,0.8125,17.0,0.702655,...,0.761905,0.842105,17.0,0.029657,0.0391,0.0,0.004614,0.01158,0.036689,0.13157
125,16.0,0.4875,0.125831,0.266667,0.4,0.466667,0.55,0.666667,16.0,0.620607,...,0.659585,0.714286,16.0,0.045238,0.046984,0.0,0.009881,0.03148,0.072253,0.150045
256,22.0,0.227273,0.079045,0.0,0.202381,0.238095,0.285714,0.380952,22.0,0.319357,...,0.388889,0.525,22.0,0.039394,0.091568,0.0,0.000744,0.001885,0.009425,0.302381
258,16.0,0.358333,0.209231,0.0,0.133333,0.433333,0.533333,0.6,16.0,0.392564,...,0.543889,0.6,16.0,0.009821,0.014694,0.0,0.00051,0.003475,0.013914,0.05
261,17.0,0.375,0.198874,0.0625,0.1875,0.5,0.5,0.625,17.0,0.399281,...,0.520833,0.625,17.0,0.007843,0.005534,0.0,0.004167,0.007961,0.0125,0.016667
262,15.0,0.304762,0.14149,0.071429,0.214286,0.285714,0.428571,0.571429,15.0,0.362881,...,0.454887,0.617347,15.0,0.109158,0.109872,0.0,0.000687,0.092033,0.147894,0.326007
263,20.0,0.355263,0.102278,0.210526,0.263158,0.368421,0.434211,0.526316,20.0,0.477613,...,0.598538,0.633333,20.0,0.026316,0.051103,0.0,0.001008,0.008567,0.024854,0.22807


In [136]:
df_participant_nom_soc[['InDegree', 'Closeness', 'Betweenness']].describe()

Unnamed: 0,InDegree,Closeness,Betweenness
count,408.0,408.0,408.0
mean,0.398017,0.536324,0.037863
std,0.15054,0.146722,0.058739
min,0.0,0.0,0.0
25%,0.285714,0.470588,0.003558
50%,0.416667,0.571429,0.014158
75%,0.5,0.638889,0.044774
max,0.8125,0.842105,0.395833


In [137]:
fig = px.histogram(df_participant_nom_soc,
                   x='InDegree',
                   nbins=50)
fig.update_layout(
    width=1000,
    height = 600,
    xaxis_title="In-degree centrality", yaxis_title="Number",
    margin = dict(l=60, r=20, t=15, b=40)
)
fig.show()


# Online Communication Descriptives

In [138]:
# Number of messages by class

df_number_messages = df_com4.groupby(['Class', 'Child'])['UMID'].count().reset_index(name="NumMessages")
df_number_messages['NumMessages'].describe()


count    403.000000
mean      63.868486
std      104.088367
min        1.000000
25%        9.000000
50%       23.000000
75%       69.500000
max      786.000000
Name: NumMessages, dtype: float64

In [139]:
# Number of messages by class

msg_per_class = df_number_messages.groupby(['Class'])['NumMessages'].sum().reset_index(name="TotalMessages")
msg_per_class


Unnamed: 0,Class,TotalMessages
0,78.0,221
1,81.0,143
2,100.0,569
3,124.0,356
4,125.0,561
5,256.0,259
6,258.0,168
7,261.0,448
8,262.0,263
9,263.0,1013


In [140]:
msg_per_class.describe()


Unnamed: 0,Class,TotalMessages
count,21.0,21.0
mean,240.047619,1225.666667
std,81.553955,1309.488653
min,78.0,143.0
25%,256.0,356.0
50%,272.0,569.0
75%,298.0,2168.0
max,303.0,5301.0


In [141]:
# Total number of messages
df_number_messages['NumMessages'].sum()



25739

In [142]:

pairs_freq = df_com4.groupby(["Participant","RecipientChild"])["UMID"].count().reset_index(name='NumberMessages')
pairs_freq

pairs_freq = pairs_freq.astype({'Participant': 'float64'})

pairs = []
values = []
exchanged_messages = []
for line in pairs_freq[["Participant","RecipientChild", "NumberMessages"]].iterrows():
    (p1, p2, var) = line[1]
    if(p1, p2) not in pairs and (p2, p1) not in pairs:
        pairs.append((p1,p2))
        values.append((var))

    elif (p2, p1) in pairs:
        # get the index of this pair and update in the values of the same index
        ind = pairs.index((p2,p1))
        values[ind] = (values[ind] + var)

# print('found pairs',pairs)
# print('found values',values)


count = 0
for line in pairs:
    (pp1, pp2) = line
    exchanged_messages.append((pp1,pp2,values[count]))
    count = count + 1

exchanged_messages = pd.DataFrame(exchanged_messages, columns = ["P1", "P2", "NumMes"])
exchanged_messages.sort_values(by='NumMes',ascending=False).head(10)
exchanged_messages['NumMes'].describe()

count    1860.000000
mean       13.838172
std        39.246523
min         1.000000
25%         2.000000
50%         4.000000
75%        10.000000
max       637.000000
Name: NumMes, dtype: float64

In [143]:
# Plot
fig = px.histogram(exchanged_messages,
                   x="NumMes",
                   nbins=100)
fig.update_layout(
    width=1000,
    height = 600,
    xaxis_title="Number of Exchanged Messages", yaxis_title="Number of pairs",
    margin = dict(l=60, r=20, t=15, b=40)
)
fig.show()

fig.write_image('../output/plots/exchangedMessages.svg', width=1000, height=600)

In [144]:
# Plot (log-trans)
fig = px.histogram(exchanged_messages,
                   x="NumMes",
                   nbins=100)
fig.update_layout(
    width=1000,
    height = 600,
    xaxis_title="Number of Exchanged Messages", yaxis_title="Number of pairs (log scale)",
    yaxis=dict(type='log')
)
fig.show()
fig.write_image('../output/plots/exchangedMessages_log.svg', width=1000, height=600)

In [145]:
# Mean number of connections per class

df_com_connections = df_com3.groupby(['Class', 'Child', 'Alter']).size().reset_index(name="Size")
df_com_connections2 = df_com_connections.groupby(['Class', 'Child']).size().reset_index(name="NumConnections")
df_com_connections2.groupby('Class')['NumConnections'].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
78.0,18.0,3.0,2.0,1.0,1.25,2.5,4.5,7.0
81.0,19.0,4.526316,6.266032,1.0,1.0,2.0,5.5,26.0
100.0,19.0,4.947368,2.634677,1.0,3.0,5.0,7.0,10.0
124.0,17.0,4.647059,3.534494,1.0,2.0,4.0,6.0,16.0
125.0,16.0,4.75,4.946379,1.0,1.75,2.5,5.5,16.0
256.0,22.0,3.272727,1.85631,1.0,2.0,3.0,5.0,7.0
258.0,16.0,2.5,1.549193,1.0,1.75,2.0,3.0,7.0
261.0,17.0,5.294118,3.584074,2.0,3.0,5.0,6.0,16.0
262.0,15.0,4.133333,3.020564,1.0,3.0,3.0,4.5,14.0
263.0,20.0,8.15,4.637093,2.0,6.0,7.0,9.25,19.0


In [146]:
df_com_connections2

Unnamed: 0,Class,Child,NumConnections
0,78.0,1633.0,7
1,78.0,1636.0,3
2,78.0,1637.0,1
3,78.0,1639.0,1
4,78.0,1640.0,1
...,...,...,...
403,303.0,6085.0,5
404,303.0,6086.0,6
405,303.0,6088.0,11
406,303.0,6089.0,7


In [147]:
# Mean number of connections Total
df_com_connections2['NumConnections'].describe()

count    408.000000
mean       7.928922
std        6.247346
min        1.000000
25%        3.000000
50%        6.000000
75%       11.000000
max       28.000000
Name: NumConnections, dtype: float64

In [148]:
# Subgraphs

class_list = input_args['classes']

class_dictionary = {}
for c in class_list:
    class_dictionary[c] = []

for node, key in online_communication_network.nodes.data('Class'):
    if key in class_dictionary:
        class_dictionary[int(key)].append(node)

list_subgraphs_com = []
for c in class_list:
    subgraph = online_communication_network.subgraph(class_dictionary[c]).copy()
    subgraph.graph['Class']=c
    list_subgraphs_com.append(subgraph)


population_list = []
node_data_list = []
for subgraph in list_subgraphs_com:

    dict_in_degree = dict(nx.in_degree_centrality(subgraph))
    dict_closeness = dict(nx.closeness_centrality(subgraph))
    dict_betweenness = dict(nx.betweenness_centrality(subgraph))

    total_agents = subgraph.number_of_nodes()

    for nodedata in subgraph.nodes().data():

        #calculating the average weight
        num_edges = len(subgraph.edges(nodedata[0],data=True))
        avg_weight = 0
        if(num_edges > 0):
            for (u, v, wt) in subgraph.edges(nodedata[0],data=True):
                avg_weight = avg_weight +  wt['weight']

            avg_weight = avg_weight/num_edges

        #participant-level data
        node_data_list.append([nodedata[0], nodedata[1]['Class'], dict_in_degree[nodedata[0]], dict_closeness[nodedata[0]],dict_betweenness[nodedata[0]],avg_weight])

    #population level data
    population_list.append([nodedata[1]['Class'], subgraph.number_of_edges(), round(nx.density(subgraph),2)])

#create the dataframes
df_class_com = pd.DataFrame(population_list, columns = ["SchoolClassID", "NumberConnections", "Density"])

df_participant_com = pd.DataFrame(node_data_list, columns = ["ParticipantID","SchoolClassID", "InDegree", "Closeness", "Betweenness", "Average_Weight"])



In [149]:
# Weight of connections by class
df_participant_com.groupby('SchoolClassID')['Average_Weight'].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
SchoolClassID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
78.0,18.0,0.143614,0.100094,0.0,0.071839,0.126437,0.193966,0.317241
81.0,19.0,0.311294,0.204817,0.0,0.155208,0.3125,0.375,0.875
100.0,19.0,0.061455,0.054753,0.012658,0.021926,0.046414,0.081224,0.222423
124.0,17.0,0.073921,0.08085,0.013699,0.018265,0.041096,0.082192,0.30137
125.0,16.0,0.063141,0.072895,0.009709,0.01699,0.031715,0.075243,0.24164
256.0,22.0,0.241856,0.200878,0.0,0.083333,0.1875,0.323438,0.875
258.0,16.0,0.175967,0.182832,0.0,0.057292,0.083333,0.278274,0.666667
261.0,17.0,0.103337,0.09575,0.025926,0.033333,0.044444,0.207407,0.311111
262.0,15.0,0.21162,0.203273,0.041667,0.111111,0.152778,0.192708,0.791667
263.0,20.0,0.074188,0.079257,0.014446,0.024579,0.05103,0.07718,0.303371


In [150]:
# Weight of connections (overall)
df_participant_com['Average_Weight'].describe()

count    408.000000
mean       0.103170
std        0.123498
min        0.000000
25%        0.026212
50%        0.057781
75%        0.134920
max        0.875000
Name: Average_Weight, dtype: float64

In [151]:
# Density by class
df_class_com.groupby('SchoolClassID')['Density'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
SchoolClassID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
78.0,1.0,0.14,,0.14,0.14,0.14,0.14,0.14
81.0,1.0,0.21,,0.21,0.21,0.21,0.21,0.21
100.0,1.0,0.27,,0.27,0.27,0.27,0.27,0.27
124.0,1.0,0.28,,0.28,0.28,0.28,0.28,0.28
125.0,1.0,0.3,,0.3,0.3,0.3,0.3,0.3
256.0,1.0,0.14,,0.14,0.14,0.14,0.14,0.14
258.0,1.0,0.14,,0.14,0.14,0.14,0.14,0.14
261.0,1.0,0.33,,0.33,0.33,0.33,0.33,0.33
262.0,1.0,0.3,,0.3,0.3,0.3,0.3,0.3
263.0,1.0,0.43,,0.43,0.43,0.43,0.43,0.43


In [152]:
# Density (overall)
df_class_com['Density'].describe()

count    21.000000
mean      0.383810
std       0.179568
min       0.140000
25%       0.280000
50%       0.350000
75%       0.470000
max       0.810000
Name: Density, dtype: float64

In [153]:
# centrality measures
df_participant_com.groupby('SchoolClassID')['InDegree', 'Closeness', 'Betweenness'].describe()



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0_level_0,InDegree,InDegree,InDegree,InDegree,InDegree,InDegree,InDegree,InDegree,Closeness,Closeness,Closeness,Closeness,Closeness,Betweenness,Betweenness,Betweenness,Betweenness,Betweenness,Betweenness,Betweenness,Betweenness
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
SchoolClassID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
78.0,18.0,0.137255,0.078142,0.0,0.117647,0.117647,0.176471,0.294118,18.0,0.25445,...,0.337245,0.413603,18.0,0.074959,0.070462,0.0,0.001379,0.060968,0.12883,0.193015
81.0,19.0,0.207602,0.099453,0.055556,0.138889,0.166667,0.277778,0.388889,19.0,0.41588,...,0.45873,0.535185,19.0,0.073787,0.107082,0.0,0.0,0.001634,0.118464,0.349129
100.0,19.0,0.269006,0.118956,0.055556,0.166667,0.277778,0.333333,0.5,19.0,0.512841,...,0.571573,0.6,19.0,0.057792,0.05929,0.0,0.019643,0.037099,0.076354,0.205766
124.0,17.0,0.279412,0.132799,0.0625,0.1875,0.25,0.375,0.5625,17.0,0.501765,...,0.516129,0.666667,17.0,0.067892,0.125116,0.0,0.005556,0.03588,0.063079,0.520764
125.0,16.0,0.304167,0.137639,0.133333,0.183333,0.3,0.4,0.533333,16.0,0.533646,...,0.576923,0.652174,16.0,0.064583,0.098901,0.0,0.0,0.032738,0.06751,0.334218
256.0,22.0,0.140693,0.072706,0.0,0.095238,0.142857,0.190476,0.238095,22.0,0.318011,...,0.373483,0.42328,22.0,0.087987,0.110388,0.0,0.000298,0.025198,0.158135,0.328571
258.0,16.0,0.1375,0.110135,0.0,0.066667,0.133333,0.15,0.466667,16.0,0.233938,...,0.317647,0.490909,16.0,0.028571,0.064006,0.0,0.0,0.0,0.019048,0.247619
261.0,17.0,0.330882,0.12454,0.125,0.25,0.3125,0.4375,0.625,17.0,0.553905,...,0.592593,0.727273,17.0,0.054902,0.116188,0.0,0.003413,0.015833,0.052431,0.492599
262.0,15.0,0.295238,0.129099,0.142857,0.214286,0.214286,0.357143,0.571429,15.0,0.498218,...,0.538462,0.608696,15.0,0.079487,0.147824,0.0,0.013324,0.033516,0.077289,0.593407
263.0,20.0,0.428947,0.085848,0.263158,0.368421,0.421053,0.473684,0.631579,20.0,0.599897,...,0.633333,0.703704,20.0,0.037573,0.052842,0.002193,0.006684,0.022494,0.040581,0.195705


In [154]:
df_participant_com[['InDegree','Closeness','Betweenness']].describe()

Unnamed: 0,InDegree,Closeness,Betweenness
count,408.0,408.0,408.0
mean,0.394483,0.563494,0.044937
std,0.215439,0.171876,0.075437
min,0.0,0.0,0.0
25%,0.222222,0.473684,0.003262
50%,0.375,0.580645,0.01658
75%,0.55,0.666667,0.050366
max,1.0,1.0,0.593407


In [155]:
fig = px.histogram(df_participant_com,
                   x='InDegree',
                   nbins=50)
fig.update_layout(
    width=1000,
    height = 600,
    xaxis_title="In-degree centrality", yaxis_title="Number",
    margin = dict(l=60, r=20, t=15, b=40)
)
fig.show()


In [156]:
df_participant_com

Unnamed: 0,ParticipantID,SchoolClassID,InDegree,Closeness,Betweenness,Average_Weight
0,1633.0,78.0,0.176471,0.339367,0.101716,0.051724
1,1636.0,78.0,0.176471,0.378151,0.193015,0.034483
2,1637.0,78.0,0.117647,0.259516,0.009804,0.137931
3,1639.0,78.0,0.000000,0.000000,0.000000,0.103448
4,1640.0,78.0,0.058824,0.228195,0.000000,0.275862
...,...,...,...,...,...,...
403,6085.0,303.0,0.333333,0.571429,0.003356,0.011570
404,6086.0,303.0,0.250000,0.558140,0.008166,0.008953
405,6088.0,303.0,0.416667,0.615385,0.019777,0.039444
406,6089.0,303.0,0.250000,0.558140,0.001958,0.014168


# Network Similarity (Peer-Nomination vs. Online Communication)

In [157]:
class_network_similarity = []
for i in range(0,21):
    g1 = list_subgraphs_nom[i]
    g2 = list_subgraphs_com[i]


    max_edges = g1.number_of_nodes()**2-g1.number_of_nodes()
    true_positive=set(g1.edges()).intersection(set(g2.edges()))
    # g1.diff(g2) -> edges that are found in GT but not discovered in BT
    false_negative=set(g1.edges()).difference(set(g2.edges()))
    # g1.diff(g2) -> edges that are found in BT but not discovered in GT
    false_positive=set(g2.edges()).difference(set(g1.edges()))
    # not sure if true_negative should be actually 0, union or set(g1.edges()).
    # I think the last makes most sense since it represents the difference between all possible edges and the ones that are actually in the ground truth data, i.e. the nomination-based graph
    union=set(g1.edges()).union(set(g2.edges()))
    true_negative=max_edges-len(union)
    d1=(len(true_positive)+true_negative) if (len(true_positive)+true_negative)>0 else -1
    d2=(len(true_positive)+true_negative+len(false_negative)+len(false_positive)) if (len(true_positive)+true_negative+len(false_negative)+len(false_positive))>0 else -1
    network_similarity=d1/d2

    class_network_similarity.append(network_similarity)

pd.DataFrame(class_network_similarity)

Unnamed: 0,0
0,0.705882
1,0.535088
2,0.652047
3,0.5625
4,0.4875
5,0.755411
6,0.704167
7,0.643382
8,0.747619
9,0.668421


In [158]:
print("Mean:", sum(class_network_similarity)/len(class_network_similarity))

Mean: 0.6510657701815905


# Network Similarity (Peer-Nomination Social vs. Online Communication)


In [159]:
class_network_similarity = []
for i in range(0,21):
    g1 = list_subgraphs_nom_soc[i]
    g2 = list_subgraphs_com[i]


    max_edges = g1.number_of_nodes()**2-g1.number_of_nodes()
    true_positive=set(g1.edges()).intersection(set(g2.edges()))
    # g1.diff(g2) -> edges that are found in GT but not discovered in BT
    false_negative=set(g1.edges()).difference(set(g2.edges()))
    # g1.diff(g2) -> edges that are found in BT but not discovered in GT
    false_positive=set(g2.edges()).difference(set(g1.edges()))
    # not sure if true_negative should be actually 0, union or set(g1.edges()).
    # I think the last makes most sense since it represents the difference between all possible edges and the ones that are actually in the ground truth data, i.e. the nomination-based graph
    union=set(g1.edges()).union(set(g2.edges()))
    true_negative=max_edges-len(union)
    d1=(len(true_positive)+true_negative) if (len(true_positive)+true_negative)>0 else -1
    d2=(len(true_positive)+true_negative+len(false_negative)+len(false_positive)) if (len(true_positive)+true_negative+len(false_negative)+len(false_positive))>0 else -1
    network_similarity=d1/d2

    class_network_similarity.append(network_similarity)

pd.DataFrame(class_network_similarity)


Unnamed: 0,0
0,0.794118
1,0.640351
2,0.710526
3,0.595588
4,0.516667
5,0.848485
6,0.7625
7,0.727941
8,0.809524
9,0.7


In [162]:
class_network_similarity = []

tp = []
fn = []
fp = []
tn = []
for i in range(0,21):
    g1 = list_subgraphs_nom_soc[i]
    g2 = list_subgraphs_com[i]


    max_edges = g1.number_of_nodes()**2-g1.number_of_nodes()
    true_positive=set(g1.edges()).intersection(set(g2.edges()))
    # g1.diff(g2) -> edges that are found in GT but not discovered in BT
    false_negative=set(g1.edges()).difference(set(g2.edges()))
    # g1.diff(g2) -> edges that are found in BT but not discovered in GT
    false_positive=set(g2.edges()).difference(set(g1.edges()))
    # not sure if true_negative should be actually 0, union or set(g1.edges()).
    # I think the last makes most sense since it represents the difference between all possible edges and the ones that are actually in the ground truth data, i.e. the nomination-based graph
    union=set(g1.edges()).union(set(g2.edges()))
    true_negative=max_edges-len(union)
    d1=(len(true_positive)+true_negative) if (len(true_positive)+true_negative)>0 else -1
    d2=(len(true_positive)+true_negative+len(false_negative)+len(false_positive)) if (len(true_positive)+true_negative+len(false_negative)+len(false_positive))>0 else -1
    network_similarity=d1/d2


    tn.append(true_positive)
    fn.append(false_negative)
    fp.append(false_positive)
    tn.append(true_negative)

    class_network_similarity.append(network_similarity)

pd.DataFrame(class_network_similarity)


Unnamed: 0,0
0,0.794118
1,0.640351
2,0.710526
3,0.595588
4,0.516667
5,0.848485
6,0.7625
7,0.727941
8,0.809524
9,0.7


In [163]:
print("Mean:", sum(class_network_similarity)/len(class_network_similarity))

Mean: 0.686271034544692


# Statistical test comparing network density between networks

In [164]:
import scipy.stats as stats

# Test normality
dens_norm = stats.shapiro(df_class_nom['Density']- df_class_com['Density'])
print(dens_norm) # normal distribution

dens_norm = stats.shapiro(df_class_nom_soc['Density']- df_class_com['Density'])
print('soc', dens_norm) # normal distribution


# Test homogeneity
dens_hom =  stats.levene(df_class_nom['Density'], df_class_com['Density'], center= 'mean')
print(dens_hom) # no equal variance

dens_hom =  stats.levene(df_class_nom_soc['Density'], df_class_com['Density'], center= 'mean')
print('soc', dens_hom) # no equal variance


print(stats.ttest_ind(a=df_class_nom['Density'], b=df_class_com['Density']))
print('soc', stats.ttest_ind(a=df_class_nom_soc['Density'], b=df_class_com['Density']))

ShapiroResult(statistic=0.9475237727165222, pvalue=0.30558153986930847)
soc ShapiroResult(statistic=0.9452391862869263, pvalue=0.27606627345085144)
LeveneResult(statistic=8.038241311515886, pvalue=0.007149048732339385)
soc LeveneResult(statistic=7.652917792044718, pvalue=0.00853861151985465)
Ttest_indResult(statistic=2.4240517424590253, pvalue=0.019962010118200483)
soc Ttest_indResult(statistic=0.33748925334646335, pvalue=0.7375127968725661)


In [165]:
# test density
print(stats.mannwhitneyu(x=df_class_nom['Density'], y=df_class_com['Density'], alternative = 'two-sided'))
print('soc', stats.mannwhitneyu(x=df_class_nom_soc['Density'], y=df_class_com['Density'], alternative = 'two-sided'))

MannwhitneyuResult(statistic=329.0, pvalue=0.006556224006376486)
soc MannwhitneyuResult(statistic=249.5, pvalue=0.4731249756345203)


# Statistical test comparing centrality measures between the two networks

In [166]:
stats.mannwhitneyu(x=df_participant_nom_soc['InDegree'], y=df_participant_com['InDegree'], alternative = 'two-sided')

MannwhitneyuResult(statistic=87311.0, pvalue=0.22554648678773703)

In [167]:
stats.mannwhitneyu(x=df_participant_nom_soc['Betweenness'], y=df_participant_com['Betweenness'], alternative = 'two-sided')

MannwhitneyuResult(statistic=81823.5, pvalue=0.6755754343684144)

In [168]:
stats.mannwhitneyu(x=df_participant_nom_soc['Closeness'], y=df_participant_com['Closeness'], alternative = 'two-sided')

MannwhitneyuResult(statistic=75104.0, pvalue=0.01576087459971613)

In [169]:
# By class

# In-Degree
for ClassID in df_participant_nom.SchoolClassID.unique():
    print(ClassID, " InDegree", stats.mannwhitneyu(x=df_participant_nom_soc[df_participant_nom.SchoolClassID.isin([ClassID])]['InDegree'],
                                                   y=df_participant_com[df_participant_com.SchoolClassID.isin([ClassID])]['InDegree'],
                                                   alternative = 'two-sided') )



78  InDegree MannwhitneyuResult(statistic=272.0, pvalue=0.0004317016026544767)
81  InDegree MannwhitneyuResult(statistic=323.0, pvalue=2.91380439543227e-05)
100  InDegree MannwhitneyuResult(statistic=275.0, pvalue=0.005536128690929143)
124  InDegree MannwhitneyuResult(statistic=261.0, pvalue=5.9101832978284183e-05)
125  InDegree MannwhitneyuResult(statistic=212.0, pvalue=0.00149488619470582)
256  InDegree MannwhitneyuResult(statistic=390.5, pvalue=0.0003843658684147711)
258  InDegree MannwhitneyuResult(statistic=208.0, pvalue=0.0023273251006213666)
261  InDegree MannwhitneyuResult(statistic=176.5, pvalue=0.27360389537693497)
262  InDegree MannwhitneyuResult(statistic=119.0, pvalue=0.7978799100852163)
263  InDegree MannwhitneyuResult(statistic=124.5, pvalue=0.03908362996957283)
272  InDegree MannwhitneyuResult(statistic=231.5, pvalue=0.24546555964180528)
273  InDegree MannwhitneyuResult(statistic=381.5, pvalue=0.053031859304928396)
292  InDegree MannwhitneyuResult(statistic=54.5, pvalue

In [170]:
# Closeness
for ClassID in df_participant_nom.SchoolClassID.unique():
    print(ClassID, " Closeness", stats.mannwhitneyu(x=df_participant_nom_soc[df_participant_nom.SchoolClassID.isin([ClassID])]['Closeness'],
                                                    y=df_participant_com[df_participant_com.SchoolClassID.isin([ClassID])]['Closeness'],
                                                    alternative = 'two-sided') )



78  Closeness MannwhitneyuResult(statistic=255.5, pvalue=0.003229085550398658)
81  Closeness MannwhitneyuResult(statistic=345.0, pvalue=1.6265298661355777e-06)
100  Closeness MannwhitneyuResult(statistic=240.0, pvalue=0.08426436739957147)
124  Closeness MannwhitneyuResult(statistic=282.5, pvalue=1.866087240260366e-06)
125  Closeness MannwhitneyuResult(statistic=210.5, pvalue=0.0018951674615018121)
256  Closeness MannwhitneyuResult(statistic=259.5, pvalue=0.6891190517825323)
258  Closeness MannwhitneyuResult(statistic=188.5, pvalue=0.023469782234543105)
261  Closeness MannwhitneyuResult(statistic=87.0, pvalue=0.04825843084912617)
262  Closeness MannwhitneyuResult(statistic=40.0, pvalue=0.0027829433055227195)
263  Closeness MannwhitneyuResult(statistic=124.5, pvalue=0.040851697293104644)
272  Closeness MannwhitneyuResult(statistic=202.0, pvalue=0.07625007521243388)
273  Closeness MannwhitneyuResult(statistic=402.0, pvalue=0.018274390389591035)
292  Closeness MannwhitneyuResult(statistic=

In [172]:
# Betweenness
for ClassID in df_participant_nom.SchoolClassID.unique():
    print(ClassID, " Betweenness", stats.mannwhitneyu(x=df_participant_nom_soc[df_participant_nom.SchoolClassID.isin([ClassID])]['Betweenness'],
                                                      y=df_participant_com[df_participant_com.SchoolClassID.isin([ClassID])]['Betweenness'],
                                                      alternative = 'two-sided') )

78  Betweenness MannwhitneyuResult(statistic=169.5, pvalue=0.8237755259641628)
81  Betweenness MannwhitneyuResult(statistic=212.0, pvalue=0.3622752795365599)
100  Betweenness MannwhitneyuResult(statistic=144.5, pvalue=0.2999828200366239)
124  Betweenness MannwhitneyuResult(statistic=122.0, pvalue=0.44845735060097736)
125  Betweenness MannwhitneyuResult(statistic=141.5, pvalue=0.6230480616677492)
256  Betweenness MannwhitneyuResult(statistic=178.5, pvalue=0.13581747252290333)
258  Betweenness MannwhitneyuResult(statistic=142.0, pvalue=0.5982055318412838)
261  Betweenness MannwhitneyuResult(statistic=85.0, pvalue=0.042010967085694066)
262  Betweenness MannwhitneyuResult(statistic=137.0, pvalue=0.31636346001862237)
263  Betweenness MannwhitneyuResult(statistic=143.0, pvalue=0.12634166389801896)
272  Betweenness MannwhitneyuResult(statistic=319.0, pvalue=0.5294037439082268)
273  Betweenness MannwhitneyuResult(statistic=307.0, pvalue=0.7028439358015917)
292  Betweenness MannwhitneyuResult(s

In [106]:
stats.shapiro(df_participant_nom['Average_Weight'])

ShapiroResult(statistic=0.9436861276626587, pvalue=2.508977574311455e-11)

In [107]:
stats.shapiro(df_participant_com['Average_Weight'])

ShapiroResult(statistic=0.6998010873794556, pvalue=2.3803985667120917e-26)

In [108]:
fig = px.histogram(df_participant_nom,
                   x="Average_Weight",
                   nbins=100)
fig.update_layout(
    width=1000,
    height = 600,
    xaxis_title="Connection Weight", yaxis_title="N"
    #yaxis=dict(type='log')
)
fig.show()

In [109]:
fig = px.histogram(df_participant_nom_soc,
                   x="Average_Weight",
                   nbins=100)
fig.update_layout(
    width=1000,
    height = 600,
    xaxis_title="Connection Weight", yaxis_title="N"
    #yaxis=dict(type='log')
)
fig.show()

In [110]:
fig = px.histogram(df_participant_com,
                   x="Average_Weight",
                   nbins=100)
fig.update_layout(
    width=1000,
    height = 600,
    xaxis_title="Connection Weight", yaxis_title="N"
    #yaxis=dict(type='log')
)
fig.show()