In [144]:
import pandas as pd
import numpy as np
import json
import networkx as nx


# Data

In [192]:
# Read the run configuration: the participant selection and the simulation settings.
# Context managers make sure both file handles are closed once the JSON is parsed
# (the previous open(...).read() calls left them open until garbage collection).
with open('../input/participants.json') as participants_file:
    input_arg = json.load(participants_file)
with open('../input/simulation.json') as simulation_file:
    input_args = json.load(simulation_file)

# IDs of the children included in the analysis (see selection_participants.ipynb)
selected_participants = input_arg['participants']

#### Peer-nomination network

In [240]:
# Load the sociometric (peer-nomination) file; it is ';'-separated with a header row.
nom_data = '../data/W5_Sociometric_long.csv'
questions = ['GEN_Advice', 'GEN_Leader', 'GEN_Social_Facilitation','GEN_Want2B']
df_nom = pd.read_csv(nom_data, sep=';', header=0)

# Keep a row only when the nominating child is a selected participant
# (see selection_participants.ipynb) AND the item is one of the relevant questions.
df_nom = df_nom.loc[
    df_nom['Child'].isin(selected_participants) & df_nom['Variable'].isin(questions)
]


#### Online communication network

In [241]:
# Read the primary- and secondary-school message exports and stack them.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. Like append, concat keeps each file's own row labels.
primary_school = '../data/Buzz_W5_primary-schools.csv'
secondary_school = '../data/Buzz_W5_secondary-schools.csv'
df_com = pd.concat([
    pd.read_csv(primary_school, sep=';', header=0, encoding='latin-1'),
    pd.read_csv(secondary_school, sep=';', header=0, encoding='latin-1'),
])

# Keep only rows of Type 'my'.
# NOTE(review): presumably these are messages authored by the participant — confirm against the export spec.
df_com = df_com[df_com.Type=='my']

# Only include selected participants (see selection_participants.ipynb)
df_com = df_com[df_com.Child.isin(selected_participants)]

df_com

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Participant,School,Class,Child,Gender,Age,PI,Date,Time,UMID,...,ShareSource,SharedVia,Number_likes,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29
9508,5629605863,56.0,296.0,5863.0,1.0,10.0,0.0,27-2-2018,15:14,59677,...,,,0,,,,,,,
9509,5629605863,56.0,296.0,5863.0,1.0,10.0,0.0,27-2-2018,15:15,59689,...,,,0,,,,,,,
9510,5629605863,56.0,296.0,5863.0,1.0,10.0,0.0,27-2-2018,15:15,59693,...,,,0,,,,,,,
9511,5629605863,56.0,296.0,5863.0,1.0,10.0,0.0,27-2-2018,15:15,59695,...,,,0,,,,,,,
9514,5629605863,56.0,296.0,5863.0,1.0,10.0,0.0,27-2-2018,15:32,59909,...,,,0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7879,5425805099,54.0,258.0,5099.0,1.0,12.0,0.0,8-3-2018,16:41,98196,...,,,0,,,,,,,
7880,5425805099,54.0,258.0,5099.0,1.0,12.0,0.0,8-3-2018,16:41,98200,...,,,0,,,,,,,
7881,5425805099,54.0,258.0,5099.0,1.0,12.0,0.0,9-3-2018,7:10,100592,...,,,0,,,,,,,
7882,5425805099,54.0,258.0,5099.0,1.0,12.0,0.0,9-3-2018,7:11,100596,...,,,0,,,,,,,


#### Age

In [242]:
# Age descriptives: reduce the message-level data to one row per distinct
# (Child, Class, Age) combination before summarising.
class_df_com = df_com[['Child', 'Class', 'Age']].drop_duplicates()
# Per-class summary. NOTE: this is not the last expression of the cell, so the
# notebook evaluates it but does not display it.
class_df_com.groupby(['Class'])['Age'].describe()

# Summary over all classes pooled together; this is the value the cell displays.
class_df_com['Age'].describe()

count    409.000000
mean      10.608802
std        1.030667
min        9.000000
25%       10.000000
50%       11.000000
75%       11.000000
max       14.000000
Name: Age, dtype: float64

#### Sex

In [243]:
# Sum of the Gender code per class, computed on one row per distinct (Child, Class, Gender).
# NOTE(review): this equals the number of girls only if Gender is coded 1 = female — confirm the coding.
class_df_com = df_com.loc[:, ['Child', 'Class', 'Gender']].drop_duplicates()
class_df_com.groupby(['Class'])['Gender'].sum().reset_index()



Unnamed: 0,Class,Gender
0,78.0,10.0
1,81.0,10.0
2,100.0,7.0
3,124.0,11.0
4,125.0,9.0
5,256.0,12.0
6,258.0,10.0
7,261.0,6.0
8,262.0,7.0
9,263.0,13.0



#### Physical activity level (PAL)



In [244]:
# Read file: PAL
pal_file = "../data/Fitbit_Imputation_FINAL.csv"

df_pal = pd.read_csv(pal_file, sep=';', header=0, encoding='latin-1')

# select wave 5
df_pal2 = df_pal[df_pal['Wave']==5]

# Mean steps per child. 'Steps' is selected BEFORE aggregating so that only that
# column is averaged; calling .mean() on the whole group also tries to average every
# other column, which raises on non-numeric columns in pandas >= 2.0.
df_pal2 = df_pal2.groupby(['Child', 'Wave'])['Steps'].mean().reset_index()
# Rescale the mean step count by a fixed factor (0.000153 is an earlier alternative factor).
df_pal2['Steps_scaled'] = df_pal2.Steps * 0.0001 #0.000153

# select participants
df_pal3 = df_pal2[df_pal2.Child.isin(selected_participants)]

# Merge with peer-nomination network and online communication data.
# pd.merge defaults to an inner join, so children without wave-5 step data drop out
# of df_nom2 / df_com2. The 'Wave' column carried by df_pal2 is merged in as well.
df_nom2 = pd.merge(df_nom, df_pal2, on='Child')
df_com2 = pd.merge(df_com, df_pal2, on='Child')

# Mean PAL by class, computed from the online-communication frame (df_com2).
# Original author's note: the nomination frame is not used here because 2 of its
# participants have no contacts.
class_df_com = df_com2[['Child', 'Class','Steps_scaled']].drop_duplicates()
class_df_com.groupby(['Class'])['Steps_scaled'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
78.0,18.0,0.87016,0.286393,0.507625,0.630245,0.821665,1.036875,1.413375
81.0,19.0,0.794184,0.212885,0.185367,0.686087,0.79814,0.8889,1.1432
100.0,19.0,0.877027,0.340317,0.2094,0.6992,0.94292,1.06757,1.43036
124.0,17.0,0.990502,0.299465,0.52276,0.76878,1.058133,1.2123,1.54082
125.0,16.0,0.98172,0.246594,0.531433,0.805988,0.98827,1.17862,1.4047
256.0,22.0,0.785191,0.322836,0.34905,0.570335,0.696653,0.963208,1.86738
258.0,16.0,1.01516,0.294962,0.55075,0.80043,1.06916,1.201885,1.47884
261.0,17.0,1.041116,0.257415,0.59844,0.84146,1.06765,1.214633,1.421267
262.0,15.0,0.928219,0.306408,0.54215,0.669265,0.887325,1.119133,1.56176
263.0,20.0,0.80507,0.175668,0.468967,0.705521,0.812998,0.917613,1.115


In [245]:
# Overall PAL: summary statistics of Steps_scaled over all rows of class_df_com,
# i.e. all classes pooled together.
class_df_com['Steps_scaled'].describe()

count    408.000000
mean       0.919938
std        0.313788
min        0.122750
25%        0.704670
50%        0.911460
75%        1.114670
max        1.867380
Name: Steps_scaled, dtype: float64

#### Family Affluence Score (FAS)

In [246]:
# Read file: environmental score
env_file = "../data/W6_Main_Questions.csv"
df_env = pd.read_csv(env_file, sep=';', header=0, encoding='latin-1')

# Select FAS questions. .copy() makes this an independent frame, so adding columns
# below does not write into a slice of the raw data (SettingWithCopyWarning).
df_env = df_env[['Child','GEN_FAS_computer_A01' ,'GEN_FAS_car_A01', 'GEN_FAS_vacation_A01', 'GEN_FAS_ownroom_A01']].copy()

# Calculate FAS score: weighted sum of the four items (car x1.5, own room x3).
df_env['FAS_score'] = df_env['GEN_FAS_computer_A01'] + df_env['GEN_FAS_vacation_A01'] + df_env['GEN_FAS_car_A01']*1.5 + df_env['GEN_FAS_ownroom_A01']*3

# Rescale: |score/6 - 2|. Original author's note: this keeps the values between 0 and 2.
# NOTE(review): that range depends on the coding of the four items — confirm.
df_env['FAS_scaled'] = abs((df_env.FAS_score/6) - 2)

# Select participants (again as an independent copy, because missing values are filled in below).
df_env2 = df_env[df_env.Child.isin(selected_participants)].copy()

print("Number of participants with missing FAS score:", df_env2['FAS_scaled'].isna().sum())

# Missing scores are replaced by the mean over the selected participants.
# The mean is computed before filling, so imputed values do not influence it.
mean_FAS = df_env2['FAS_scaled'].mean()
print("Mean FAS score:",mean_FAS) # different than original because 518 participants instead of 408
df_env2['FAS_scaled'] = df_env2['FAS_scaled'].fillna(mean_FAS)

df_env3 = df_env2[['Child', 'FAS_scaled']]

# Merge with peer-nomination network and online communication data
# (inner join: children without a row in the FAS file are dropped).
df_nom3 = pd.merge(df_nom2, df_env3, on='Child')
df_com3 = pd.merge(df_com2, df_env3, on='Child')

# Mean FAS by class: use df_com3, not df_nom3, because 2 participants in df_nom3 have no contacts.
class_df_com = df_com3[['Child', 'Class','FAS_scaled']].drop_duplicates()
class_df_com.groupby(['Class'])['FAS_scaled'].describe()

Number of participants with missing FAS score: 41
Mean FAS score: 0.48819255222524977


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
78.0,18.0,0.588657,0.363662,0.0,0.434548,0.488193,0.854167,1.5
81.0,19.0,0.478727,0.269947,0.0,0.333333,0.488193,0.583333,1.166667
100.0,19.0,0.564547,0.407132,0.0,0.333333,0.5,0.625,1.666667
124.0,17.0,0.528023,0.344927,0.0,0.333333,0.488193,0.75,1.083333
125.0,16.0,0.495572,0.263589,0.0,0.395833,0.488193,0.583333,1.166667
256.0,22.0,0.372853,0.174295,0.0,0.270833,0.416667,0.497048,0.583333
258.0,16.0,0.321441,0.332346,0.0,0.0,0.333333,0.434548,1.25
261.0,17.0,0.460784,0.255355,0.0,0.333333,0.5,0.583333,0.833333
262.0,15.0,0.470648,0.298125,0.0,0.333333,0.488193,0.541667,1.0
263.0,20.0,0.38691,0.25946,0.0,0.333333,0.333333,0.5,1.083333


In [247]:
# Overall FAS: summary statistics of FAS_scaled over all rows of class_df_com,
# i.e. all classes pooled together.
class_df_com['FAS_scaled'].describe()

count    408.000000
mean       0.488193
std        0.331450
min        0.000000
25%        0.333333
50%        0.488193
75%        0.666667
max        1.666667
Name: FAS_scaled, dtype: float64

# Create peer-nomination network

In [250]:
# Keep a nomination only when the nominated child (Alter) is itself a participant
# (Child) in the nominating child's class. Classes are handled one at a time and
# the filtered pieces are stacked afterwards.
nomination_frames_by_class = []
for cl in df_nom3['Class'].unique():
    rows_of_class = df_nom3.loc[df_nom3['Class'].isin([cl])]
    classmates = rows_of_class['Child'].unique()
    nomination_frames_by_class.append(rows_of_class.loc[rows_of_class['Alter'].isin(classmates)])

df_nom4 = pd.concat(nomination_frames_by_class)
# Displayed value: the participant IDs present in the unfiltered frame.
df_nom3.Child.unique()

array([1605, 1606, 1607, 1608, 1609, 1610, 1611, 1612, 1614, 1615, 1617,
       1620, 1623, 1625, 1626, 1627, 1630, 1631, 1632, 1633, 1636, 1637,
       1639, 1640, 1641, 1642, 1645, 1651, 1653, 1654, 1656, 1717, 1719,
       1723, 1733, 1736, 2262, 2263, 2265, 2266, 2267, 2268, 2270, 2271,
       2275, 2276, 2277, 2278, 2280, 2281, 2282, 2283, 2858, 2859, 2860,
       2862, 2863, 2865, 2866, 2868, 2869, 2872, 2873, 2875, 2877, 2880,
       2882, 2883, 2885, 2886, 2887, 2890, 2891, 2894, 2897, 2898, 2899,
       2900, 2901, 2902, 2903, 2904, 2905, 2906, 2908, 3317, 4364, 4425,
       4427, 5011, 5013, 5014, 5015, 5016, 5017, 5018, 5020, 5021, 5022,
       5023, 5024, 5027, 5028, 5029, 5031, 5032, 5033, 5034, 5038, 5039,
       5040, 5070, 5071, 5075, 5076, 5078, 5080, 5081, 5083, 5084, 5085,
       5087, 5092, 5094, 5095, 5096, 5099, 5157, 5159, 5161, 5162, 5163,
       5164, 5165, 5166, 5167, 5168, 5169, 5170, 5174, 5176, 5177, 5179,
       5181, 5182, 5184, 5187, 5189, 5190, 5191, 51

In [264]:
# Build the directed peer-nomination network. Nodes are participants; an edge
# child -> alter carries the question-weighted nomination score, normalised by
# the sum of all question weights.
peer_nominations_network = nx.DiGraph()

# Question weights: maps each nomination question (Variable) to its weight.
# The context manager closes the file handle as soon as the JSON is parsed.
with open('../input/connections_gen.json') as formula_file:
    formula = json.load(formula_file)

# Sum of all question weights, used as the normalisation constant.
max_score = sum(formula.values())

# Accumulator {child: {alter: summed question weights}} with an entry for every participant.
connections_dict = {child: {} for child in selected_participants}

# Create network nodes from the unique child ids, class by class, so children
# without any edge are still part of the graph.
for cl in df_nom3['Class'].unique():
    children = df_nom3[df_nom3.Class.isin([cl])].Child.unique()
    peer_nominations_network.add_nodes_from(children)

# Each (child, alter, question) combination counts once, even if it occurs on several
# rows (e.g. repeated across waves). drop_duplicates and a set lookup replace the
# previous "tuple not in list" check, which rescanned a growing list for every row.
participant_ids = set(selected_participants)
unique_nominations = df_nom4[['Child', 'Alter', 'Variable']].drop_duplicates()
for child, alter, var in unique_nominations.itertuples(index=False):
    # Only count nominations of children who are selected participants.
    if alter in participant_ids:
        connections_dict[child][alter] = connections_dict[child].get(alter, 0) + formula[var]

# Make a dataframe and normalize the values for the edges.
# Pairs without any nomination become 0 and are skipped when edges are added below.
connections_df = pd.DataFrame(connections_dict).fillna(0)/max_score
connections_dict = connections_df.to_dict()

# Add one weighted edge per (child, alter) pair with a positive score and keep
# the same information as a flat edge list.
edge_records = []
for child, alters in connections_dict.items():
    for alt, weight in alters.items():
        if weight > 0:
            peer_nominations_network.add_edge(child, alt, weight=weight)
            edge_records.append([child, alt, weight])

df_connections_nomination = pd.DataFrame(edge_records, columns = ["ChildID", "AlterID", "Weight"])

# Attach the school class to every node.
class_df = df_nom3[['Child', 'Class']].drop_duplicates()
class_df.index = class_df['Child']
class_dict = class_df['Class'].to_dict()

nx.set_node_attributes(peer_nominations_network, class_dict, 'Class')

# Displayed value: number of outgoing edges per child.
pd.DataFrame(peer_nominations_network.out_degree(), columns = ['ChildID', "Out-degree"])

#peer_nominations_network.number_of_nodes()

#df_connections_nomination

OutDegreeView({1605: 18, 1606: 13, 1607: 8, 1608: 18, 1609: 7, 1610: 13, 1611: 10, 1612: 8, 1614: 5, 1615: 3, 1617: 15, 1620: 6, 1623: 5, 1625: 8, 1626: 16, 1627: 6, 1630: 16, 1631: 3, 1632: 6, 1633: 5, 1636: 16, 1637: 5, 1639: 8, 1640: 0, 1641: 7, 1642: 3, 1645: 4, 1651: 5, 1653: 5, 1654: 12, 1656: 6, 1717: 5, 1719: 4, 1723: 5, 1733: 6, 1736: 11, 4364: 7, 2262: 11, 2263: 8, 2266: 6, 2267: 10, 2268: 4, 2270: 10, 2271: 18, 2275: 10, 2276: 17, 2277: 5, 2278: 15, 2280: 8, 2281: 9, 2282: 5, 2283: 8, 3317: 1, 4425: 11, 4427: 2, 6030: 13, 2265: 7, 5972: 5, 5973: 4, 5975: 8, 5979: 7, 5980: 14, 5981: 10, 5982: 13, 5983: 2, 5984: 7, 5985: 18, 5988: 6, 5991: 3, 5992: 11, 5993: 9, 5994: 5, 5995: 18, 5996: 18, 5997: 7, 2858: 14, 2859: 10, 2860: 14, 2862: 13, 2863: 13, 2865: 9, 2866: 6, 2868: 9, 2869: 4, 2872: 16, 2873: 10, 2875: 10, 2877: 6, 2880: 13, 2882: 14, 2883: 16, 2885: 4, 2886: 4, 2887: 11, 2890: 5, 2891: 15, 2894: 6, 2897: 4, 2898: 11, 2899: 3, 2900: 15, 2901: 14, 2902: 2, 2903: 4, 2904: 

# Create Online Communication Network

In [253]:
# Derive the recipient's child ID ("Alter") from RecipientChild: the value is turned
# into a string and its last six characters are parsed back into a float.
# NOTE(review): this relies on how the value is formatted by str() (e.g. a trailing
# ".0" for float columns) — confirm the ID layout before changing the column dtype.
df_com3['Alter'] = [float(str(recipient)[-6:]) for recipient in df_com3['RecipientChild']]
df_com3[['Class', 'Child', 'Participant', 'RecipientChild', 'Alter']]


# Keep a message only when its recipient (Alter) is a participant (Child) in the
# sender's class; classes are processed separately and stacked afterwards.
message_frames_by_class = []
for cl in df_com3['Class'].unique():
    rows_of_class = df_com3.loc[df_com3['Class'].isin([cl])]
    classmates = rows_of_class['Child'].unique()
    message_frames_by_class.append(rows_of_class.loc[rows_of_class['Alter'].isin(classmates)])

df_com4 = pd.concat(message_frames_by_class)
# Displayed value: the participant IDs present in the unfiltered frame.
df_com3.Child.unique()


array([5863., 5864., 5865., 5867., 5868., 5869., 5871., 5872., 5874.,
       5875., 5878., 5879., 5880., 5882., 5885., 5886., 5887., 5893.,
       5894., 5897., 5898., 5901., 5905., 5906., 5907., 5911., 5914.,
       5915., 5916., 5918., 5919., 5920., 5922., 2262., 2263., 2266.,
       2267., 2268., 2270., 2271., 2275., 2276., 2277., 2278., 2280.,
       2281., 2282., 2283., 3317., 4425., 4427., 6030., 2265., 5972.,
       5973., 5975., 5979., 5980., 5981., 5982., 5983., 5984., 5985.,
       5988., 5991., 5992., 5993., 5994., 5995., 5996., 5997., 5976.,
       5999., 6001., 6002., 6003., 6005., 6006., 6007., 6010., 6011.,
       6012., 6013., 6015., 6016., 6017., 6019., 6020., 6021., 6023.,
       6024., 6025., 6026., 6028., 6029., 2858., 2859., 2860., 2862.,
       2863., 2865., 2866., 2868., 2869., 2872., 2873., 2875., 2877.,
       2880., 2882., 2883., 2885., 5945., 5946., 5949., 5950., 5951.,
       5952., 5954., 5955., 5956., 5957., 5958., 5959., 5960., 5961.,
       5962., 5964.,

In [254]:
# Create Network: directed graph with an edge sender -> recipient for every
# in-class (Child, Alter) pair that exchanged at least one message.

online_communication_network = nx.DiGraph()

edge_records = []
# create the connections - edges
for cl in df_com4['Class'].unique():
    # in-class messages of this class
    df_temp = df_com4[df_com4.Class.isin([cl])]

    # Every participant of the class becomes a node, including those without in-class messages.
    children = df_com3[df_com3.Class.isin([cl])].Child.unique()
    online_communication_network.add_nodes_from(children)

    # Number of messages sent by Participant to Recipient Child (non-null UMIDs per pair)
    df_temp = df_temp.groupby(["Child","Alter"])["UMID"].count().reset_index(name='NumberMessages')

    # Weight: number of messages of a pair divided by the LARGEST pair count in the class,
    # so the most active pair of each class has weight 1.
    df_temp['Weight'] = df_temp['NumberMessages']/df_temp['NumberMessages'].max()

    # Columns are read by name; positional access such as row[0] on a labelled
    # Series is deprecated in recent pandas versions.
    for child, alter, weight in df_temp[['Child', 'Alter', 'Weight']].itertuples(index=False):
        online_communication_network.add_edge(child, alter, weight=weight)
        edge_records.append([child, alter, weight])

# class per participant
class_df = df_com3[['Child', 'Class']].drop_duplicates()
class_df.index = class_df['Child']
class_dict = class_df['Class'].to_dict()

nx.set_node_attributes(online_communication_network, class_dict, 'Class')

df_connections_communication = pd.DataFrame(edge_records, columns = ["ChildID", "AlterID", "Weight"])

# Displayed value: number of participants in the network.
online_communication_network.number_of_nodes()

408

# Peer-nomination Descriptives

In [278]:
# Number of nominations given per child: counts the rows of df_nom4 (non-null
# Variable values) within each (Class, Child) group.
df_number_nominations_given = df_nom4.groupby(['Class','Child'])['Variable'].count().reset_index(name="NumNominations")
# Summary of the per-child counts. NOTE: not the last expression of the cell,
# so the notebook evaluates it but does not display it.
df_number_nominations_given['NumNominations'].describe()

# Displayed value: the filtered nomination frame itself.
df_nom4

Unnamed: 0,School,Class,Child,Gender,Age,PI,Wave_x,Variable,Alter,Wave_y,Steps,Steps_scaled,FAS_scaled
0,29,81,1605,1,14,0,5,GEN_Advice,1606,5,6744.4,0.67444,0.583333
2,29,81,1605,1,14,0,5,GEN_Advice,1617,5,6744.4,0.67444,0.583333
4,29,81,1605,1,14,0,5,GEN_Leader,1606,5,6744.4,0.67444,0.583333
5,29,81,1605,1,14,0,5,GEN_Leader,1607,5,6744.4,0.67444,0.583333
6,29,81,1605,1,14,0,5,GEN_Leader,1608,5,6744.4,0.67444,0.583333
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8259,58,303,6090,0,11,0,5,GEN_Social_Facilitation,6077,5,10386.8,1.03868,0.583333
8260,58,303,6090,0,11,0,5,GEN_Social_Facilitation,6078,5,10386.8,1.03868,0.583333
8262,58,303,6090,0,11,0,5,GEN_Social_Facilitation,6085,5,10386.8,1.03868,0.583333
8263,58,303,6090,0,11,0,5,GEN_Social_Facilitation,6086,5,10386.8,1.03868,0.583333


In [267]:
# Peer-nominations by class: sum of the per-child nomination counts within each class.
df_number_nominations_given.groupby('Class')['NumNominations'].sum().reset_index(name="TotalNominations")



Unnamed: 0,Class,TotalNominations
0,78,148
1,81,288
2,100,297
3,124,331
4,125,207
5,256,270
6,258,156
7,261,242
8,262,146
9,263,276


In [257]:
# Total peer-nominations: sum of the per-child nomination counts over all classes.
df_number_nominations_given['NumNominations'].sum()

8265

In [161]:
# Mean number of connections per class.
# Step 1 collapses the data to one row per distinct (Class, Child, Alter) triple, so an
# alter nominated on several questions counts once. Step 2 counts those alters per child.
# NOTE(review): this is built from df_nom3, i.e. before the in-class alter filter that
# produces df_nom4 — confirm that out-of-class alters are meant to be included here.
#df_number_nominations_given.groupby('Class')['NumNominations'].describe()
df_nom_connections = df_nom3.groupby(['Class', 'Child', 'Alter']).size().reset_index(name="Size")
df_nom_connections2 = df_nom_connections.groupby(['Class', 'Child']).size().reset_index(name="NumConnections")
df_nom_connections2.groupby('Class')['NumConnections'].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
78,17.0,6.529412,3.023195,3.0,5.0,5.0,7.0,15.0
81,19.0,9.684211,5.044828,3.0,6.0,8.0,14.0,18.0
100,19.0,9.0,4.654747,1.0,5.5,9.0,11.0,18.0
124,17.0,10.647059,3.920159,4.0,9.0,10.0,14.0,16.0
125,16.0,8.25,4.464676,2.0,4.0,7.5,11.25,15.0
256,22.0,7.0,4.070802,1.0,4.25,6.0,8.0,21.0
258,15.0,6.066667,2.685056,1.0,4.0,7.0,8.0,9.0
261,17.0,8.058824,4.892221,1.0,4.0,9.0,10.0,16.0
262,15.0,5.533333,1.45733,3.0,5.0,5.0,6.0,9.0
263,20.0,8.15,4.05586,3.0,5.75,7.5,10.25,19.0


In [162]:
# Number of connections per child, summarised over all classes pooled together.
df_nom_connections2['NumConnections'].describe()


count    405.000000
mean       9.249383
std        5.049216
min        1.000000
25%        5.000000
50%        8.000000
75%       12.000000
max       24.000000
Name: NumConnections, dtype: float64

In [273]:
# Weight of connections: split the peer-nomination network into one subgraph per
# school class and collect participant-level and class-level descriptives.

class_list = input_args['classes']

# Node IDs grouped by class; nodes whose class is not in class_list are ignored.
class_dictionary = {c: [] for c in class_list}
for node, key in peer_nominations_network.nodes.data('Class'):
    if key in class_dictionary:
        class_dictionary[int(key)].append(node)

# One independent copy of the induced subgraph per class, tagged with its class ID.
list_subgraphs_nom = []
for c in class_list:
    subgraph = peer_nominations_network.subgraph(class_dictionary[c]).copy()
    subgraph.graph['Class']=c
    list_subgraphs_nom.append(subgraph)

population_list = []
node_data_list = []
for subgraph in list_subgraphs_nom:
    # Centralities are computed within the class subgraph only.
    dict_in_degree = nx.in_degree_centrality(subgraph)
    dict_closeness = nx.closeness_centrality(subgraph)
    dict_betweenness = nx.betweenness_centrality(subgraph)

    for node, attributes in subgraph.nodes(data=True):
        # Average weight over the node's outgoing edges (0 when it has none).
        out_weights = [weight for _, _, weight in subgraph.edges(node, data='weight')]
        num_edges = len(out_weights)
        avg_weight = sum(out_weights)/num_edges if num_edges > 0 else 0

        # participant-level data
        node_data_list.append([node, attributes['Class'], num_edges, dict_in_degree[node], dict_closeness[node], dict_betweenness[node], avg_weight])

    # Population-level data. The class ID comes from the subgraph itself rather than
    # from the last node visited by the inner loop: for a class without nodes that
    # loop variable would be stale (previous class) or undefined.
    population_list.append([subgraph.graph['Class'], subgraph.number_of_edges(), round(nx.density(subgraph),2)])

# create the dataframes
df_class_nom = pd.DataFrame(population_list, columns = ["SchoolClassID", "NumberConnections", "Density"])

df_participant_nom = pd.DataFrame(node_data_list, columns = ["ParticipantID","SchoolClassID", "num_edges", "InDegree", "Closeness", "Betweenness", "Average_Weight"])


In [277]:

# Outgoing edges summed per class. NOTE: not the last expression of the cell,
# so the notebook evaluates it but does not display it.
df_participant_nom.groupby('SchoolClassID')['num_edges'].sum()

# Displayed value: total number of outgoing edges over all participants.
df_participant_nom['num_edges'].sum()



3760

In [271]:
# Weight of connection by class: summary of each participant's average
# outgoing-edge weight, grouped by school class.
df_participant_nom.groupby('SchoolClassID')['Average_Weight'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
SchoolClassID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
78,18.0,0.31007,0.095897,0.0,0.287202,0.3,0.3875,0.416667
81,19.0,0.383636,0.06192,0.25,0.346875,0.392857,0.416667,0.513889
100,19.0,0.415977,0.106049,0.25,0.337121,0.375,0.509615,0.625
124,17.0,0.448661,0.090836,0.285714,0.384615,0.45,0.5,0.625
125,16.0,0.377755,0.076806,0.25,0.320617,0.375,0.412202,0.566667
256,22.0,0.439558,0.110731,0.25,0.358766,0.432292,0.51875,0.666667
258,16.0,0.357999,0.128956,0.0,0.334201,0.366071,0.411111,0.583333
261,17.0,0.403617,0.130615,0.25,0.3125,0.375,0.454545,0.765625
262,15.0,0.434021,0.12202,0.25,0.3625,0.4,0.506944,0.708333
263,20.0,0.434598,0.131843,0.25,0.331731,0.420833,0.476562,0.729167


In [270]:
# Weight of connection (total): summary of the participants' average
# outgoing-edge weight over all classes pooled together.
df_participant_nom['Average_Weight'].describe()


count    408.000000
mean       0.398581
std        0.108919
min        0.000000
25%        0.326923
50%        0.375000
75%        0.450781
max        0.765625
Name: Average_Weight, dtype: float64

In [166]:
# Density by class. df_class_nom is built with one row per class subgraph, so a
# class that occurs once in the class list shows count 1 and an undefined (NaN) std.
df_class_nom.groupby('SchoolClassID')['Density'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
SchoolClassID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
78,1.0,0.41,,0.41,0.41,0.41,0.41,0.41
81,1.0,0.54,,0.54,0.54,0.54,0.54,0.54
100,1.0,0.5,,0.5,0.5,0.5,0.5,0.5
124,1.0,0.67,,0.67,0.67,0.67,0.67,0.67
125,1.0,0.55,,0.55,0.55,0.55,0.55,0.55
256,1.0,0.33,,0.33,0.33,0.33,0.33,0.33
258,1.0,0.43,,0.43,0.43,0.43,0.43,0.43
261,1.0,0.5,,0.5,0.5,0.5,0.5,0.5
262,1.0,0.4,,0.4,0.4,0.4,0.4,0.4
263,1.0,0.43,,0.43,0.43,0.43,0.43,0.43


In [167]:
# Density (overall): summary of the per-class densities across all class subgraphs.
df_class_nom['Density'].describe()

count    21.000000
mean      0.493333
std       0.086967
min       0.330000
25%       0.430000
50%       0.500000
75%       0.550000
max       0.670000
Name: Density, dtype: float64

In [168]:
# Centrality measures by class. Several columns must be selected with a list
# (double brackets); the former tuple form ['a', 'b', 'c'] on a groupby was
# deprecated and no longer works in pandas 2.0.
df_participant_nom.groupby('SchoolClassID')[['InDegree', 'Closeness', 'Betweenness']].describe()

  


Unnamed: 0_level_0,InDegree,InDegree,InDegree,InDegree,InDegree,InDegree,InDegree,InDegree,Closeness,Closeness,Closeness,Closeness,Closeness,Betweenness,Betweenness,Betweenness,Betweenness,Betweenness,Betweenness,Betweenness,Betweenness
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
SchoolClassID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
78,17.0,0.408088,0.110745,0.25,0.375,0.375,0.4375,0.625,17.0,0.611954,...,0.64,0.727273,17.0,0.043137,0.069128,0.001058,0.013204,0.023006,0.039018,0.300366
81,19.0,0.538012,0.108064,0.388889,0.444444,0.555556,0.638889,0.722222,19.0,0.685283,...,0.735,0.782609,19.0,0.02752,0.024832,0.002682,0.008896,0.01481,0.043487,0.080444
100,19.0,0.5,0.159302,0.166667,0.416667,0.5,0.611111,0.722222,19.0,0.647174,...,0.679487,0.782609,19.0,0.033368,0.045729,0.0,0.006401,0.020732,0.037771,0.175
124,17.0,0.665441,0.192563,0.125,0.625,0.6875,0.8125,0.875,17.0,0.762336,...,0.842105,0.888889,17.0,0.022304,0.02482,0.0,0.004282,0.014606,0.031238,0.081713
125,16.0,0.55,0.140897,0.333333,0.466667,0.533333,0.6,0.866667,16.0,0.693391,...,0.714286,0.882353,16.0,0.032738,0.038255,0.002937,0.005813,0.018475,0.044473,0.138764
256,22.0,0.333333,0.08939,0.142857,0.285714,0.333333,0.380952,0.47619,22.0,0.516636,...,0.567568,0.65625,22.0,0.049134,0.063311,0.0,0.004663,0.008869,0.096976,0.169708
258,15.0,0.433333,0.239147,0.142857,0.142857,0.571429,0.642857,0.642857,15.0,0.43255,...,0.56,0.666667,15.0,0.046886,0.073029,0.0,0.000785,0.0045,0.0691,0.236264
261,17.0,0.503676,0.125916,0.25,0.4375,0.5,0.5625,0.75,17.0,0.596991,...,0.615385,0.761905,17.0,0.045833,0.073774,0.000521,0.007986,0.010243,0.0625,0.288681
262,15.0,0.395238,0.217437,0.142857,0.25,0.357143,0.5,0.928571,15.0,0.548941,...,0.666667,0.933333,15.0,0.076923,0.079108,0.0,0.013965,0.059524,0.116712,0.272253
263,20.0,0.428947,0.134754,0.210526,0.355263,0.421053,0.539474,0.684211,20.0,0.593646,...,0.638793,0.730769,20.0,0.039766,0.045039,0.0,0.007432,0.021273,0.054431,0.169087


In [169]:
# Centrality measures summarised over all participants, all classes pooled together.
df_participant_nom[['InDegree', 'Closeness', 'Betweenness']].describe()

Unnamed: 0,InDegree,Closeness,Betweenness
count,405.0,405.0,405.0
mean,0.492288,0.637592,0.033302
std,0.16352,0.112681,0.04472
min,0.1,0.163265,0.0
25%,0.388889,0.571429,0.006311
50%,0.5,0.642857,0.016353
75%,0.608696,0.705882,0.04053
max,1.0,1.0,0.300366


# Online Communication Descriptives

In [170]:
# Number of messages per participant: counts the non-null UMIDs of each
# (Class, Child) group in df_com3, then summarises those counts over all participants.

df_number_messages = df_com3.groupby(['Class', 'Child'])['UMID'].count().reset_index(name="NumMessages")
df_number_messages['NumMessages'].describe()


count    403.000000
mean      63.868486
std      104.088367
min        1.000000
25%        9.000000
50%       23.000000
75%       69.500000
max      786.000000
Name: NumMessages, dtype: float64

In [171]:
# Number of messages by class: sum of the per-participant message counts within each class.

df_number_messages.groupby(['Class'])['NumMessages'].sum().reset_index(name="TotalMessages")
#df_number_messages.groupby(['Class'])['NumMessages'].describe()


Unnamed: 0,Class,TotalMessages
0,78.0,221
1,81.0,143
2,100.0,569
3,124.0,356
4,125.0,561
5,256.0,259
6,258.0,168
7,261.0,448
8,262.0,263
9,263.0,1013


In [172]:
# Total number of messages: sum of the per-participant message counts over all classes.
df_number_messages['NumMessages'].sum()


25739

In [173]:
# Mean number of connections per class.
# Step 1 collapses the messages to one row per distinct (Class, Child, Alter) triple, so
# a recipient counts once however many messages were sent. Step 2 counts recipients per child.
# NOTE(review): this is built from df_com3, i.e. before the in-class recipient filter that
# produces df_com4 — confirm that out-of-class recipients are meant to be included here.

df_com_connections = df_com3.groupby(['Class', 'Child', 'Alter']).size().reset_index(name="Size")
df_com_connections2 = df_com_connections.groupby(['Class', 'Child']).size().reset_index(name="NumConnections")
df_com_connections2.groupby('Class')['NumConnections'].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
78.0,16.0,2.625,1.360147,1.0,1.75,2.5,3.25,5.0
81.0,18.0,3.944444,4.491459,1.0,1.25,2.0,5.0,18.0
100.0,19.0,4.842105,2.651271,1.0,3.0,5.0,7.0,10.0
124.0,17.0,4.470588,3.280961,1.0,2.0,4.0,6.0,15.0
125.0,16.0,4.5625,4.618351,1.0,1.75,2.5,5.25,15.0
256.0,21.0,3.095238,1.578124,1.0,2.0,3.0,5.0,6.0
258.0,15.0,2.2,1.612452,1.0,1.0,2.0,2.5,7.0
261.0,17.0,5.294118,3.584074,2.0,3.0,5.0,6.0,16.0
262.0,15.0,4.133333,3.020564,1.0,3.0,3.0,4.5,14.0
263.0,20.0,8.15,4.637093,2.0,6.0,7.0,9.25,19.0


In [174]:
# Number of connections per child, summarised over all classes pooled together.
df_com_connections2['NumConnections'].describe()

count    403.000000
mean       7.746898
std        6.017839
min        1.000000
25%        3.000000
50%        6.000000
75%       11.000000
max       24.000000
Name: NumConnections, dtype: float64

In [175]:
# Subgraphs: split the online-communication network into one subgraph per school
# class and collect participant-level and class-level descriptives.

class_list = input_args['classes']

# Node IDs grouped by class; nodes whose class is not in class_list are ignored.
class_dictionary = {c: [] for c in class_list}
for node, key in online_communication_network.nodes.data('Class'):
    if key in class_dictionary:
        class_dictionary[int(key)].append(node)

# One independent copy of the induced subgraph per class, tagged with its class ID.
list_subgraphs_com = []
for c in class_list:
    subgraph = online_communication_network.subgraph(class_dictionary[c]).copy()
    subgraph.graph['Class']=c
    list_subgraphs_com.append(subgraph)


population_list = []
node_data_list = []
for subgraph in list_subgraphs_com:

    # Centralities are computed within the class subgraph only.
    dict_in_degree = nx.in_degree_centrality(subgraph)
    dict_closeness = nx.closeness_centrality(subgraph)
    dict_betweenness = nx.betweenness_centrality(subgraph)

    for node, attributes in subgraph.nodes(data=True):

        # Average weight over the node's outgoing edges (0 when it has none).
        out_weights = [weight for _, _, weight in subgraph.edges(node, data='weight')]
        num_edges = len(out_weights)
        avg_weight = sum(out_weights)/num_edges if num_edges > 0 else 0

        # participant-level data
        node_data_list.append([node, attributes['Class'], dict_in_degree[node], dict_closeness[node], dict_betweenness[node], avg_weight])

    # Population-level data. The class ID comes from the subgraph itself rather than
    # from the last node visited by the inner loop: for a class without nodes that
    # loop variable would be stale (previous class or an earlier cell) or undefined.
    # The ID is therefore the class_list entry, not the node's 'Class' attribute.
    population_list.append([subgraph.graph['Class'], subgraph.number_of_edges(), round(nx.density(subgraph),2)])

# create the dataframes
df_class_com = pd.DataFrame(population_list, columns = ["SchoolClassID", "NumberConnections", "Density"])

df_participant_com = pd.DataFrame(node_data_list, columns = ["ParticipantID","SchoolClassID", "InDegree", "Closeness", "Betweenness", "Average_Weight"])



In [176]:
# Distribution of the participants' average connection weight within each class
(
    df_participant_com
    .groupby('SchoolClassID')['Average_Weight']
    .describe()
)


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
SchoolClassID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
78.0,16.0,0.161925,0.090437,0.034483,0.097701,0.146552,0.215517,0.317241
81.0,18.0,0.328641,0.195906,0.125,0.178125,0.3125,0.375,0.875
100.0,19.0,0.061455,0.054753,0.012658,0.021926,0.046414,0.081224,0.222423
124.0,17.0,0.073921,0.08085,0.013699,0.018265,0.041096,0.082192,0.30137
125.0,16.0,0.063141,0.072895,0.009709,0.01699,0.031715,0.075243,0.24164
256.0,21.0,0.253373,0.198257,0.0625,0.145833,0.1875,0.3375,0.875
258.0,15.0,0.182143,0.187514,0.0,0.052083,0.083333,0.282738,0.666667
261.0,17.0,0.103337,0.09575,0.025926,0.033333,0.044444,0.207407,0.311111
262.0,15.0,0.21162,0.203273,0.041667,0.111111,0.152778,0.192708,0.791667
263.0,20.0,0.074188,0.079257,0.014446,0.024579,0.05103,0.07718,0.303371


In [177]:
# Distribution of the participants' average connection weight, all classes pooled
df_participant_com.loc[:, 'Average_Weight'].describe()

count    403.000000
mean       0.104260
std        0.123823
min        0.000000
25%        0.026678
50%        0.057808
75%        0.135374
max        0.875000
Name: Average_Weight, dtype: float64

In [178]:
# Network density per class (df_class_com holds one row per class subgraph)
(
    df_class_com
    .groupby('SchoolClassID')['Density']
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
SchoolClassID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
78.0,1.0,0.17,,0.17,0.17,0.17,0.17,0.17
81.0,1.0,0.23,,0.23,0.23,0.23,0.23,0.23
100.0,1.0,0.27,,0.27,0.27,0.27,0.27,0.27
124.0,1.0,0.28,,0.28,0.28,0.28,0.28,0.28
125.0,1.0,0.3,,0.3,0.3,0.3,0.3,0.3
256.0,1.0,0.15,,0.15,0.15,0.15,0.15,0.15
258.0,1.0,0.15,,0.15,0.15,0.15,0.15,0.15
261.0,1.0,0.33,,0.33,0.33,0.33,0.33,0.33
262.0,1.0,0.3,,0.3,0.3,0.3,0.3,0.3
263.0,1.0,0.43,,0.43,0.43,0.43,0.43,0.43


In [179]:
# Distribution of the class-level network density across all classes
df_class_com.loc[:, 'Density'].describe()

count    21.000000
mean      0.387143
std       0.175332
min       0.150000
25%       0.280000
50%       0.350000
75%       0.470000
max       0.810000
Name: Density, dtype: float64

In [180]:
# Centrality measures by class
# Several columns are selected from the GroupBy with a list (double brackets);
# selecting them with a bare tuple of keys is deprecated and fails on newer pandas.
df_participant_com.groupby('SchoolClassID')[['InDegree', 'Closeness', 'Betweenness']].describe()


  


Unnamed: 0_level_0,InDegree,InDegree,InDegree,InDegree,InDegree,InDegree,InDegree,InDegree,Closeness,Closeness,Closeness,Closeness,Closeness,Betweenness,Betweenness,Betweenness,Betweenness,Betweenness,Betweenness,Betweenness,Betweenness
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
SchoolClassID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
78.0,16.0,0.170833,0.080623,0.0,0.133333,0.166667,0.2,0.333333,16.0,0.305029,...,0.387146,0.46875,16.0,0.097619,0.085589,0.0,0.01131,0.066667,0.1625,0.230952
81.0,18.0,0.22549,0.105323,0.058824,0.176471,0.176471,0.294118,0.411765,18.0,0.441789,...,0.485714,0.566667,18.0,0.081904,0.113597,0.0,0.0,0.002757,0.127298,0.370711
100.0,19.0,0.269006,0.118956,0.055556,0.166667,0.277778,0.333333,0.5,19.0,0.512841,...,0.571573,0.6,19.0,0.057792,0.05929,0.0,0.019643,0.037099,0.076354,0.205766
124.0,17.0,0.279412,0.132799,0.0625,0.1875,0.25,0.375,0.5625,17.0,0.501765,...,0.516129,0.666667,17.0,0.067892,0.125116,0.0,0.005556,0.03588,0.063079,0.520764
125.0,16.0,0.304167,0.137639,0.133333,0.183333,0.3,0.4,0.533333,16.0,0.533646,...,0.576923,0.652174,16.0,0.064583,0.098901,0.0,0.0,0.032738,0.06751,0.334218
256.0,21.0,0.154762,0.070542,0.05,0.1,0.15,0.2,0.25,21.0,0.349812,...,0.392157,0.444444,21.0,0.10188,0.123024,0.0,0.001316,0.036184,0.175877,0.363158
258.0,15.0,0.152381,0.120333,0.0,0.071429,0.142857,0.178571,0.5,15.0,0.262596,...,0.340336,0.525974,15.0,0.035165,0.075901,0.0,0.0,0.0,0.021978,0.285714
261.0,17.0,0.330882,0.12454,0.125,0.25,0.3125,0.4375,0.625,17.0,0.553905,...,0.592593,0.727273,17.0,0.054902,0.116188,0.0,0.003413,0.015833,0.052431,0.492599
262.0,15.0,0.295238,0.129099,0.142857,0.214286,0.214286,0.357143,0.571429,15.0,0.498218,...,0.538462,0.608696,15.0,0.079487,0.147824,0.0,0.013324,0.033516,0.077289,0.593407
263.0,20.0,0.428947,0.085848,0.263158,0.368421,0.421053,0.473684,0.631579,20.0,0.599897,...,0.633333,0.703704,20.0,0.037573,0.052842,0.002193,0.006684,0.022494,0.040581,0.195705


In [181]:
# Centrality measures, all classes pooled
df_participant_com.loc[:, ['InDegree', 'Closeness', 'Betweenness']].describe()

Unnamed: 0,InDegree,Closeness,Betweenness
count,403.0,403.0,403.0
mean,0.400909,0.57271,0.046881
std,0.211365,0.160373,0.078552
min,0.0,0.0,0.0
25%,0.242647,0.484848,0.003462
50%,0.388889,0.580645,0.01713
75%,0.55,0.671569,0.050439
max,1.0,1.0,0.593407


# Network Similarity (Peer-Nomination vs. Online Communication)

In [182]:
# Edge-level agreement between the peer-nomination graph (g1, treated as the
# ground truth) and the online communication graph (g2) of each class:
#     similarity = (TP + TN) / (TP + TN + FN + FP)
# NOTE(review): the two lists are paired by position, so both are assumed to be
# in the same class order - confirm against the cell that builds list_subgraphs_nom.
class_network_similarity = []
for g1, g2 in zip(list_subgraphs_nom, list_subgraphs_com):

    nom_edges = set(g1.edges())
    com_edges = set(g2.edges())

    # Number of ordered node pairs without self-loops among the nodes of g1
    max_edges = g1.number_of_nodes()**2 - g1.number_of_nodes()

    # Edges present in both graphs
    true_positive = nom_edges & com_edges
    # Edges found in the ground truth (g1) but not in the communication graph (g2)
    false_negative = nom_edges - com_edges
    # Edges found in the communication graph (g2) but not in the ground truth (g1)
    false_positive = com_edges - nom_edges
    # Possible edges that appear in neither graph.
    # NOTE(review): the original author was unsure whether this should instead be
    # 0 or be based on the ground-truth edges only - the definition is unchanged here.
    true_negative = max_edges - len(nom_edges | com_edges)

    correct = len(true_positive) + true_negative
    total = correct + len(false_negative) + len(false_positive)

    if total > 0:
        # No agreement at all is reported as 0; the previous -1 sentinel in the
        # numerator produced a negative similarity in that case.
        network_similarity = max(correct, 0) / total
    else:
        # No possible edges (e.g. fewer than two nodes): nothing can differ, so
        # the graphs count as identical (same value as before).
        network_similarity = 1.0

    class_network_similarity.append(network_similarity)

pd.DataFrame(class_network_similarity)

Unnamed: 0,0
0,0.669118
1,0.535088
2,0.652047
3,0.5625
4,0.4875
5,0.755411
6,0.719048
7,0.643382
8,0.747619
9,0.668421


In [183]:
# Unweighted mean of the per-class similarity scores
mean_network_similarity = sum(class_network_similarity) / len(class_network_similarity)
print("Mean:", mean_network_similarity)

Mean: 0.6472389470546789
