In [2]:
import pandas as pd
import numpy as np
import json
import networkx as nx
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [617]:
# Load the simulation settings. This stays best-effort (a failure is reported,
# not raised), but the file handle is now closed deterministically and the
# message distinguishes a missing file from an unreadable/invalid one.
# NOTE: if loading fails, `input_args` remains undefined and any later cell
# that reads it will raise NameError.
try:
    with open('../input/simulation.json') as simulation_file:
        input_args = json.load(simulation_file)
except FileNotFoundError as ex:
    print('simulation.json does not exist!')
    print(ex)
except (OSError, ValueError) as ex:
    # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
    print('simulation.json could not be read or parsed!')
    print(ex)

### Exploratory Analysis Movez Data

#### PAL Data

##### PAL per child

In [591]:
# Factor applied to the mean step count per child and wave.
# NOTE(review): presumably converts steps to a PAL value (the section is titled
# "PAL per child") — confirm the origin of this constant.
STEPS_TO_PAL_FACTOR = 0.000153

fitbit_new = pd.read_csv('../data/Fitbit_Imputation_FINAL.csv', sep=';', header=0)
# Select 'Steps' before aggregating: averaging the whole frame first is wasted
# work and raises in pandas >= 2.0 if the file has any non-numeric column.
steps_mean_wave_new = fitbit_new.groupby(['Child', 'Wave'])['Steps'].mean().reset_index()
steps_mean_wave_new.Steps = steps_mean_wave_new.Steps * STEPS_TO_PAL_FACTOR
# One row per child, one column per wave.
steps_mean_wave_new = steps_mean_wave_new.pivot(index='Child', columns='Wave')['Steps']

In [553]:
steps_mean_wave_new.describe()

Wave,1,2,3,4,5,6,7
count,926.0,928.0,927.0,929.0,1028.0,782.0,782.0
mean,1.4,1.53,1.59,1.44,1.34,1.61,1.44
std,0.53,0.58,0.6,0.57,0.53,0.65,0.6
min,0.19,0.15,0.15,0.15,0.15,0.2,0.15
25%,1.03,1.12,1.19,1.03,0.97,1.17,1.04
50%,1.36,1.5,1.51,1.4,1.29,1.54,1.41
75%,1.69,1.92,1.93,1.78,1.69,2.0,1.81
max,3.44,4.01,4.07,3.43,3.61,3.95,4.03


##### PAL per class

In [127]:
# Mean PAL per class and wave, reshaped to one row per class and one column per wave.
# NOTE(review): the "PAL per child" cell leaves `steps_mean_wave_new` pivoted to a
# Child x Wave table, which has no 'Class', 'Wave' or 'Steps' columns. This cell
# presumably ran against an earlier long-format frame that still carried 'Class'
# (its execution count is lower than the cell that builds the pivot) — confirm and
# rebuild that long-format frame explicitly so the notebook survives Restart & Run All.
class_waves_pal = steps_mean_wave_new.groupby(['Class','Wave']).mean()['Steps'].reset_index()
class_waves_pal = class_waves_pal.pivot(index='Class', columns='Wave')['Steps']
class_waves_pal

Wave,1,2,3,4,5,6,7
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
52,1.39,1.46,1.68,1.15,,,
53,1.48,2.34,2.27,3.32,,,
54,1.28,1.20,1.34,1.28,,,
55,1.77,2.10,1.47,1.41,,,
56,1.45,1.55,2.53,1.52,,,
...,...,...,...,...,...,...,...
305,,,,,1.30,2.00,1.45
306,,,,,1.32,1.73,1.56
307,,,,,1.15,1.28,1.39
308,,,,,1.73,1.54,1.32


In [139]:
class_waves_pal.describe()

Wave,1,2,3,4,5,6,7
count,94.0,94.0,94.0,83.0,60.0,44.0,44.0
mean,1.41,1.53,1.6,1.44,1.33,1.61,1.46
std,0.29,0.3,0.36,0.39,0.2,0.25,0.19
min,0.74,0.9,0.82,0.49,0.74,1.19,1.1
25%,1.25,1.34,1.38,1.23,1.21,1.46,1.32
50%,1.38,1.53,1.53,1.42,1.33,1.55,1.43
75%,1.53,1.73,1.77,1.59,1.48,1.78,1.56
max,2.73,2.64,2.74,3.32,1.73,2.12,2.01


In [128]:
# For each wave 1..7: how many classes have no PAL value
tuple(class_waves_pal[wave].isna().sum() for wave in range(1, 8))

(38, 38, 38, 49, 72, 88, 88)

##### Classes where all waves are collected (in terms of PAL) - only 15 classes out of 132

In [131]:
class_waves_pal[~class_waves_pal.isnull().any(axis=1)]

Wave,1,2,3,4,5,6,7
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
73,1.25,1.7,1.39,1.49,1.33,1.4,1.73
74,1.39,1.68,1.62,1.21,0.9,1.19,1.46
78,1.31,1.49,1.62,1.41,1.27,1.5,1.4
81,1.23,1.45,1.41,1.35,1.27,1.47,1.32
82,1.39,1.79,1.76,1.53,1.41,1.48,1.7
86,1.57,1.17,1.77,1.77,1.35,1.68,1.26
100,1.74,1.87,1.81,1.7,1.37,1.54,1.3
103,1.51,1.74,1.62,1.68,1.13,1.26,1.34
124,1.45,1.46,1.93,1.6,1.52,1.65,1.45
125,1.42,1.61,1.43,1.82,1.49,1.66,1.24


### Communication Networks Data

In [160]:
# Buzz messages of wave 4 (primary + secondary schools).
# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0;
# like append, it keeps each file's original row index.
df_comm1 = pd.concat([
    pd.read_csv('../data/Buzz_W4_primary-schools.csv', sep=';', header=0),
    pd.read_csv('../data/Buzz_W4_secondary-schools.csv', sep=';', header=0),
])
df_comm1['Wave'] = 4
# Buzz messages of wave 5 (primary + secondary schools).
df_comm2 = pd.concat([
    pd.read_csv('../data/Buzz_W5_primary-schools.csv', sep=';', header=0),
    pd.read_csv('../data/Buzz_W5_secondary-schools.csv', sep=';', header=0),
])
df_comm2['Wave'] = 5
df_comm = pd.concat([df_comm1, df_comm2])
# only the individually sent messages
df_comm = df_comm[df_comm.Type=="my"]
# Cast the identifier columns in one pass.
df_comm = df_comm.astype({'Child': 'int32', 'Class': 'int32', 'Participant': 'float64'})
df_comm.shape

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


(41973, 31)

In [161]:
df_comm.columns

Index(['Participant', 'School', 'Class', 'Child', 'Gender', 'Age', 'PI',
       'Date', 'Time', 'UMID', 'Format_text', 'Format_Media', 'Type',
       'RecipientChild', 'ClassReceive', 'Content', 'Media', 'ReactionInto',
       'Flagged', 'PostedWhileBanned', 'ShareSource', 'SharedVia',
       'Number_likes', 'Unnamed: 23', 'Wave', 'Unnamed: 24', 'Unnamed: 25',
       'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28', 'Unnamed: 29'],
      dtype='object')

In [162]:
# Drop the empty 'Unnamed: N' columns read from the CSV files.
# Selecting them by prefix (instead of a hard-coded list) keeps the cell
# idempotent: re-running it no longer raises KeyError for already-dropped columns.
unnamed_cols = [col for col in df_comm.columns if str(col).startswith('Unnamed: ')]
df_comm = df_comm.drop(columns=unnamed_cols)
df_comm.head(3)

Unnamed: 0,Participant,School,Class,Child,Gender,Age,PI,Date,Time,UMID,...,ClassReceive,Content,Media,ReactionInto,Flagged,PostedWhileBanned,ShareSource,SharedVia,Number_likes,Wave
0,3410302337.0,34.0,103,2337,0.0,10.0,7.0,15-3-2017,10:05,31296,...,1,Hoi,,,0.0,0.0,,,0.0,4
1,3410302337.0,34.0,103,2337,0.0,10.0,7.0,15-3-2017,10:06,31310,...,1,Hoi,,,0.0,0.0,,,0.0,4
10,3410302337.0,34.0,103,2337,0.0,10.0,7.0,15-3-2017,15:46,31673,...,1,Hoi,,,0.0,0.0,,,0.0,4


#### Per class data

In [166]:
# Number of messages per class (rows) and wave (columns); NaN where a class
# has no messages in that wave.
class_waves_messages = (
    df_comm.groupby(['Class','Wave'])['UMID']
    .count()
    .unstack('Wave')
)
class_waves_messages

Wave,4,5
Class,Unnamed: 1_level_1,Unnamed: 2_level_1
55,48.00,
65,22.00,
66,39.00,
67,32.00,
68,37.00,
...,...,...
305,,44.00
306,,140.00
307,,109.00
308,,1592.00


In [167]:
class_waves_messages.describe()

Wave,4,5
count,45.0,44.0
mean,206.58,742.66
std,373.66,1088.46
min,3.0,2.0
25%,26.0,67.25
50%,50.0,292.5
75%,213.0,801.5
max,1799.0,5301.0


In [170]:
class_waves_messages[4].isna().sum(),class_waves_messages[5].isna().sum() 

(29, 30)

In [171]:
# collected messages at both waves 4 and 5
class_waves_messages[~class_waves_messages.isnull().any(axis=1)]

Wave,4,5
Class,Unnamed: 1_level_1,Unnamed: 2_level_1
73,43.0,4.0
74,319.0,2.0
78,1799.0,241.0
81,800.0,159.0
82,165.0,33.0
86,50.0,62.0
100,770.0,581.0
103,280.0,189.0
124,391.0,363.0
125,1600.0,565.0


#### Per Child Data

In [173]:
# Number of messages per child (rows) and wave (columns); NaN where a child
# sent no messages in that wave.
child_waves_messages = (
    df_comm.groupby(['Child','Wave'])['UMID']
    .count()
    .unstack('Wave')
)
child_waves_messages

Wave,4,5
Child,Unnamed: 1_level_1,Unnamed: 2_level_1
908,2.00,
949,23.00,
992,4.00,
997,15.00,
998,2.00,
...,...,...
6178,,11.00
6179,,2.00
6180,,8.00
6181,,5.00


In [177]:
child_waves_messages.describe()

Wave,4,5
count,446.0,617.0
mean,20.84,52.96
std,38.16,96.56
min,1.0,1.0
25%,3.0,5.0
50%,7.0,17.0
75%,20.0,50.0
max,331.0,789.0


In [183]:
child_waves_messages[4].isna().sum(),child_waves_messages[5].isna().sum() 

(474, 303)

In [179]:
child_waves_messages[~child_waves_messages.isnull().any(axis=1)]

Wave,4,5
Child,Unnamed: 1_level_1,Unnamed: 2_level_1
1409,14.00,4.00
1605,131.00,5.00
1606,9.00,2.00
1607,7.00,10.00
1608,14.00,16.00
...,...,...
4877,20.00,3.00
4880,4.00,4.00
4881,21.00,4.00
4884,10.00,1.00


### Background Information Data

In [182]:
df_pp = pd.read_csv('../data/Participant_info_Final.csv', sep=';', header=0)
df_pp.head(3)

Unnamed: 0,School,Level,Class_Y1,Class_Y2,Class_Y3,Child,Sex,Sex_f,Age_W1,Age_W5,W1,W2,W3,W4,W5,W6,W7,Y1_Sample,Y3_Sample
0,22,Secondary,52.0,52.0,,902,1,Female,13.0,,1,1,0,0,0,0,0,1,0
1,22,Secondary,52.0,52.0,,904,1,Female,12.0,,1,1,0,0,0,0,0,1,0
2,22,Secondary,52.0,52.0,,907,1,Female,12.0,,1,1,0,0,0,0,0,1,0


In [193]:
df_pp.shape

(1484, 19)

In [196]:
df_pp['Y1_Sample'].sum(),df_pp['Y3_Sample'].sum()

(951, 1032)

In [192]:
# Sum of the participation flag W1..W7, one entry per wave
tuple(df_pp[f'W{wave}'].sum() for wave in range(1, 8))

(843, 901, 868, 744, 1017, 755, 745)

In [214]:
# Keep only the participants whose flag equals 1 in every wave W1..W7
df_all_waves = df_pp[df_pp[[f'W{wave}' for wave in range(1, 8)]].eq(1).all(axis=1)]

In [215]:
df_all_waves.groupby(['Class_Y1','Class_Y2','Class_Y3'])['Child'].count()

Class_Y1  Class_Y2  Class_Y3
74.00     74.00     74.00        8
78.00     78.00     78.00       19
79.00     79.00     78.00        5
81.00     81.00     81.00       27
82.00     82.00     82.00        6
86.00     86.00     86.00        8
100.00    100.00    100.00      11
                    300.00       1
103.00    103.00    103.00      13
124.00    124.00    124.00      15
125.00    125.00    125.00      11
131.00    131.00    131.00      10
133.00    133.00    133.00      10
135.00    135.00    135.00      17
141.00    141.00    141.00      13
Name: Child, dtype: int64

In [206]:
df_all_waves.groupby(['Class_Y2'])['Child'].count()

Class_Y2
74.00      8
78.00     19
79.00      5
81.00     27
82.00      6
86.00      8
100.00    12
103.00    13
124.00    15
125.00    11
131.00    10
133.00    10
135.00    17
141.00    13
Name: Child, dtype: int64

In [211]:
# pd.set_option('display.max_rows', 200)

# df_pp.groupby(['Class_Y1','Class_Y2','Class_Y3'])['Child'].count().reset_index()

## Participants per class - Wave5 nominations vs communication data

In [493]:
df_pp = pd.read_csv('../data/Participant_info_Final.csv', sep=';', header=0)
df_pp = df_pp[df_pp.W5 == 1]
df_pp.head(3)

Unnamed: 0,School,Level,Class_Y1,Class_Y2,Class_Y3,Child,Sex,Sex_f,Age_W1,Age_W5,W1,W2,W3,W4,W5,W6,W7,Y1_Sample,Y3_Sample
32,23,Secondary,59.0,59.0,231.0,1037,1,Female,12.0,14.0,1,1,1,1,1,0,0,1,1
34,23,Secondary,59.0,59.0,311.0,1046,1,Female,13.0,15.0,1,1,1,1,1,0,0,1,1
35,23,Secondary,59.0,59.0,311.0,1047,0,Male,13.0,15.0,1,1,1,1,1,0,0,1,1


In [494]:
num_part = df_pp.groupby('Class_Y3')['Child'].nunique().reset_index(name="PP_NumChild")
num_part.head(3)

Unnamed: 0,Class_Y3,PP_NumChild
0,65.0,9
1,66.0,12
2,67.0,17


##### Nomination Data

In [600]:
nomm_data = '../data/W5_Sociometric_long.csv'
df_nomm = pd.read_csv(nomm_data, sep=';', header=0)
df_nomm.head(3)

Unnamed: 0,School,Class,Child,Gender,Age,PI,Wave,Variable,Alter
0,26,73,1401,1,12,0,5,GEN_Advice,1397
1,26,73,1401,1,12,0,5,GEN_Advice,1398
2,26,73,1401,1,12,0,5,GEN_Advice,1399


In [496]:
df_nomm['Child'].nunique(),df_nomm['Alter'].nunique()

(736, 1040)

In [497]:
# number of children nominating someone per class
df_nom_part = df_nomm.groupby('Class')['Child'].nunique().reset_index(name="NumChildren")

In [498]:
df_number_nominations = df_nomm.groupby(['Class','Child'])['Variable'].count().reset_index(name="NumNominations")
df_number_nominations.head(3)

Unnamed: 0,Class,Child,NumNominations
0,73,1401,28
1,73,1402,11
2,73,1406,5


In [499]:
df_number_nominations.groupby('Class')['NumNominations'].sum().reset_index(name="NominationsPerClass").head(3)

Unnamed: 0,Class,NominationsPerClass
0,73,88
1,74,200
2,78,549


##### Communication data

In [500]:
# Load data
primary_school = '../data/Buzz_W5_primary-schools.csv'
secondary_school = '../data/Buzz_W5_secondary-schools.csv'
# Preprocess social buzz dataframe
# pd.concat replaces DataFrame.append (deprecated, removed in pandas 2.0).
df_comm = pd.concat([
    pd.read_csv(primary_school, sep=';', header=0),
    pd.read_csv(secondary_school, sep=';', header=0),
])
# only the individually sent messages
df_comm = df_comm[df_comm.Type=="my"]
# Fixed typo: the cast used to be assigned to `f_comm`, so `df_comm.Participant`
# silently kept its original dtype.
df_comm = df_comm.astype({'Participant': 'float64'})
df_comm.head(3)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,Participant,School,Class,Child,Gender,Age,PI,Date,Time,UMID,...,ShareSource,SharedVia,Number_likes,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29
0,3410302337,34.0,103.0,2337.0,0.0,11.0,7.0,9-3-2018,12:23,102371,...,,,0,,,,,,,
15,3410302337,34.0,103.0,2337.0,0.0,11.0,7.0,10-3-2018,14:50,107068,...,,,0,,,,,,,
16,3410302337,34.0,103.0,2337.0,0.0,11.0,7.0,10-3-2018,14:50,107069,...,,,0,,,,,,,


In [501]:
df_comm_child_class = df_comm.groupby('Class')['Child'].nunique().reset_index(name="NumChildren")
df_comm_child_class.head(3)

Unnamed: 0,Class,NumChildren
0,73.0,1
1,74.0,1
2,78.0,18


#### gather the information from the three different sources in one dataframe

In [502]:
num_part['Nomination_NumChild'] = num_part['Class_Y3'].map(df_nom_part.set_index('Class')['NumChildren'])
num_part['Communication_NumChild'] = num_part['Class_Y3'].map(df_comm_child_class.set_index('Class')['NumChildren'])
num_part.head(3)

Unnamed: 0,Class_Y3,PP_NumChild,Nomination_NumChild,Communication_NumChild
0,65.0,9,,
1,66.0,12,,
2,67.0,17,,


In [503]:
df_class_selection = num_part[num_part.Communication_NumChild>14]
df_class_selection

Unnamed: 0,Class_Y3,PP_NumChild,Nomination_NumChild,Communication_NumChild
11,78.0,29,29.0,18.0
12,81.0,28,26.0,19.0
15,100.0,20,20.0,19.0
17,124.0,19,19.0,17.0
18,125.0,17,17.0,16.0
34,256.0,24,24.0,24.0
35,258.0,25,25.0,16.0
37,261.0,17,17.0,17.0
38,262.0,15,15.0,15.0
39,263.0,20,20.0,20.0


In [504]:
df_class_selection['Communication_NumChild'].sum()

411.0

In [522]:
list(df_class_selection['Class_Y3'])

[78.0,
 81.0,
 100.0,
 124.0,
 125.0,
 256.0,
 258.0,
 261.0,
 262.0,
 263.0,
 272.0,
 273.0,
 292.0,
 296.0,
 297.0,
 298.0,
 299.0,
 300.0,
 301.0,
 302.0,
 303.0]

##### Removing non-repeated participants from the dataframes

In [505]:
classes = list(df_class_selection["Class_Y3"])

In [506]:
# subset data based on these classses
df_comm = df_comm[df_comm.Class.isin(classes)]
df_nomm = df_nomm[df_nomm.Class.isin(classes)]

In [507]:
# Per selected class, keep only the children that occur in BOTH the
# communication data and the nomination data.
for cl in classes:
    com_set = set(df_comm.loc[df_comm.Class == cl, 'Child'].unique())
    nom_set = set(df_nomm.loc[df_nomm.Class == cl, 'Child'].unique())

    # children present in one source only
    dif_cld_nom_com = list(nom_set - com_set)
    dif_cld_com_nom = list(com_set - nom_set)

    # remove the non-repeated participants (note: filtered by Child id across
    # the whole frame, not just within the current class)
    if dif_cld_nom_com:
        df_nomm = df_nomm[~df_nomm.Child.isin(dif_cld_nom_com)]

    if dif_cld_com_nom:
        df_comm = df_comm[~df_comm.Child.isin(dif_cld_com_nom)]

In [508]:
set(df_nomm.Child.unique()).difference(set(df_comm.Child.unique())), set(df_comm.Child.unique()).difference(set(df_nomm.Child.unique())),df_comm.Child.nunique()

(set(), set(), 408)

In [509]:
lst_participants_id = df_nomm.Child.unique()

### Postal codes livability as new environment score? 

In [329]:
# Address: postcode (PC6) -> neighbourhood code (Buurt2020) lookup table.
# `sep` is passed by keyword: positional arguments after the path are
# keyword-only in pandas 2.0, and the other cells already use `sep=';'`.
df_postcode = pd.read_csv('../data/postcode/pc6hnr20200801_gwb.csv', sep=';')
# .copy() so that columns added later are set on an independent frame.
df_postcode = df_postcode[['PC6','Buurt2020']].copy()
df_postcode.head(2)

Unnamed: 0,PC6,Buurt2020
0,1011AB,3630400
1,1011AB,3630400


In [330]:
# Liveability score per neighbourhood. CODE is normalised to a nullable integer:
# drop the first two characters, strip leading zeros, then parse
# (unparseable codes become <NA>).
df_liveability_score = pd.read_excel('../data/postcode/Score_Buurt.xlsx')
df_liveability_score['CODE'] = pd.to_numeric(
    df_liveability_score['CODE'].str[2:].str.lstrip("0"),
    errors='coerce',
).astype('Int64')

In [331]:
df_postcode['livability'] = df_postcode['Buurt2020'].map(df_liveability_score.set_index('CODE')['KL18'])
df_postcode.head()

Unnamed: 0,PC6,Buurt2020,livability
0,1011AB,3630400,9.0
1,1011AB,3630400,9.0
2,1011AB,3630400,9.0
3,1011AB,3630400,9.0
4,1011AB,3630400,9.0


In [332]:
df_postcode['livability'].count(),df_postcode['livability'].isna().sum()

(6993336, 775318)

In [333]:
df_postcode = df_postcode.groupby(['PC6','livability'])['Buurt2020'].count().reset_index(name="Count")

In [334]:
df_postcode = df_postcode[['PC6','livability']]
df_postcode.head(3)

Unnamed: 0,PC6,livability
0,1011AB,9.0
1,1011AC,9.0
2,1011AD,9.0


In [335]:
df_postcode.shape,df_postcode['livability'].count(),df_postcode['livability'].isna().sum()

((422531, 2), 422531, 0)

In [336]:
df_postcode["PC6"] = df_postcode["PC6"].astype("string")

In [337]:
# Participant -> zipcode table.
# `sep` is passed by keyword (keyword-only in pandas 2.0, consistent with the
# other read_csv calls in this notebook).
df_participants_postcodes = pd.read_csv('../data/postcode/Postcodes_participants.csv', sep=';')
# .copy() so the column assignments below (and in later cells) act on an
# independent frame rather than on a slice of the raw table.
df_participants_postcodes = df_participants_postcodes[['Participant','Zipcode']].copy()
df_participants_postcodes["Zipcode"] = df_participants_postcodes["Zipcode"].astype("string")
# Remove spaces from the zipcodes (literal replacement).
df_participants_postcodes["Zipcode"] = df_participants_postcodes["Zipcode"].str.replace(" ", "", regex=False)
df_participants_postcodes.head(3)

Unnamed: 0,Participant,Zipcode
0,902,6843GT
1,904,6826BZ
2,907,6714HR


In [338]:
df_participants_postcodes['Zipcode'].nunique(),df_participants_postcodes.shape

(951, (1136, 2))

In [339]:
# Attach the livability score of each participant's zipcode as `env`;
# participants whose zipcode is not in the postcode table get -1.
# A vectorized lookup replaces the row loop, which rescanned the whole postcode
# table for every participant and assigned through chained indexing
# (`df['env'].iloc[i] = ...`, the source of SettingWithCopyWarning).
# If a postcode carries several scores the first one wins, as with `.values[0]` before.
livability_by_pc6 = df_postcode.drop_duplicates(subset='PC6').set_index('PC6')['livability']
df_participants_postcodes['env'] = (
    df_participants_postcodes['Zipcode'].map(livability_by_pc6).fillna(-1)
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [340]:
# 43 participants have no env value
df_participants_postcodes.groupby('env')['Zipcode'].count()

env
-1.00     43
3.00       4
4.00      97
5.00      69
6.00     340
7.00     209
8.00     138
9.00     236
Name: Zipcode, dtype: int64

In [342]:
df_participants_postcodes.head()

Unnamed: 0,Participant,Zipcode,env
0,902,6843GT,6.0
1,904,6826BZ,5.0
2,907,6714HR,6.0
3,908,6822CR,4.0
4,909,6822DM,4.0


In [453]:
df_participants_postcodes.shape,df_participants_postcodes.dtypes,type(lst_participants_id[0])

((1136, 3),
 Participant      int64
 Zipcode         string
 env            float64
 dtype: object,
 numpy.int64)

In [None]:
# lots of the participants we need have no postcodes data- therefore cannot get the environmental variable

In [451]:
df_participants_postcodes[df_participants_postcodes.Participant.isin(lst_participants_id)].shape,len(lst_participants_id)

((237, 3), 408)

### Let's try the environmental questions in Wave 6

In [565]:
# Wave 6 main questionnaire; `sep` passed by keyword (keyword-only in pandas 2.0,
# consistent with the other read_csv calls in this notebook).
df_env = pd.read_csv('../data/W6_Main_Questions.csv', sep=';')

In [566]:
df_env.columns

Index(['Child', 'DI_FFQ_drinks_D01_TriggerDate', 'DI_FFQ_drinks_D01_Weekend',
       'DI_FFQ_drinks_D01_TriggerTime', 'DI_FFQ_drinks_D01_VAS_thirst_A01',
       'DI_FFQ_drinks_D01_water_A01', 'DI_FFQ_drinks_D01_milk_A01',
       'DI_FFQ_drinks_D01_lightmilk_A01', 'DI_FFQ_drinks_D01_sweetmilk_A01',
       'DI_FFQ_drinks_D01_coffeetea_A01',
       ...
       'Vlog_boysvsgirls_heuvel_TriggerDate',
       'Vlog_boysvsgirls_heuvel_Weekend',
       'Vlog_boysvsgirls_heuvel_TriggerTime',
       'Vlog_boysvsgirls_heuvel_exposed_A01',
       'Vlog_boysvsgirls_heuvel_attitude_A01',
       'Vlog_coutchpotato_heuvel_TriggerDate',
       'Vlog_coutchpotato_heuvel_Weekend',
       'Vlog_coutchpotato_heuvel_TriggerTime',
       'Vlog_coutchpotato_heuvel_exposed_A01',
       'Vlog_coutchpotato_heuvel_attitude_A01'],
      dtype='object', length=524)

In [567]:
df_env = df_env[['Child','GEN_FAS_computer_A01' ,'GEN_FAS_car_A01', 'GEN_FAS_vacation_A01', 'GEN_FAS_ownroom_A01']]
df_env.shape

(1484, 5)

In [568]:
# This seems like a more viable solution than the postcode-based score.
# Restrict the questionnaire answers to the selected participants, then count
# the missing values per column.
df_env = df_env[df_env.Child.isin(lst_participants_id)]
df_env.isnull().sum()

Child                    0
GEN_FAS_computer_A01    41
GEN_FAS_car_A01         41
GEN_FAS_vacation_A01    41
GEN_FAS_ownroom_A01     41
dtype: int64

## Building Communication Networks

In [530]:
# Load data
primary_school = '../data/Buzz_W5_primary-schools.csv'
secondary_school = '../data/Buzz_W5_secondary-schools.csv'
# Selection of classes based on year 3 of the project (wave 5)

# Preprocess social buzz dataframe
# pd.concat replaces DataFrame.append (deprecated, removed in pandas 2.0).
df_comm = pd.concat([
    pd.read_csv(primary_school, sep=';', header=0),
    pd.read_csv(secondary_school, sep=';', header=0),
])
# only the individually sent messages
df_comm = df_comm[df_comm.Type=="my"]
# keep only the children listed under "participants" in simulation.json
df_comm = df_comm[df_comm.Child.isin(input_args["participants"])]
df_comm = df_comm.astype({'Participant': 'float64'})
df_comm.shape

(26051, 30)

In [531]:
# needed for the mapping - at least for now
# One row per distinct (Class, Participant, Child) combination that sent a message.
df_participants = (
    df_comm.groupby(['Class','Participant','Child'], as_index=False)['UMID']
    .count()
    .loc[:, ['Class','Participant','Child']]
)
df_participants.head()

Unnamed: 0,Class,Participant,Child
0,78.0,2807801633.0,1633.0
1,78.0,2807801636.0,1636.0
2,78.0,2807801637.0,1637.0
3,78.0,2807801639.0,1639.0
4,78.0,2807801640.0,1640.0


In [534]:
def generate_buzz_population(df_comm):
    """Build a directed, weighted communication graph from Buzz messages.

    For every class in the module-level ``classes`` list, messages are counted
    per (Participant, RecipientChild) pair. An edge sender -> recipient is added
    when the recipient also appears as a sender in that class. Nodes are the
    ``Child`` ids looked up in the module-level ``df_participants`` table.

    Parameters
    ----------
    df_comm : pandas.DataFrame
        Message table with at least the columns ``Class``, ``Participant``,
        ``RecipientChild`` and ``UMID``.

    Returns
    -------
    networkx.DiGraph
        Edge attribute ``weight`` is the pair's message count divided by the
        largest pair count within the same class (weight "version 1").

    Notes
    -----
    Depends on the globals ``classes`` and ``df_participants``; both must be
    defined before calling. Children who only receive messages are not
    recognised as class members, so edges towards them are dropped.
    """
    graph = nx.DiGraph()

    # Participant id -> Child id, built once instead of filtering the frame for
    # every edge. The first match wins, as with the former `.iloc[0]` lookup.
    first_rows = df_participants.drop_duplicates(subset='Participant')
    child_by_participant = dict(zip(first_rows['Participant'], first_rows['Child']))

    # create the connections - edges
    for cl in classes:
        # the weight is according to version 1
        df_temp = df_comm[df_comm.Class.isin([cl])]
        df_temp = df_temp.groupby(["Participant","RecipientChild"])["UMID"].count().reset_index(name='NumberMessages')
        if df_temp.empty:
            # no messages for this class: nothing to add (max() would fail on empty data)
            continue
        df_temp['Weight'] = df_temp['NumberMessages'] / df_temp['NumberMessages'].max()
        children_in_class = df_temp.Participant.unique()

        # only if the recipient is a member of the same class - inter-class
        # communication is excluded
        # this is not 100% ok, I need a list of all participants per class!
        same_class = df_temp[df_temp['RecipientChild'].isin(children_in_class)]

        # Columns are read by name; positional `row[0]` on a labelled Series is deprecated.
        for sender, recipient, weight in zip(same_class['Participant'],
                                             same_class['RecipientChild'],
                                             same_class['Weight']):
            graph.add_edge(child_by_participant[sender],
                           child_by_participant[recipient],
                           weight=weight)

    return graph

In [533]:
graph = generate_buzz_population(df_comm)

In [536]:
# seems like two nodes are not here, even though they were found in the communication data
len(graph.nodes()),len(graph.edges())

(406, 3122)

In [542]:
df_participants.shape

(408, 3)

In [551]:
set(df_comm.RecipientChild.unique()).difference(df_comm.Participant.unique())

{2807801634.0,
 2807801644.0,
 2807801648.0,
 2807801649.0,
 2807801655.0,
 2807804363.0,
 2908101613.0,
 2908101616.0,
 2908101619.0,
 2908101621.0,
 2908101622.0,
 2908101628.0,
 2908101629.0,
 2908104362.0,
 2927205441.0,
 2927205442.0,
 2927305477.0,
 3310002272.0,
 3330005987.0,
 3330005998.0,
 3330106014.0,
 3812402876.0,
 3912502892.0,
 5425605026.0,
 5425605030.0,
 5425805077.0,
 5425805082.0,
 5425805090.0,
 5425805097.0,
 5629605877.0,
 5629605888.0,
 5629705891.0,
 5830306061.0,
 5830306075.0,
 5830306083.0,
 5830306084.0}

In [545]:
df_comm.columns

Index(['Participant', 'School', 'Class', 'Child', 'Gender', 'Age', 'PI',
       'Date', 'Time', 'UMID', 'Format_text', 'Format_Media', 'Type',
       'RecipientChild', 'ClassReceive', 'Content', 'Media', 'ReactionInto',
       'Flagged', 'PostedWhileBanned', 'ShareSource', 'SharedVia',
       'Number_likes', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25',
       'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28', 'Unnamed: 29'],
      dtype='object')