In [2]:
import pandas as pd
import glob
import os
import json
import numpy as np

#### Updated calculation: check per wave the number of potential nominations a participant can receive. The maximum number of nominations is the number of participants per school (Thabo). Approx 10% of nominations should be out of class (according to Thabo), test this here. 

In [148]:
# taking nominations from all waves
path = r'../data/movez_nominations' # use your path
all_files = glob.glob(os.path.join(path , "*.csv"))
df_nom = pd.concat((pd.read_csv(f,sep=';', header=0) for f in all_files), ignore_index=True)
df_nom

Unnamed: 0,Child,Wave,Variable,Alter
0,1420,7,GEN_Advice,1422
1,1420,7,GEN_Leader,1429
2,1421,7,GEN_Advice,1428
3,1421,7,GEN_Leader,1425
4,1421,7,GEN_Leader,1429
...,...,...,...,...
199932,2932,3,Di_Modelling_reversed,2927
199933,2932,3,Di_Modelling_reversed,2934
199934,2932,3,Di_Modelling_reversed,2936
199935,2932,3,Di_Modelling_reversed,2938


##### First remove the 'extra' Alter children - who didn't gave consent and were thus not participants

In [149]:
lst_participants = list(set(df_nom['Child'].unique()).intersection(set(df_nom['Alter'].unique())))
len(lst_participants)

1470

In [150]:
df_nom = df_nom[(df_nom.Child.isin(lst_participants)) & (df_nom.Alter.isin(lst_participants)) ]

In [151]:
df_nom['Variable'].nunique(),df_nom['Child'].nunique(),df_nom['Alter'].nunique()

(15, 1463, 1469)

In [152]:
df_pp = pd.read_csv('../data/movez_metadata/Participant_Info.csv', sep=';', header=0)
df_pp.shape

(1484, 19)

In [153]:
df_pp = df_pp[(df_pp.Child.isin(lst_participants))]
df_pp.shape

(1470, 19)

In [154]:
df_pp.head(3)

Unnamed: 0,School,Level,Class_Y1,Class_Y2,Class_Y3,Child,Sex,Sex_f,Age_W1,Age_W5,W1,W2,W3,W4,W5,W6,W7,Y1_Sample,Y3_Sample
0,22,Secondary,52.0,52.0,,902,1,Female,13.0,,1,1,0,0,0,0,0,1,0
1,22,Secondary,52.0,52.0,,904,1,Female,12.0,,1,1,0,0,0,0,0,1,0
2,22,Secondary,52.0,52.0,,907,1,Female,12.0,,1,1,0,0,0,0,0,1,0


In [155]:
# New variables summing the number of participants per class and school
df_pp['num_per_school'] = df_pp.School.map(df_pp.groupby(['School'])['Level'].count())
df_pp['num_per_class_Y1'] = df_pp.Class_Y1.map(df_pp.groupby(['Class_Y1'])['Level'].count())
df_pp['num_per_class_Y2'] = df_pp.Class_Y2.map(df_pp.groupby(['Class_Y2'])['Level'].count())
df_pp['num_per_class_Y3'] = df_pp.Class_Y3.map(df_pp.groupby(['Class_Y3'])['Level'].count())

In [156]:
df_pp.head(10)

Unnamed: 0,School,Level,Class_Y1,Class_Y2,Class_Y3,Child,Sex,Sex_f,Age_W1,Age_W5,...,W4,W5,W6,W7,Y1_Sample,Y3_Sample,num_per_school,num_per_class_Y1,num_per_class_Y2,num_per_class_Y3
0,22,Secondary,52.0,52.0,,902,1,Female,13.0,,...,0,0,0,0,1,0,31,5.0,3.0,
1,22,Secondary,52.0,52.0,,904,1,Female,12.0,,...,0,0,0,0,1,0,31,5.0,3.0,
2,22,Secondary,52.0,52.0,,907,1,Female,12.0,,...,0,0,0,0,1,0,31,5.0,3.0,
3,22,Secondary,52.0,55.0,,908,0,Male,12.0,,...,1,0,0,0,1,0,31,5.0,8.0,
4,22,Secondary,52.0,55.0,,909,0,Male,12.0,,...,1,0,0,0,1,0,31,5.0,8.0,
6,22,Secondary,54.0,54.0,,935,0,Male,13.0,,...,1,0,0,0,1,0,31,6.0,5.0,
7,22,Secondary,54.0,54.0,,936,0,Male,13.0,,...,0,0,0,0,1,0,31,6.0,5.0,
8,22,Secondary,54.0,,,942,1,Female,13.0,,...,1,0,0,0,1,0,31,6.0,,
9,22,Secondary,54.0,54.0,,944,0,Male,13.0,,...,0,0,0,0,1,0,31,6.0,5.0,
10,22,Secondary,54.0,54.0,,947,1,Female,12.0,,...,1,0,0,0,1,0,31,6.0,5.0,


In [157]:
# mapping the necessary columns
df_nom['Child_School'] = df_nom['Child'].map(df_pp.set_index('Child')['School'])
df_nom['Child_School_Participants'] = df_nom['Child'].map(df_pp.set_index('Child')['num_per_school'])
df_nom['Child_Class_Y1'] = df_nom['Child'].map(df_pp.set_index('Child')['Class_Y1'])
df_nom['Child_Class_Y2'] = df_nom['Child'].map(df_pp.set_index('Child')['Class_Y2'])
df_nom['Child_Class_Y3'] = df_nom['Child'].map(df_pp.set_index('Child')['Class_Y3'])
df_nom['Child_num_per_class_Y1'] = df_nom['Child'].map(df_pp.set_index('Child')['num_per_class_Y1'])
df_nom['Child_num_per_class_Y2'] = df_nom['Child'].map(df_pp.set_index('Child')['num_per_class_Y2'])
df_nom['Child_num_per_class_Y3'] = df_nom['Child'].map(df_pp.set_index('Child')['num_per_class_Y3'])

df_nom['Alter_School'] = df_nom['Alter'].map(df_pp.set_index('Child')['School'])
df_nom['Alter_School_Participants'] = df_nom['Alter'].map(df_pp.set_index('Child')['num_per_school'])
df_nom['Alter_Class_Y1'] = df_nom['Alter'].map(df_pp.set_index('Child')['Class_Y1'])
df_nom['Alter_Class_Y2'] = df_nom['Alter'].map(df_pp.set_index('Child')['Class_Y2'])
df_nom['Alter_Class_Y3'] = df_nom['Alter'].map(df_pp.set_index('Child')['Class_Y3'])
df_nom['Alter_num_per_class_Y1'] = df_nom['Alter'].map(df_pp.set_index('Child')['num_per_class_Y1'])
df_nom['Alter_num_per_class_Y2'] = df_nom['Alter'].map(df_pp.set_index('Child')['num_per_class_Y2'])
df_nom['Alter_num_per_class_Y3'] = df_nom['Alter'].map(df_pp.set_index('Child')['num_per_class_Y3'])

df_nom['questions_per_wave'] = df_nom.Wave.map(df_nom.groupby(['Wave'])['Variable'].nunique())

df_nom.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_nom['Child_School'] = df_nom['Child'].map(df_pp.set_index('Child')['School'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_nom['Child_School_Participants'] = df_nom['Child'].map(df_pp.set_index('Child')['num_per_school'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_nom['Child_Class_Y1

Unnamed: 0,Child,Wave,Variable,Alter,Child_School,Child_School_Participants,Child_Class_Y1,Child_Class_Y2,Child_Class_Y3,Child_num_per_class_Y1,...,Child_num_per_class_Y3,Alter_School,Alter_School_Participants,Alter_Class_Y1,Alter_Class_Y2,Alter_Class_Y3,Alter_num_per_class_Y1,Alter_num_per_class_Y2,Alter_num_per_class_Y3,questions_per_wave
0,1420,7,GEN_Advice,1422,26,23,74.0,74.0,74.0,12.0,...,13.0,26,23,74.0,74.0,74.0,12.0,14.0,13.0,6
1,1420,7,GEN_Leader,1429,26,23,74.0,74.0,74.0,12.0,...,13.0,26,23,74.0,74.0,74.0,12.0,14.0,13.0,6
2,1421,7,GEN_Advice,1428,26,23,74.0,74.0,74.0,12.0,...,13.0,26,23,74.0,74.0,74.0,12.0,14.0,13.0,6


##### Check for same class and same school nominations

In [158]:
(df_nom['Child_School'] == df_nom['Alter_School']).value_counts()

True    135848
dtype: int64

In [159]:
df_nom_nonnan_y1 = df_nom.dropna(subset=['Alter_Class_Y1', 'Child_Class_Y1'])
df_nom_nonnan_y2 = df_nom.dropna(subset=['Alter_Class_Y2', 'Child_Class_Y2'])
df_nom_nonnan_y3 = df_nom.dropna(subset=['Alter_Class_Y3', 'Child_Class_Y3'])

In [160]:
(df_nom_nonnan_y1['Child_Class_Y1'] == df_nom_nonnan_y1['Alter_Class_Y1']).value_counts()

True     99150
False     8874
dtype: int64

In [161]:
(df_nom_nonnan_y2['Child_Class_Y2'] == df_nom_nonnan_y2['Alter_Class_Y2']).value_counts()

True     94295
False    13930
dtype: int64

In [162]:
(df_nom_nonnan_y3['Child_Class_Y3'] == df_nom_nonnan_y3['Alter_Class_Y3']).value_counts()

True     109809
False     16524
dtype: int64

##### Result: about 10% of nominations are coming from out of class. For simplicity of the calculation it might be better to keep only within class influencer calculation.

In [163]:
df_nom = df_nom.rename(columns={"Alter": "Nominee", "Child": "Nominator"})
df_nom

Unnamed: 0,Nominator,Wave,Variable,Nominee,Child_School,Child_School_Participants,Child_Class_Y1,Child_Class_Y2,Child_Class_Y3,Child_num_per_class_Y1,...,Child_num_per_class_Y3,Alter_School,Alter_School_Participants,Alter_Class_Y1,Alter_Class_Y2,Alter_Class_Y3,Alter_num_per_class_Y1,Alter_num_per_class_Y2,Alter_num_per_class_Y3,questions_per_wave
0,1420,7,GEN_Advice,1422,26,23,74.0,74.0,74.0,12.0,...,13.0,26,23,74.0,74.0,74.0,12.0,14.0,13.0,6
1,1420,7,GEN_Leader,1429,26,23,74.0,74.0,74.0,12.0,...,13.0,26,23,74.0,74.0,74.0,12.0,14.0,13.0,6
2,1421,7,GEN_Advice,1428,26,23,74.0,74.0,74.0,12.0,...,13.0,26,23,74.0,74.0,74.0,12.0,14.0,13.0,6
4,1421,7,GEN_Leader,1429,26,23,74.0,74.0,74.0,12.0,...,13.0,26,23,74.0,74.0,74.0,12.0,14.0,13.0,6
6,1421,7,GEN_Social_Facilitation,1427,26,23,74.0,74.0,74.0,12.0,...,13.0,26,23,74.0,74.0,74.0,12.0,14.0,13.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199932,2932,3,Di_Modelling_reversed,2927,39,87,127.0,127.0,127.0,14.0,...,10.0,39,87,127.0,127.0,127.0,14.0,10.0,10.0,15
199933,2932,3,Di_Modelling_reversed,2934,39,87,127.0,127.0,127.0,14.0,...,10.0,39,87,127.0,127.0,127.0,14.0,10.0,10.0,15
199934,2932,3,Di_Modelling_reversed,2936,39,87,127.0,127.0,127.0,14.0,...,10.0,39,87,127.0,127.0,127.0,14.0,10.0,10.0,15
199935,2932,3,Di_Modelling_reversed,2938,39,87,127.0,127.0,127.0,14.0,...,10.0,39,87,127.0,127.0,127.0,14.0,10.0,10.0,15


In [192]:
df_summed_nom = df_nom.groupby(['Nominee','Nominator','Wave'])['Wave'].count().reset_index(name='num_nominations')
df_summed_nom

Unnamed: 0,Nominee,Nominator,Wave,num_nominations
0,902,904,1,1
1,902,907,1,4
2,904,902,1,2
3,904,907,1,4
4,907,902,1,2
...,...,...,...,...
50315,6186,5448,6,1
50316,6186,5448,7,2
50317,6186,5449,6,2
50318,6186,5449,7,1


In [193]:
df_summed_nom['Nominee_id_School'] = df_summed_nom['Nominee'].map(df_pp.set_index('Child')['School'])
df_summed_nom['Nominee_size_School'] = df_summed_nom['Nominee'].map(df_pp.set_index('Child')['num_per_school'])
df_summed_nom['Nominee_id_Class_Y1'] = df_summed_nom['Nominee'].map(df_pp.set_index('Child')['Class_Y1'])
df_summed_nom['Nominee_id_Class_Y2'] = df_summed_nom['Nominee'].map(df_pp.set_index('Child')['Class_Y2'])
df_summed_nom['Nominee_id_Class_Y3'] = df_summed_nom['Nominee'].map(df_pp.set_index('Child')['Class_Y3'])
df_summed_nom['Nominee_size_class_Y1'] = df_summed_nom['Nominee'].map(df_pp.set_index('Child')['num_per_class_Y1'])
df_summed_nom['Nominee_size_class_Y2'] = df_summed_nom['Nominee'].map(df_pp.set_index('Child')['num_per_class_Y2'])
df_summed_nom['Nominee_size_class_Y3'] = df_summed_nom['Nominee'].map(df_pp.set_index('Child')['num_per_class_Y3'])

df_summed_nom['Nominator_id_School'] = df_summed_nom['Nominator'].map(df_pp.set_index('Child')['School'])
df_summed_nom['Nominator_size_School'] = df_summed_nom['Nominator'].map(df_pp.set_index('Child')['num_per_school'])
df_summed_nom['Nominator_id_Class_Y1'] = df_summed_nom['Nominator'].map(df_pp.set_index('Child')['Class_Y1'])
df_summed_nom['Nominator_id_Class_Y2'] = df_summed_nom['Nominator'].map(df_pp.set_index('Child')['Class_Y2'])
df_summed_nom['Nominator_id_Class_Y3'] = df_summed_nom['Nominator'].map(df_pp.set_index('Child')['Class_Y3'])
df_summed_nom['Nominator_size_class_Y1'] = df_summed_nom['Nominator'].map(df_pp.set_index('Child')['num_per_class_Y1'])
df_summed_nom['Nominator_size_class_Y2'] = df_summed_nom['Nominator'].map(df_pp.set_index('Child')['num_per_class_Y2'])
df_summed_nom['Nominator_size_class_Y3'] = df_summed_nom['Nominator'].map(df_pp.set_index('Child')['num_per_class_Y3'])

df_summed_nom['questions_per_wave'] = df_summed_nom.Wave.map(df_nom.groupby(['Wave'])['Variable'].nunique())

In [194]:
df_summed_nom['Nominee_size_class'] = np.select([df_summed_nom['Wave'].isin([1,2,3]), df_summed_nom['Wave'].isin([4]),df_summed_nom['Wave'].isin([5,6,7])], [df_summed_nom['Nominee_size_class_Y1'], df_summed_nom['Nominee_size_class_Y2'],df_summed_nom['Nominee_size_class_Y3']])
df_summed_nom['Nominator_size_class'] = np.select([df_summed_nom['Wave'].isin([1,2,3]), df_summed_nom['Wave'].isin([4]),df_summed_nom['Wave'].isin([5,6,7])], [df_summed_nom['Nominator_size_class_Y1'], df_summed_nom['Nominator_size_class_Y2'],df_summed_nom['Nominator_size_class_Y3']])
df_summed_nom = df_summed_nom.drop(columns=['Nominee_size_class_Y1', 'Nominee_size_class_Y2','Nominee_size_class_Y3','Nominator_size_class_Y1','Nominator_size_class_Y2','Nominator_size_class_Y3'])
df_summed_nom

Unnamed: 0,Nominee,Nominator,Wave,num_nominations,Nominee_id_School,Nominee_size_School,Nominee_id_Class_Y1,Nominee_id_Class_Y2,Nominee_id_Class_Y3,Nominator_id_School,Nominator_size_School,Nominator_id_Class_Y1,Nominator_id_Class_Y2,Nominator_id_Class_Y3,questions_per_wave,Nominee_size_class,Nominator_size_class
0,902,904,1,1,22,31,52.0,52.0,,22,31,52.0,52.0,,13,5.0,5.0
1,902,907,1,4,22,31,52.0,52.0,,22,31,52.0,52.0,,13,5.0,5.0
2,904,902,1,2,22,31,52.0,52.0,,22,31,52.0,52.0,,13,5.0,5.0
3,904,907,1,4,22,31,52.0,52.0,,22,31,52.0,52.0,,13,5.0,5.0
4,907,902,1,2,22,31,52.0,52.0,,22,31,52.0,52.0,,13,5.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50315,6186,5448,6,1,29,81,,,272.0,29,81,,,272.0,6,27.0,27.0
50316,6186,5448,7,2,29,81,,,272.0,29,81,,,272.0,6,27.0,27.0
50317,6186,5449,6,2,29,81,,,272.0,29,81,,,272.0,6,27.0,27.0
50318,6186,5449,7,1,29,81,,,272.0,29,81,,,272.0,6,27.0,27.0


In [196]:
# assing new variables to indicate if a nomination is coming from a same class participants, or not, or it is a NULL
df_summed_nom['sameClassY1'] = np.select([(df_summed_nom['Nominator_id_Class_Y1'].isnull() | df_summed_nom['Nominee_id_Class_Y1'].isnull()),(df_summed_nom.Nominee_id_Class_Y1 == df_summed_nom.Nominator_id_Class_Y1),(df_summed_nom.Nominee_id_Class_Y1 != df_summed_nom.Nominator_id_Class_Y1),],[-1,1,0])
df_summed_nom['sameClassY2'] = np.select([(df_summed_nom['Nominator_id_Class_Y2'].isnull() | df_summed_nom['Nominee_id_Class_Y2'].isnull()),(df_summed_nom.Nominee_id_Class_Y2 == df_summed_nom.Nominator_id_Class_Y2),(df_summed_nom.Nominee_id_Class_Y2 != df_summed_nom.Nominator_id_Class_Y2),],[-1,1,0])
df_summed_nom['sameClassY3'] = np.select([(df_summed_nom['Nominator_id_Class_Y3'].isnull() | df_summed_nom['Nominee_id_Class_Y3'].isnull()),(df_summed_nom.Nominee_id_Class_Y3 == df_summed_nom.Nominator_id_Class_Y3),(df_summed_nom.Nominee_id_Class_Y3 != df_summed_nom.Nominator_id_Class_Y3),],[-1,1,0])

In [197]:
df_summed_nom.groupby('sameClassY1')['Nominee'].count(),df_summed_nom.groupby('sameClassY2')['Nominee'].count(),df_summed_nom.groupby('sameClassY3')['Nominee'].count()

(sameClassY1
 -1    14890
  0     5198
  1    30232
 Name: Nominee, dtype: int64,
 sameClassY2
 -1    14558
  0     6722
  1    29040
 Name: Nominee, dtype: int64,
 sameClassY3
 -1     3203
  0     7573
  1    39544
 Name: Nominee, dtype: int64)

#### Keep only the within class nominations

In [198]:
df_valid_noms = df_summed_nom[(df_summed_nom.sameClassY1 == 1) & (df_summed_nom.sameClassY2 == 1) & (df_summed_nom.sameClassY3 == 1)]

In [199]:
(df_valid_noms['Nominee_size_class'] == df_valid_noms['Nominator_size_class']).value_counts()

True    24767
dtype: int64

In [200]:
df_valid_noms = df_valid_noms.groupby(['Nominee','Wave','questions_per_wave','Nominee_size_class'])['num_nominations'].sum().reset_index(name='nominations_per_wave')

In [201]:
df_valid_noms['total_nominations'] = df_valid_noms.Nominee.map(df_valid_noms.groupby(['Nominee'])['nominations_per_wave'].sum())
df_valid_noms

Unnamed: 0,Nominee,Wave,questions_per_wave,Nominee_size_class,nominations_per_wave,total_nominations
0,1046,1,13,10.0,12,28
1,1046,2,13,10.0,6,28
2,1046,3,15,10.0,4,28
3,1046,4,13,10.0,6,28
4,1047,1,13,10.0,21,39
...,...,...,...,...,...,...
3410,6137,6,6,21.0,8,32
3411,6137,7,6,21.0,4,32
3412,6138,5,6,21.0,35,93
3413,6138,6,6,21.0,35,93


##### Extra: See which questions (nominations) are frequently used by the participants. Thabo suggested to only use the more validated questions which usually the participants answered the most (GEN). There were some self-composed, experimental questions.. So we should take this into consideration when deciding which questions to keep

In [202]:
pd.set_option('display.max_rows', 80)
df_nom.groupby(['Wave','Variable'])['Nominee'].count().reset_index(name='size')

Unnamed: 0,Wave,Variable,size
0,1,DI_Com_Network,1272
1,1,DI_Impression_management,2477
2,1,DI_Modelling,1463
3,1,GEN_Advice,1641
4,1,GEN_Friendship,4274
5,1,GEN_Leader,1210
6,1,GEN_Respect,2876
7,1,GEN_Social_Facilitation,4855
8,1,GEN_Want2B,926
9,1,ME_Com_Network,2045
