In [6]:
# Q3.1  Subsection: Add social network interference score
import math
from igraph import *
import pandas as pd
import collections


## Build the social network graph for students in each school.

The intent is to understand how the treatment affects students through their social networks. After processing, we can count, for each student, the number of treated (trained) friends who are one jump and two jumps away in the best-friend network.

In [7]:
# Load the study data and derive TREAT_ID: the second character of the TREAT
# string, with 0 filled in where TREAT is missing.
df = pd.read_csv('data/anticonflict_study_dataframe.csv')
df['TREAT_ID'] = df['TREAT'].str[1].fillna(0)

# One dataframe per school, stored so that sch_dfs[k] holds SCHID == k+1.
# School IDs with no rows still get an (empty) entry, keeping positions aligned.
sch_count = df['SCHID'].max()
sch_dfs = [
    df.loc[df['SCHID'] == sch_id].reset_index(drop=True)
    for sch_id in range(1, sch_count + 1)
]

# Report the number of students per school, then the number of schools.
for sch_df in sch_dfs:
    print(sch_df.shape[0])
print(len(sch_dfs))


  df = pd.read_csv('data/anticonflict_study_dataframe.csv')


277
529
138
0
115
467
620
635
394
321
487
565
294
213
103
241
350
446
502
589
444
685
822
839
839
207
289
389
289
558
392
408
384
559
370
603
170
695
445
166
508
441
117
279
440
149
300
319
414
178
551
0
847
0
233
718
279
619
579
661
60


In [8]:
# Column subsets for the different nomination networks. Each list names the
# identifier column(s), the nomination columns, and the derived TREAT_ID flag.
# NOTE(review): only the best-friend list is used in the cells shown here, and
# add_schl_bf_neighbor redefines it locally; the ST*/CN* lists are presumably
# for "spend time" and "conflict" networks — confirm before relying on them.
spendtime_noms = ["UID","ST1", "ST2", "ST3", "ST4", "ST5", "ST6", "ST7", "ST8", "ST9", "ST10",'TREAT_ID']
conflict_noms = ["UID","CN1", "CN2", "CN3", "CN4", "CN5",'TREAT_ID']
bestfriend_noms = ["UID","ID","SCHID","BF1", "BF2",'TREAT_ID']

In [9]:
def find_tid_index(df,tid):
    """Return the index label of the first row of ``df`` whose 'TID' equals ``tid``.

    The value returned is a label taken from ``df.index``; it only coincides
    with the row position when ``df`` has a default 0..n-1 index.

    Raises:
        IndexError: if no row has the requested TID.
    """
    is_match = df['TID'] == tid
    matching_labels = df.index[is_match].tolist()
    return matching_labels[0]

def build_edge_list(df, nom_list):
    """Build a vertex-position edge list from nomination columns.

    Each row of ``df`` is one vertex, numbered by its row position (0..n-1),
    which is the numbering a graph built with ``n = len(df)`` vertices expects.
    For every column in ``nom_list`` whose value matches some row's 'TID', an
    edge ``[nominator_position, nominee_position]`` is emitted.

    Positions are used rather than index labels so the result stays correct
    when ``df`` has been filtered and its index is no longer 0..n-1.

    Args:
        df: dataframe with a 'TID' column and every column named in ``nom_list``.
        nom_list: names of the nomination columns (e.g. ["BF1", "BF2"]).

    Returns:
        List of ``[source, target]`` pairs of integer row positions, in row
        order and, within a row, in ``nom_list`` order. Missing nominations and
        nominations of TIDs not present in ``df`` are skipped.
    """
    tids = df['TID'].tolist()

    # TID -> row position. If a TID appears more than once the first row wins.
    tid_to_pos = {}
    for pos, tid in enumerate(tids):
        tid_to_pos.setdefault(tid, pos)
    source_positions = [tid_to_pos[tid] for tid in tids]

    edge_list = []
    nominations = df[list(nom_list)].itertuples(index=False, name=None)
    for source_pos, nominees in zip(source_positions, nominations):
        for nominee in nominees:
            # Skip empty nomination slots (NaN / None).
            if pd.isna(nominee):
                continue
            # Float nominations such as 22.0 hash and compare equal to TID 22,
            # so the lookup matches without an explicit int() conversion.
            if nominee in tid_to_pos:
                edge_list.append([source_pos, tid_to_pos[nominee]])

    return edge_list


def add_schl_bf_neighbor(sch_dfs,sch_idx):
    """Count treated best friends one and two jumps away for one school.

    Builds the best-friend nomination graph of ``sch_dfs[sch_idx]`` (treated as
    undirected) and, for every student, counts the treated students
    (TREAT_ID == "1") whose shortest-path distance is exactly 1 and exactly 2.

    Args:
        sch_dfs: list of per-school dataframes containing at least the columns
            UID, ID, SCHID, BF1, BF2 and TREAT_ID.
        sch_idx: position of the school in ``sch_dfs``.

    Returns:
        A new dataframe with the best-friend columns of that school (rows with
        ID == 999 removed, original index labels kept) plus 'TID', 'bf_1' and
        'bf_2'. The input dataframe is not modified.
    """
    bestfriend_noms = ["UID","ID","SCHID","BF1", "BF2",'TREAT_ID']

    sch_df = sch_dfs[sch_idx]
    bf = sch_df[sch_df.columns.intersection(bestfriend_noms)]
    # Drop rows with ID 999 (presumably a placeholder ID — confirm against the
    # codebook). Copy so the column assignments below act on an independent
    # frame instead of a slice of sch_df (avoids SettingWithCopyWarning).
    bf = bf.loc[bf['ID']!=999].copy()
    # TID is the true student ID (the ID column contains entry errors); it is
    # recovered from UID by removing the school prefix.
    bf['TID'] = bf['UID']-bf['SCHID']*100000

    # Build the edge list. Graph vertices are row positions (0..n-1), not TIDs,
    # so hand build_edge_list a frame whose index equals the row position; the
    # filter above may have left gaps in bf's own index labels.
    nom_list = ["BF1", "BF2"]
    edge_list = build_edge_list(bf.reset_index(drop=True),nom_list)

    # Build the graph from the directed nominations, then make it undirected so
    # a nomination in either direction counts as a friendship tie.
    n_students = bf.shape[0]
    g = Graph(n = n_students,edges=edge_list,directed=True)
    g.vs['name'] = list(bf['TID'])
    # Note: the 'tid' vertex attribute stores TREAT_ID (treatment flag), not TID.
    g.vs['tid'] = list(bf['TREAT_ID'])
    g.vs['uid'] = list(bf['UID'])
    g.to_undirected()

    # Shortest distance from every vertex to each treated vertex.
    target_vertices = g.vs.select(tid_eq="1")
    if len(target_vertices) == 0:
        # No treated student in this school: nobody has a treated neighbor.
        bf1_list = [0] * n_students
        bf2_list = [0] * n_students
    else:
        vs_distance = g.shortest_paths(source=g.vs, target=target_vertices)
        # Number of treated students at distance exactly 1 and exactly 2.
        bf1_list = [distances.count(1) for distances in vs_distance]
        bf2_list = [distances.count(2) for distances in vs_distance]

    bf['bf_1'] = bf1_list
    bf['bf_2'] = bf2_list

    return bf


    
# Spot-check a single school: sch_dfs is 0-based while SCHID is 1-based, so
# position 7 is the school with SCHID 8.
add_schl_bf_neighbor(sch_dfs,7)

    
    

Unnamed: 0,SCHID,UID,ID,BF1,BF2,TREAT_ID,TID,bf_1,bf_2
0,8,800001,1,22.0,61.0,0,1,0,0
1,8,800002,2,54.0,41.0,0,2,0,0
2,8,800003,3,55.0,9.0,0,3,0,0
3,8,800004,4,60.0,,0,4,1,0
4,8,800005,5,71.0,,0,5,0,0
...,...,...,...,...,...,...,...,...,...
625,8,800640,640,,,0,640,0,0
626,8,800641,641,,,0,641,0,0
627,8,800642,642,,,0,642,0,0
628,8,800643,643,,,0,643,0,0


In [10]:
def add_bf_neighbor(sch_dfs):
    """Compute treated-neighbor counts for every school and stack the results.

    Args:
        sch_dfs: list of per-school dataframes, as accepted by
            add_schl_bf_neighbor.

    Returns:
        One dataframe with the rows of every school in list order and a fresh
        0..n-1 index.
    """
    # Collect the per-school frames first and concatenate once; concatenating
    # inside the loop re-copies the accumulated frame on every iteration.
    per_school = [add_schl_bf_neighbor(sch_dfs, i) for i in range(len(sch_dfs))]
    return pd.concat(per_school, ignore_index=True)

# Run the neighbor computation for all schools and display the combined frame.
total_df = add_bf_neighbor(sch_dfs)
total_df

Unnamed: 0,SCHID,UID,ID,BF1,BF2,TREAT_ID,TID,bf_1,bf_2
0,1,100001,1,140.0,12.0,0,1,0.0,0.0
1,1,100002,2,21.0,31.0,2,2,0.0,0.0
2,1,100003,3,999.0,72.0,0,3,1.0,0.0
3,1,100004,4,19.0,32.0,0,4,1.0,0.0
4,1,100005,5,258.0,,0,5,0.0,0.0
...,...,...,...,...,...,...,...,...,...
24331,60,6000670,670,,,0,670,0.0,0.0
24332,60,6000671,671,519.0,194.0,0,671,0.0,0.0
24333,60,6000672,672,405.0,481.0,0,672,0.0,0.0
24334,60,6000673,673,,,0,673,0.0,0.0


## Calculate the Social Network Effect Score

$\text{Network Effect} = \sum_{i=0}^{2} bf_i \cdot \frac{1}{2^{i}}$

$bf_i$ is the number of treated (trained) students exactly $i$ jumps away from a given student, with $bf_0 = 1$ if the student was treated and $0$ otherwise. We assume the effect decays by a factor of 0.5 per jump. Thus, treated individuals receive 1 "exposure" point, individuals receive 0.5 exposure points for each neighboring student (best friend) who received treatment, and they receive 0.25 points for each neighbor's neighbor who received treatment (the best friends of their best friends). No exposure points are assigned for treated individuals who are more than two "jumps" away from a given individual.

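This strategy accounts for interference by giving each student an exposure score that reflects the treatment received by nearby peers in the best-friend network, rather than only the student's own treatment status.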

In [22]:
# bf_0 flags the student's own treatment: 1 when TREAT_ID is '1', otherwise 0.
total_df = total_df.assign(bf_0 = (total_df['TREAT_ID'] == '1').astype('int64'))
# Exposure score: own treatment counts 1, each treated friend one jump away
# counts 0.5, and each treated friend two jumps away counts 0.25.
total_df = total_df.assign(
    Network_Effect = total_df['bf_0'] + total_df['bf_1']*0.5 + total_df['bf_2']*0.25
)
total_df

Unnamed: 0,SCHID,UID,ID,BF1,BF2,TREAT_ID,TID,bf_1,bf_2,bf_0,Network_Effect
0,1,100001,1,140.0,12.0,0,1,0.0,0.0,0,0.0
1,1,100002,2,21.0,31.0,2,2,0.0,0.0,0,0.0
2,1,100003,3,999.0,72.0,0,3,1.0,0.0,0,0.5
3,1,100004,4,19.0,32.0,0,4,1.0,0.0,0,0.5
4,1,100005,5,258.0,,0,5,0.0,0.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
24331,60,6000670,670,,,0,670,0.0,0.0,0,0.0
24332,60,6000671,671,519.0,194.0,0,671,0.0,0.0,0,0.0
24333,60,6000672,672,405.0,481.0,0,672,0.0,0.0,0,0.0
24334,60,6000673,673,,,0,673,0.0,0.0,0,0.0


Merge the network data into the final dataset based on the UID.

In [24]:
# Attach the network exposure score to the preprocessed student-level data.
dataset = pd.read_csv('data/preprocessed_conflict_data_iter0MICE.csv')
# Keep only the join key and the score so no other network columns leak in.
total_df = total_df[['UID','Network_Effect']]
# Join explicitly on UID instead of relying on whichever column names the two
# frames happen to share. The inner join keeps only students present in both.
final_df = pd.merge(dataset,total_df,on='UID',how='inner')
final_df.to_csv('data/final_student_level_data.csv')