# Assign a friend attribute to a virtual population


Creates a friend list (irregular contacts) for every member of a virtual population that already carries education and economic attributes.


Requires the input file "synthetic_population_info.csv".

---------
 
Time required: 25 minutes total

# Load packages, read CSV file

1. synthetic_population_info.csv : data with education+economic information created by synthetic_population_info-final

In [1]:
using Printf
using Plots
using CSV
using DataFrames
using StatsBase
using Statistics
using LightGraphs

In [2]:
# Load the synthetic population table (about 51 million individuals).

df = CSV.File("./synthetic_population_info.csv") |> DataFrame
first(df, 5)

Row,house_id,region_code,age,pre,elementary,junior,high,economic_region,office,person_id
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64
1,1,168,63,0,0,0,0,168,3584990,1
2,1,168,85,0,0,0,0,0,0,2
3,2,168,63,0,0,0,0,168,3578518,3
4,2,168,85,0,0,0,0,0,0,4
5,3,168,63,0,0,0,0,168,3568330,5


In [3]:
# Add the working columns used by the later cells.

# Row numbers double as person ids; later cells index rows of `df` by person_id.
df.person_id .= 1:nrow(df)

# Give every person an independent empty friend list. Broadcasting a single
# `[Vector{Int64}()]` over the column stores the *same* vector object in every
# row, so a later in-place `append!` on one person's list would appear in every
# row that still shares it.
df[!, :friends] = [Int64[] for _ in 1:nrow(df)]
df[!, :group] .= 0   # adults group (region + age group (10y))
df[!, :group2] .= 0  # minors group1  (region + age at the same institution)
df[!, :group3] .= 0  # minors group2 (region + school (10 classes))

show(df[1:5,:], allcols=true)

[1m5×14 DataFrame[0m
[1m Row [0m│[1m house_id [0m[1m region_code [0m[1m age   [0m[1m pre   [0m[1m elementary [0m[1m junior [0m[1m high  [0m[1m economic_region [0m[1m office  [0m[1m person_id [0m[1m friends [0m[1m group [0m[1m group2 [0m[1m group3 [0m
     │[90m Int64    [0m[90m Int64       [0m[90m Int64 [0m[90m Int64 [0m[90m Int64      [0m[90m Int64  [0m[90m Int64 [0m[90m Int64           [0m[90m Int64   [0m[90m Int64     [0m[90m Array…  [0m[90m Int64 [0m[90m Int64  [0m[90m Int64  [0m
─────┼─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
   1 │        1          168     63      0           0       0      0              168  3584990          1  Int64[]      0       0       0
   2 │        1          168     85      0           0       0      0                0        0          2  Int64[]      0       0       0
   3 │        2          168     

# Make Friends' List

In [4]:
"""
    Make_Barabasi_Networks(n_, id_, k_, n0_)

Generate a Barabási–Albert network on `n_` people and return each person's
friend list.

# Arguments
- `n_`: total number of nodes (people) in the finished network.
- `id_`: person ids; node `i` of the graph stands for `id_[i]`.
- `k_`: number of links each newly attached node creates.
- `n0_`: number of nodes of the initial seed graph, which is a cycle.

# Returns
A vector with one entry per node; entry `i` holds the person ids of the nodes
adjacent to node `i`, in the order `neighbors` reports them.
"""
function Make_Barabasi_Networks(n_, id_, k_, n0_)
    # Seed graph: a cycle of n0_ nodes, grown in place up to n_ nodes.
    # NOTE(review): barabasi_albert! presumably requires 1 <= k_ <= n0_ <= n_;
    # confirm against the LightGraphs documentation before passing small groups.
    g = barabasi_albert!(cycle_graph(n0_), n_, k_)

    # Read each node's adjacency list directly and translate node indices into
    # person ids. This avoids materialising a dense n_ × n_ adjacency matrix and
    # yields a concretely typed result instead of a Vector{Any}.
    return [id_[neighbors(g, v)] for v in vertices(g)]
end

Make_Barabasi_Networks (generic function with 1 method)

In [5]:
"""
    Make_Barabasi_Networks_p09(n_, id_)

Generate a densely connected Barabási–Albert network on `n_` people, aiming at
an average of about `0.9 * n_` links per person, and return each person's friend
list.

The attachment count is `k_ = ceil(0.9 * n_ / 2)`. The seed graph has `n0 = k_`
nodes and `k0 = ceil(0.9 * n0 * (n0 - 1) / 2)` edges, i.e. about 90% of all
possible links between the seed nodes.

# Arguments
- `n_`: total number of nodes (people) in the finished network.
- `id_`: person ids; node `i` of the graph stands for `id_[i]`.

# Returns
A vector with one entry per node; entry `i` holds the person ids of the nodes
adjacent to node `i`. An empty vector is returned when `n_ == 0`.
"""
function Make_Barabasi_Networks_p09(n_, id_)
    # Nothing to connect. The formulas below would give k_ == 0, which is
    # presumably rejected by barabasi_albert! as an attachment count.
    n_ == 0 && return Vector{eltype(id_)}[]

    # k_: links added per new node, n0: seed nodes, k0: seed edges
    k_ = ceil(Int64, n_*0.9/2)
    n0 = k_
    k0 = ceil(Int64, n0*(n0-1)/2*0.9)

    # SimpleGraph(n0, k0) builds the seed with n0 nodes and k0 edges; it is then
    # grown in place up to n_ nodes.
    g = barabasi_albert!(SimpleGraph(n0, k0), n_, k_)

    # Read each node's adjacency list directly and translate node indices into
    # person ids, without building a dense n_ × n_ adjacency matrix.
    return [id_[neighbors(g, v)] for v in vertices(g)]
end

Make_Barabasi_Networks_p09 (generic function with 1 method)

## 19 and older.

19-year-olds are placed in the 20–29 age group.
1. Split people into groups of about 200 (by region + 10-year age group).
2. Build a Barabási–Albert (scale-free) network for each group: N = 200, initial nodes = 15, links per new node = 10.

In [6]:
# Create an age_group by grouping ages into 10-year intervals:
# 0-9 -> 1, 10-19 -> 2, ..., 80-89 -> 9, 90-999 -> 10.

# Build the breakpoints once. The previous version rebuilt this vector for every
# row and copied the whole table through `transform`; here the new column is
# added to `df` in place.
age_breaks = vcat(0:10:90, 1000)
df[!, :age_group] = searchsortedlast.(Ref(age_breaks), df.age)
show(df[1:2,:], allcols=true)

[1m2×15 DataFrame[0m
[1m Row [0m│[1m house_id [0m[1m region_code [0m[1m age   [0m[1m pre   [0m[1m elementary [0m[1m junior [0m[1m high  [0m[1m economic_region [0m[1m office  [0m[1m person_id [0m[1m friends [0m[1m group [0m[1m group2 [0m[1m group3 [0m[1m age_group [0m
     │[90m Int64    [0m[90m Int64       [0m[90m Int64 [0m[90m Int64 [0m[90m Int64      [0m[90m Int64  [0m[90m Int64 [0m[90m Int64           [0m[90m Int64   [0m[90m Int64     [0m[90m Array…  [0m[90m Int64 [0m[90m Int64  [0m[90m Int64  [0m[90m Int64     [0m
─────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
   1 │        1          168     63      0           0       0      0              168  3584990          1  Int64[]      0       0       0          7
   2 │        1          168     85      0           0       0      0                0        0          2  Int

In [7]:
# Put 19-year-olds in the 20-29 age group (age_group 3).

# Select the rows with the boolean mask itself. Going through the person_id
# values only works while person_id happens to equal the row number.
df[df.age .== 19, :age_group] .= 3
println(" ")

 


In [8]:
# Split the adults of every (region, 10-year age group) cell into groups of
# roughly 200 people.
#
# The number of groups in a cell is its population divided by 200 (at least 1);
# every person then draws one of those groups uniformly at random.
# Minors keep group = 0 (they are grouped later by the age of their institution).


max_group = 200
n_max = 0

region_age_cells = groupby(df, [:region_code, :age_group])

for cell in region_age_cells
    # age_group 1 and 2 are the minors: leave their group at 0
    cell[1, :age_group] < 3 && continue

    cell_size = nrow(cell)
    n_group = max(1, cell_size ÷ max_group)

    # Offset by the largest group id used so far so that ids stay unique
    # across cells.
    cell[:, :group] .= rand(1:n_group, cell_size) .+ n_max
    n_max = maximum(cell[:, :group])
end

region_age_cells = nothing

In [9]:
# Create the friend list of every adult: one Barabási–Albert network per group.

adult_groups = groupby(df, :group)

for members in adult_groups
    # group == 0 holds the minors. Skip it by its key rather than by assuming
    # it is the first group that groupby reports.
    first(members.group) == 0 && continue

    n = nrow(members)
    id0 = members.person_id

    # Shuffle: order people by a random key (ties broken by office). This is the
    # same ordering the scratch :order column plus sort produced, without adding
    # and removing a full-length column on df.
    shuffle_key = rand(1:2*n, n)
    id = id0[sortperm(collect(zip(shuffle_key, members.office)))]

    # Sanity check: the shuffled ids, once sorted, must match the group's ids
    if n != sum(id0 .== sort(id))
        println(" ERROR **")
    end

    # Generate Network (seed of 15 nodes, 10 links per new node)
    list = Make_Barabasi_Networks(n, id, 10, 15)

    # Save to the friends list; person ids are used as row numbers of df
    df[id, :friends] .= list
end

adult_groups = nothing

총 성인의 network group = 218097


## 18 and younger (minors).

Children aged 0–2 are given no friends.

### Friends outside school + friends in school
0. Basically the same method as for adults.
1. Split minors into groups of about 200 (by region + age range of the same institution).
2. Build a Barabási–Albert (scale-free) network for each group: N = 200, initial nodes = 15, links per new node = 10.

### Friends at school
0. Start from the friends made by the method above.
1. Build a Barabási–Albert network per class so that most classmates are friends: about 90% of the possible links within the class are connected.
2. Combine 10 classes into one school, then redistribute the students over 10 mixed groups and connect each group at about 90% as well.

In [11]:
# Split the minors of every (region, institution) cell into groups of roughly
# 200 people by giving each person a random group number.
# Group sizes fluctuate: a cell with fewer than 400 people (e.g. 247) stays in
# a single group.

max_group = 200
n_max = 0

# (first age, last age) of kindergarten, elementary, junior and high school
school_age_ranges = ((3, 6), (7, 12), (13, 15), (16, 18))

for (n_st, n_fi) in school_age_ranges
    # region
    for rc in 1:250
        in_cell = (df.region_code .== rc) .& (n_st .<= df.age .<= n_fi)
        id = df[in_cell, :person_id]
        n = length(id)
        n == 0 && continue

        n_group = max(1, n ÷ max_group)

        # Offset by the largest group id used so far so that ids stay unique
        df[id, :group2] .= rand(1:n_group, n) .+ n_max
        n_max = maximum(df[id, :group2])
    end
end

In [12]:
# Create a list of students and friends (Step 1)
# About 200 kids per group, grouped by region and institution age range.

student_groups = groupby(df, :group2)

for members in student_groups
    # group2 == 0 holds everyone who was not assigned a student group. Skip it
    # by its key rather than by assuming it is the first group groupby reports.
    first(members.group2) == 0 && continue

    n = nrow(members)
    id0 = members.person_id

    # Shuffle: order people by a random key. This is the same ordering the
    # scratch :order column plus sort produced, without adding and removing a
    # full-length column on df.
    id = id0[sortperm(rand(1:2*n, n))]

    # Sanity check: the shuffled ids, once sorted, must match the group's ids
    if n != sum(id0 .== sort(id))
        println(" ERROR **")
    end

    # Generate Network (seed of 15 nodes, 10 links per new node)
    list = Make_Barabasi_Networks(n, id, 10, 15)

    # Save to the friends list; person ids are used as row numbers of df
    df[id, :friends] .= list
end

student_groups = nothing

총 학생의 network group = 35050


In [13]:
# Create a list of students and friends (step 2)
# One dense Barabási–Albert network per class, so that about 90% of the possible
# links between classmates are connected.

# Columns 4:7 are the class-id columns (pre, elementary, junior, high).
for edu in 4:7

    classes = groupby(df, edu)

    for members in classes
        # Class id 0 means "does not attend this kind of school". Skip it by its
        # key rather than by assuming it is the first group groupby reports.
        first(members[!, edu]) == 0 && continue

        n = nrow(members)

        # You have to take the teacher out of it
        is_student = members.age .<= 18
        id0 = members.person_id[is_student]
        n0 = length(id0)

        # A class without students has nobody to connect
        n0 == 0 && continue

        # Shuffle the whole class by a random key, then keep only the students.
        # This reproduces the former sort on a scratch :order column without
        # requiring that column to exist on df.
        perm = sortperm(rand(1:2*n, n))
        id = members.person_id[perm][is_student[perm]]

        # Sanity check: the shuffled ids, once sorted, must match the student ids
        if n0 != sum(id0 .== sort(id))
            println(" ERROR **")
        end

        # Generate Network
        list = Make_Barabasi_Networks_p09(n0, id)

        # Append to the friends lists. `df[id, :friends]` copies only the outer
        # vector, so append! grows the vectors stored in df itself.
        append!.(df[id, :friends], list)
    end
    classes = nothing
end

edu = 4 : 총 class group = 35401
edu = 5 : 총 class group = 121894
edu = 6 : 총 class group = 54973
edu = 7 : 총 class group = 57840


In [15]:
# Kindergarten,
# Create a list of students and friends (Step 3)
# Create a concept like school (group3 > region + same age)
# Group about 10 classes together into a school, redistribute the students over
# that many mixed groups, and build a dense Barabasi model for each mixed group
# (about 90% of the possible links are connected after redistribution).


df[!, :"order"] .= 0   # scratch column used to shuffle people
max_group = 10         # classes per school

A = groupby(df, [:region_code, :age])

cols = 4   # position of the kindergarten class-id column
for rc in 1:250
    for an in 3:6
        # Ask the grouping whether this (region, age) cell exists instead of
        # filtering and copying every row of df just to count the matches.
        key = (region_code=rc, age=an)
        haskey(A, key) || continue
        df_tmp = A[key]

        # Class ids present in this region and age (0 = not enrolled).
        n_cls = setdiff(sort(unique(df_tmp[:, cols])), [0])
        n_classes = length(n_cls)
        n_classes == 0 && continue

        # How many schools are there
        n_group = max(1, round(Int64, n_classes/max_group))

        # Number of classes per school
        n_count = max(1, round(Int64, n_classes/n_group))

        # Class assignments by school
        for i in 1:n_group
            cls_min = n_cls[n_count*(i-1)+1]
            # The last school takes every remaining class
            cls_max = i == n_group ? n_cls[end] : n_cls[min(n_count*i, n_classes)]

            # You have to take the teacher out of it.
            in_school = (cls_min .<= df_tmp.pre .<= cls_max) .& (df_tmp.age .<= 18)
            id = df_tmp[in_school, :person_id]
            n = length(id)

            # Redistribute the students at random over the school's class ids
            df[id, :group3] .= rand((cls_min:cls_max), n)

            B = groupby(df[id, :], :group3)
            nn = length(B)

            # Every class id in the range is expected to have received someone
            if nn != (cls_max - cls_min + 1)
                println(" ERROR ??? ", nn, " , n_count = ", n_count," , ??? ", cls_max - cls_min + 1)
            end

            for j in 1:nn

                df_tmp2 = B[j]
                n_pop = nrow(B[j])
                id0 = df_tmp2[:, :person_id]

                # shuffle
                df_tmp2[:, :order] .= rand((1:2*n_pop), n_pop)
                df_tmp2 = sort(df_tmp2, [:order])
                id2 = df_tmp2[:, :person_id]

                if n_pop != sum(id0 .== sort(id2))
                    println(" ERROR **")
                end

                # Generate Network
                list = Make_Barabasi_Networks_p09(n_pop, id2)

                # Append to the friends lists. `df[id2, :friends]` copies only
                # the outer vector, so append! grows the vectors stored in df.
                append!.(df[id2, :friends], list)

                df_tmp2 = nothing
            end
            B = nothing

            # Reset the scratch assignment for the next school
            df[id, :group3] .= 0
        end
        df_tmp = nothing
    end
end

select!(df, Not(:order))
A = nothing

In [16]:
# Elementary, junior, and high school : step 3
# Same procedure as for kindergarten: group about 10 classes into a school,
# redistribute the students over that many mixed groups, and build a dense
# Barabasi model for each mixed group.

df[!, :"order"] .= 0   # scratch column used to shuffle people
max_group = 10         # classes per school

A = groupby(df, [:region_code, :age])

# Class-id column position => (first age, last age) of its students
school_ages = Dict(5 => (7, 12), 6 => (13, 15), 7 => (16, 18))

for cols in 5:7
    println(" cols = ", cols)

    # The age range depends only on the school type, so look it up once here
    n_st, n_fi = school_ages[cols]

    for rc in 1:250
        for an in n_st:n_fi
            # Ask the grouping whether this (region, age) cell exists instead of
            # filtering and copying every row of df just to count the matches.
            key = (region_code=rc, age=an)
            haskey(A, key) || continue
            df_tmp = A[key]

            # Class id of every person in the cell (0 = not enrolled). Column
            # positions 5, 6, 7 are elementary, junior, high.
            class_id = df_tmp[:, cols]

            n_cls = setdiff(sort(unique(class_id)), [0])
            n_classes = length(n_cls)
            n_classes == 0 && continue

            # How many schools are there
            n_group = max(1, round(Int64, n_classes/max_group))

            # Number of classes per school
            n_count = max(1, round(Int64, n_classes/n_group))

            # Class assignments by school
            for i in 1:n_group
                cls_min = n_cls[n_count*(i-1)+1]
                # The last school takes every remaining class
                cls_max = i == n_group ? n_cls[end] : n_cls[min(n_count*i, n_classes)]

                # You have to take the teacher out of it.
                in_school = (cls_min .<= class_id .<= cls_max) .& (df_tmp.age .<= 18)
                id = df_tmp[in_school, :person_id]
                n = length(id)

                # Redistribute the students at random over the school's class ids
                df[id, :group3] .= rand((cls_min:cls_max), n)

                B = groupby(df[id, :], :group3)
                nn = length(B)

                # Every class id in the range is expected to have received someone
                if nn != (cls_max - cls_min + 1)
                    println(" ERROR ??? ", nn, " , n_count = ", n_count," , ??? ", cls_max - cls_min + 1)
                end

                for j in 1:nn

                    df_tmp2 = B[j]
                    n_pop = nrow(B[j])
                    id0 = df_tmp2[:, :person_id]

                    # shuffle
                    df_tmp2[:, :order] .= rand((1:2*n_pop), n_pop)
                    df_tmp2 = sort(df_tmp2, [:order])
                    id2 = df_tmp2[:, :person_id]

                    if n_pop != sum(id0 .== sort(id2))
                        println(" ERROR **")
                    end

                    # Generate Network
                    list = Make_Barabasi_Networks_p09(n_pop, id2)

                    # Append to the friends lists. `df[id2, :friends]` copies
                    # only the outer vector, so append! grows the vectors stored
                    # in df.
                    append!.(df[id2, :friends], list)

                    df_tmp2 = nothing
                end
                B = nothing

                # Reset the scratch assignment for the next school
                df[id, :group3] .= 0
            end
            df_tmp = nothing
        end
    end
end

select!(df, Not(:order))
select!(df, Not(:group3))
A = nothing

 cols = 5
 cols = 6
 cols = 7


In [17]:
# Copy data just in case
# `copy` duplicates every column, so adding, dropping or overwriting columns of
# df2 leaves df untouched. The friend vectors themselves are shared with df until
# the next cell replaces them with fresh de-duplicated vectors, which avoids
# duplicating every person's friend list as `deepcopy` would.

df2 = copy(df)
first(df2, 5)

Row,house_id,region_code,age,pre,elementary,junior,high,economic_region,office,person_id,friends,group,group2,age_group
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Array…,Int64,Int64,Int64
1,1,168,63,0,0,0,0,168,3584990,1,"[7282442, 12949292, 1831311, 42227565, 33986382, 298291, 15960249, 16182219, 30322853, 9218036 … 38196387, 24974722, 31823635, 29915129, 42229493, 12954042, 46531742, 17708656, 14457351, 38176399]",178745,0,7
2,1,168,85,0,0,0,0,0,0,2,"[34039199, 11848888, 16776962, 49845195, 11266856, 32541754, 6874537, 38637919, 50400289, 15932193 … 42775331, 19627264, 15463744, 50182146, 14059364, 44200790, 6874597, 31027310, 42856889, 37677448]",179007,0,9
3,2,168,63,0,0,0,0,168,3578518,3,"[42227541, 1248544, 38849896, 12633486, 48247118, 4992977, 10103496, 1126396, 42995676, 14373582 … 14933077, 12616105, 39658326, 35320913, 22750738, 41336614, 15960302, 25562272, 18879117, 25208925]",178747,0,7
4,2,168,85,0,0,0,0,0,0,4,"[36679308, 50184042, 47822992, 51223638, 23718638, 36325530, 42227743, 27874966, 35842872, 268146 … 3138664, 14058044, 26449168, 14263992, 40069467, 50185458, 29117269, 50184312, 24127201, 14373685]",179001,0,9
5,3,168,63,0,0,0,0,168,3568330,5,"[22861087, 50165189, 44682802, 1248478, 51381860, 29607144, 38816931, 29839811, 20069792, 26994801 … 9953357, 25208929, 1831350, 25193332, 27721211, 37281020, 15305439, 40408353, 42566315, 51380040]",178780,0,7


In [18]:
# Slow step: it touches every person's friend list.
# A student can receive the same friend from several networks, so duplicates
# are removed; the ids are then put in ascending order for easier viewing.

map!(friend_ids -> sort!(unique(friend_ids)), df2.friends, df2.friends)

println(" ")

 


In [19]:
# Number of friends of every person
df2.n_friends = length.(df2.friends)
println(" ")

 


In [20]:
# Drop the attribute and working columns; person_id is deliberately kept.
select!(df2, Not([:house_id, :region_code, :age, :pre, :elementary, :junior, :high,
                  :economic_region, :office, :group, :group2, :age_group]))
println(" ")

 


In [21]:
# Write the friend table to disk, then preview its first two rows

df2 |> CSV.write("synthetic_population_friends.csv")

show(df2[1:2, :]; allcols=true)

[1m2×3 DataFrame[0m
[1m Row [0m│[1m person_id [0m[1m friends                           [0m[1m n_friends [0m
     │[90m Int64     [0m[90m Array…                            [0m[90m Int64     [0m
─────┼─────────────────────────────────────────────────────────
   1 │         1  [298291, 566248, 1831311, 484682…         64
   2 │         2  [6874537, 6874597, 11266856, 118…         23

In [22]:
# Save histogram of the number of friends per person

n_fri = df2[:, :n_friends]
println(mean(n_fri))
println(maximum(n_fri))

# One bin per friend count, normalised to a probability density.
# The misspelled keyword `legned` was dropped; `leg = false` already hides the
# legend.
histogram(
    n_fri;
    color = "darkgreen", lw = 10, xlabel = "friends #", ylabel = "population",
    bins = 0:1:maximum(n_fri)+1, width = 0.9, leg = false, normalization = :pdf,
    size = (600, 450), dpi = 1000,
    background_color = :transparent, foreground_color = :black,
)
savefig("./friends.png")

39.1207278852515
181


"/Users/mkchae/Desktop/epid/IBM/Mk_friends.png"