# Randomized Twitter data generation

> Note: since the data has been already generated, this notebook does not need to be run again.

This notebook takes as input:
* `'../data/cbsacode.csv'` (list of 50 US cities)
* `'../data/tract_ids.gpkg'` (list of unique census tract IDs for all cities)
Then,
* 5 cities from the `cbsacode.csv` list are chosen at random
* for each of those cities, 1000 geolocated points ("users") are randomly sampled (within the metropolitan are boundaries), and user IDs are randomly allocated - those are the nodes of our social network
* for each of those cities, 2500 unique user pairs are randomly sampled ("mutual followerships") - those are the edges of our social network
* the node and edge files (for all 5 cities at once) are saved to: `./data/twitter_dummy/twitter_users_tract_random.gpkg` and `./data/twitter_dummy/twitter_follower_edges_random.gpkg`

In [1]:
# import libraries
import os
import pandas as pd
import geopandas as gpd
import shapely
import random
random.seed(43)
from itertools import permutations

In [2]:
# make subfolders for results
os.makedirs("../data/twitter_dummy/", exist_ok=True)

In [3]:
# define filepaths
cbsacode_file = '../data/cbsacode.csv'
tract_file = '../data/tract_ids.gpkg'

In [4]:
# define sizes for generated data sets
nr_points = 1000 # number of points (nodes) to sample from each city
nr_edges = 2500 # number of node pairs (edges) to sample from each city 

In [5]:
# define projected CRS for distance computation
proj_crs = "ESRI:102010"

### CBSA set

In [6]:
# read in cbsacodes, randomly pick 5 cities
df_cbsacodes = pd.read_csv(cbsacode_file)
cities_random = random.choices(df_cbsacodes.cbsacode, k = 5)
df_cbsacodes = df_cbsacodes[df_cbsacodes.cbsacode.isin(cities_random)].copy().reset_index(drop=True)

In [7]:
# drop not needed columns
df_cbsacodes = df_cbsacodes[["cbsacode", "name", "full_name", "geometry", "west", "south", "east", "north"]]
# convert text to shapely Polygon
df_cbsacodes["geometry"] = df_cbsacodes.geometry.apply(lambda x: shapely.from_wkt(x))
# convert to geodataframe
gdf_cbsacodes = gpd.GeoDataFrame(df_cbsacodes, crs = "EPSG:4326")
# save to folder
gdf_cbsacodes.to_file("../data/twitter_dummy/cbsacode_random.gpkg")

### Nodes

* randomly sample 1000 points for each of the randomly sampled cities (within the given city boundaries)
* randomly allocate user IDs

In [8]:
cbsacode = []
geoms = []

for city in cities_random:
    # get 1000 sample points from the city geometry
    points = gdf_cbsacodes[gdf_cbsacodes["cbsacode"]==city].geometry.sample_points(
        size = nr_points, 
        rng = 42
        )
    points_exploded = points.explode(index_parts=False).reset_index(drop=True)
    geoms += list(points_exploded)
    cbsacode += [city] * nr_points

In [9]:
gdf_nodes = gpd.GeoDataFrame(
    {
        "cbsacode": cbsacode,
        "geometry": geoms
    },
    crs = gdf_cbsacodes.crs
)

gdf_nodes["user_id"] = gdf_nodes.index
gdf_nodes["lat_home"] = gdf_nodes.apply(lambda x: x.geometry.xy[1][0], axis = 1)
gdf_nodes["lon_home"] = gdf_nodes.apply(lambda x: x.geometry.xy[0][0], axis = 1)

In [10]:
# add home census tract ID (for regressions to be run later)
tract_ids = gpd.read_file(tract_file)
mytree = shapely.strtree.STRtree(geoms=tract_ids.geometry)
q = mytree.query(gdf_nodes.geometry, predicate="within")
# q[0] ...nodes indeces
# q[1] ...tract_id indeces
gdf_nodes["tract_home"] = None
gdf_nodes.loc[q[0], "tract_home"] = list(tract_ids.loc[q[1], "full_geoid"])

### Edges

* Randomly sample 2500 node pairs for each city (user_id1, user_id2)
* Compute distance (on projected crs: "ESRI:102010")

In [11]:
# only for distance computations
gdf_nodes_proj = gdf_nodes.to_crs(proj_crs)

In [12]:
cbsacode = []
user_id1 = []
user_id2 = []
distances = []

for city in cities_random:
    
    # get random user1-user2 pairs for each city
    user_ids_all = list(gdf_nodes[gdf_nodes["cbsacode"]==city].user_id)
    user_ids = random.choices(
        [per for per in permutations(user_ids_all, 2)],
        k = nr_edges
    )

    # compute distances
    user0 = gdf_nodes_proj.loc[[user[0] for user in user_ids]].copy().reset_index(drop=True)
    user1 = gdf_nodes_proj.loc[[user[1] for user in user_ids]].copy().reset_index(drop=True)
    dist = [int(d) for d in user0.distance(user1)]
    
    cbsacode += [city] * nr_edges
    user_id1 += [user[0] for user in user_ids]
    user_id2 += [user[1] for user in user_ids]
    distances += dist

df_edges = pd.DataFrame(
    {
        "cbsacode": cbsacode,
        "user_id1": user_id1,
        "user_id2": user_id2,
        "distance_m": distances
    }
)

df_edges["lat_home1"] = df_edges.apply(lambda x: gdf_nodes.loc[x.user_id1]["lat_home"], axis = 1)
df_edges["lon_home1"] = df_edges.apply(lambda x: gdf_nodes.loc[x.user_id1]["lon_home"], axis = 1)
df_edges["lat_home2"] = df_edges.apply(lambda x: gdf_nodes.loc[x.user_id2]["lat_home"], axis = 1)
df_edges["lon_home2"] = df_edges.apply(lambda x: gdf_nodes.loc[x.user_id2]["lon_home"], axis = 1)

In [13]:
# add social connections as linestrings (just in case)
geoms = df_edges.apply(
    lambda x: shapely.geometry.LineString(
        [
            gdf_nodes.loc[x.user_id1]["geometry"],
            gdf_nodes.loc[x.user_id2]["geometry"]
        ]
    ), 
    axis = 1
)

gdf_edges = gpd.GeoDataFrame(
    df_edges
)
gdf_edges = gdf_edges.set_geometry(geoms)

### Save final outcomes as gpkg (jic) and as csv

In [14]:
gdf_nodes.head(3)

Unnamed: 0,cbsacode,geometry,user_id,lat_home,lon_home,tract_home
0,12420,POINT (-98.26857 30.03673),0,30.036734,-98.268568,14000US48209010803
1,12420,POINT (-98.24462 30.17087),1,30.170874,-98.244619,14000US48209010807
2,12420,POINT (-98.24225 30.08819),2,30.088194,-98.24225,14000US48209010803


In [15]:
gdf_edges.head(3)

Unnamed: 0,cbsacode,user_id1,user_id2,distance_m,lat_home1,lon_home1,lat_home2,lon_home2,geometry
0,12420,792,951,32738,29.911383,-97.33834,30.179204,-97.189035,"LINESTRING (-97.33834 29.91138, -97.18904 30.1..."
1,12420,453,189,25538,30.444387,-97.598463,30.359225,-97.856466,"LINESTRING (-97.59846 30.44439, -97.85647 30.3..."
2,12420,498,271,79553,30.678173,-97.569711,29.975901,-97.747485,"LINESTRING (-97.56971 30.67817, -97.74749 29.9..."


In [16]:
gdf_nodes.to_file("../data/twitter_dummy/twitter_users_tract_random.gpkg", index = False)
gdf_edges.to_file("../data/twitter_dummy/twitter_follower_edges_random.gpkg", index = False)

In [17]:
gdf_nodes.drop(columns = ["geometry"]).to_csv("../data/twitter_dummy/twitter_users_tract_random.csv", index = False)
gdf_edges.drop(columns = ["geometry"]).to_csv("../data/twitter_dummy/twitter_follower_edges_random.csv", index = False)