# TIGER/Lines data

We use these to get road names and address details.

https://www.census.gov/geo/maps-data/data/tiger-line.html

You need to know the [county](https://en.wikipedia.org/wiki/County_%28United_States%29) to download.


There are three types of file which are of interest to us:

### Roads

These are long segments of roads, with a name, and very little other information.  Often the same (part of) a line appears twice with different names.

### Edges

These are short segments of roads, alleyways etc. with names (mostly) and address informtion (mostly).  The start/end points of each segment form the natural intersection points of the "street network".  Does not contain ambiguous information.

### AddrFeat

Contains similar data to `Edges` but with more address detail.  I have not found a way to use this extra detail.


## Processing

We have found it most useful to import the data into an `open_cp` Graph container.

In [1]:
import geopandas as gpd
import os, collections, lzma, json, bz2, csv
import numpy as np

import open_cp.network
import open_cp.geometry

import pyproj
proj = pyproj.Proj({"init":"EPSG:2768"})

# San Francisco

### Road data

In [8]:
#tiger_path = os.path.join("/media", "disk", "TIGER Data")
tiger_path = os.path.join("..", "..", "..", "..", "Data", "TIGER Data")
filename = os.path.join(tiger_path, "tl_2016_06075_roads__san_francisco")
sanfran_roads = gpd.GeoDataFrame.from_file(filename)
sanfran_roads = sanfran_roads.to_crs({"init":"EPSG:2768"})
sanfran_roads.head()

Unnamed: 0,FULLNAME,LINEARID,MTFCC,RTTYP,geometry
0,Hwy 101 S Off Rmp,110498938773,S1400,M,LINESTRING (1832211.611075589 640258.304130751...
1,Hwy 101 N on Rmp,110498937425,S1400,M,LINESTRING (1826154.294312381 646875.714636406...
2,Ludlow Aly - No Acc,1103660229533,S1780,M,LINESTRING (1827299.763214375 639261.626965801...
3,25th Ave N,1103666896385,S1400,M,LINESTRING (1825113.024921212 644969.067008104...
4,Willard N,1103689702566,S1400,M,LINESTRING (1827616.018999128 643655.490359187...


In [9]:
all_nodes = []
for geo in sanfran_roads.geometry:
    for pt in geo.coords:
        all_nodes.append(pt)
        
b = open_cp.network.PlanarGraphNodeOneShot(all_nodes)
name_lookup = collections.defaultdict(set)
for geo, name in zip(sanfran_roads.geometry, sanfran_roads.FULLNAME):
    path = list(geo.coords)
    for e in b.add_path(path):
        name_lookup[e].add(name)

b.remove_duplicate_edges()
graph = b.build()

In [10]:
edge_name_lookup = collections.defaultdict(set)
for (v1,v2), names in name_lookup.items():
    e, _ = graph.find_edge(v1, v2)
    edge_name_lookup[e].update(names)
assert list(range(graph.number_edges)) == list(edge_name_lookup)
graph.number_edges

34961

### Duplicate names

In [11]:
duplicates = { e : names for e, names in edge_name_lookup.items() if len(names) > 1}
duplicates[0], duplicates[478]

({'Bay Shore Blvd', 'Hwy 101 S Off Rmp'}, {'Kirkwood Ave', 'Kirkwood Ct'})

### Edge data

In [13]:
#tiger_path = os.path.join("/media", "disk", "TIGER Data")
tiger_path = os.path.join("..", "..", "..", "..", "Data", "TIGER Data")
filename = os.path.join(tiger_path, "tl_2016_06075_edges__san_francisco")
sanfran_edges = gpd.GeoDataFrame.from_file(filename)
sanfran_edges = sanfran_edges.to_crs({"init":"EPSG:2768"})
sanfran_edges.head()

Unnamed: 0,ARTPATH,COUNTYFP,DECKEDROAD,DIVROAD,EXTTYP,FEATCAT,FULLNAME,GCSEFLG,HYDROFLG,LFROMADD,...,STATEFP,TFIDL,TFIDR,TLID,TNIDF,TNIDT,TTYP,ZIPL,ZIPR,geometry
0,,75,,,B,,,N,N,,...,6,258290104.0,258290103.0,635284393,1822366,423213731,,,,LINESTRING (1826382.879046002 636381.349933896...
1,,75,,,B,,,N,N,,...,6,212689366.0,212682954.0,192292313,1816511,1816516,,,,LINESTRING (1828369.084484171 636443.246117267...
2,N,75,N,,N,S,Liebig St,N,N,19.0,...,6,212688818.0,212682951.0,192296942,1821210,1816404,,94112.0,94112.0,LINESTRING (1827552.630380671 635906.235163999...
3,,75,,,N,,,N,N,,...,6,262726237.0,261035014.0,644186113,1823215,1816540,,,,LINESTRING (1828055.451165403 635878.422880249...
4,N,75,N,,N,S,Mount Vernon Ave,N,N,,...,6,212689874.0,212682966.0,192286502,1816727,1816722,,,,LINESTRING (1828638.607432603 636673.755262291...


In [14]:
all_nodes = []
for geo in sanfran_edges.geometry:
    for pt in geo.coords:
        all_nodes.append(pt)
        
b = open_cp.network.PlanarGraphNodeOneShot(all_nodes)
name_lookupe = collections.defaultdict(set)
for index, row in sanfran_edges.iterrows():
    geo = row.geometry
    path = list(geo.coords)
    data = (row.FULLNAME, row.LFROMADD, row.LTOADD, row.RFROMADD, row.RTOADD)
    for e in b.add_path(path):
        name_lookupe[e].add(data)

b.remove_duplicate_edges()
graphe = b.build()

In [15]:
edge_name_lookupe = collections.defaultdict(set)
for (v1,v2), names in name_lookupe.items():
    e, _ = graphe.find_edge(v1, v2)
    edge_name_lookupe[e].update(names)
assert list(range(graphe.number_edges)) == list(edge_name_lookupe)
graphe.number_edges

48915

In [16]:
assert all(len(names)==1 for names in edge_name_lookupe.values())

## Combine data?

The following checks that the name the `edges` dataset gives to each graph edge is at least one of the choices offered by the `roads` dataset.  We take this as the "canonical" name for that edge.

In [17]:
# Cheat and use the builder; turns out all vertices are shared...
graph_to_graphe_vertex_lookup = {v : b._lookup[pt] for v, pt in graph.vertices.items()}

In [18]:
canonical_names_lookup = dict()
for e, (v1,v2) in enumerate(graph.edges):
    ee, _ = graphe.find_edge(graph_to_graphe_vertex_lookup[v1], graph_to_graphe_vertex_lookup[v2])
    addresses = list(edge_name_lookupe[ee])
    assert len(addresses) == 1
    address = addresses[0]
    # `address' is the block name given by the `edges` dataset
    if not address[0] in edge_name_lookup[e]:
        print(e, edge_name_lookup[e], ee, edge_name_lookupe[ee])
    canonical_names_lookup[e] = address

# Library code

In [2]:
import os, sys
sys.path.insert(0, os.path.abspath(".."))

In [3]:
import open_cp.logger
open_cp.logger.log_to_true_stdout("impute")
import impute.tiger_lines

In [4]:
tigerdir = os.path.join("..", "..", "..", "..", "Data", "TIGER Data")
tl = impute.tiger_lines.TigerLines(os.path.join(tigerdir, "tl_2016_06075_roads__san_francisco"),
                             os.path.join(tigerdir, "tl_2016_06075_edges__san_francisco"), proj)

In [5]:
tl.check_null_edges()

In [6]:
tl.name_in_roads_only

[]

In [7]:
tl.roads_graph.number_edges, tl.edges_graph.number_edges

(34961, 48915)

In [11]:
duplicates = { i : names for i, (_, names) in enumerate(tl.merged_names) if len(names) > 1 }
duplicates[38], duplicates[4265]

({'Cesar Chavez', 'Cesar Chavez St'}, {'US Hwy 101', 'Van Ness Ave'})

In [12]:
tl.to_geodataframe().head()

Unnamed: 0,left_add_from,left_add_to,right_add_from,right_add_to,name0,name1,name2,geometry
0,,,,,,,,LINESTRING (1826382.879046002 636381.349933896...
1,,,,,,,,LINESTRING (1828369.084484171 636443.246117267...
2,19.0,99.0,22.0,98.0,Liebig St,,,LINESTRING (1827552.630380671 635906.235163999...
3,,,,,,,,LINESTRING (1828055.451165403 635878.422880249...
4,,,,,,,,LINESTRING (1828040.824543123 635879.060763354...


# Does correlating between points and this data help?

- Obtain the same sort of spelling mistakes
- Lots and lots of problems.
  - Some, I think, will be due to crime events placed at the intersection of streets: it's possible that projecting to
  the closest edge will fail here.

In [12]:
datadir = os.path.join("/media", "disk", "Data")
filename = os.path.join(datadir, "SF_Police_Department_Incidents.csv.bz2")

def gen():
    with bz2.open(filename, "rt", encoding="UTF8") as f:
        reader = csv.reader(f)
        yield from reader

In [13]:
block_to_points = collections.defaultdict(set)
rows = gen()
next(rows)
data = []
for row in rows:
    block = row[8]
    x, y = float(row[9]), float(row[10])
    if np.abs(y-90) < 1e-5:
        continue
    block_to_points[block].add((x, y))
block_to_points = {block : [proj(x,y) for x,y in pts] for block, pts in block_to_points.items()}

In [14]:
data = []
for block in block_to_points:
    for x,y in block_to_points[block]:
        (v1, v2), t = graph.project_point_to_graph(x, y)
        e, _ = graph.find_edge(v1, v2)
        data.append((block, e))

In [15]:
postfix = {" AV":" AVE", "BL":"BLVD"}
change = {" DEBOOM ":" DE BOOM ", " RHODEISLAND ":" RHODE ISLAND ",
         " VANNESS ":" VAN NESS ", " SGTJOHNVYOUNG ":" SERGEANT JOHN V YOUNG ",
         " CHARTEROAK ":" CHARTER OAK ", "LOSPALMOS ":"LOS PALMOS "}

def norm_one(block):
    for f,t in postfix.items():
        if block.endswith(f):
            block = block[:-len(f)] + t
    for f,t in change.items():
        block = block.replace(f, t)
    return block

def normalise(block):
    if " / " in block:
        parts = block.split(" / ")
        return " / ".join(norm_one(p) for p in parts)
    else:
        return norm_one(block)

assert normalise('700 Block of KIRKWOOD AV') == '700 Block of KIRKWOOD AVE'
assert normalise('NEWCOMB AV / MENDELL ST') == 'NEWCOMB AVE / MENDELL ST'

toskip = 9
for block, edge in data:
    block = normalise(block)
    names = {s.upper() for s in edge_name_lookup[edge]}
    if not any(n in block for n in names):
        toskip -= 1
        if toskip < 0:
            print(block, names)
            raise AssertionError()

GREAT HY / LINCOLN WY {'GREAT HWY'}


AssertionError: 