# 0. Functions

In [None]:
import numpy as np
from shapely.geometry import Point
from shapely.ops import cascaded_union
!pip install geopandas
import geopandas as gpd
import pandas as pd

import folium, folium.plugins
from folium import FeatureGroup, plugins, LayerControl, Map
from folium.plugins import FeatureGroupSubGroup

In [None]:
# Mount Google Drive (Colab only); later cells read their CSV files from
# paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
## Source Code: https://stackoverflow.com/questions/57000606/create-equally-spaced-coordinates-on-a-california-state-map-created-with-basemap
def generate_grid_in_polygon(spacing, polygon):
    ''' Generate a regular lattice of points that lie inside the given GeoDataFrame.

        Parameters
        ----------
        spacing : float
            Distance between neighbouring lattice points, in the coordinate units
            of ``polygon``. Must be positive.
        polygon : GeoDataFrame
            Its geometries are merged into one shape before the points are filtered.

        Returns
        -------
        pandas.DataFrame
            One row per lattice point with the columns ``geometry`` (shapely Point),
            ``Longitude`` (x), ``Latitude`` (y) and ``ID`` (0..n-1).

        Raises
        ------
        ValueError
            If ``spacing`` is not positive.
    '''
    if spacing <= 0:
        raise ValueError('spacing must be a positive number.')

    import numpy as np
    from shapely.geometry import Point
    from shapely.ops import unary_union

    # Merge all geometries into a single shape. unary_union supersedes the
    # deprecated cascaded_union.
    poly_in = unary_union(list(polygon.geometry))

    if poly_in.is_empty:
        # Nothing to cover; an empty shape has no usable bounds.
        list_of_points = []
    else:
        minx, miny, maxx, maxy = poly_in.bounds

        # The lattice is anchored at the integer-floored lower bounds, so the
        # coordinates are floor(min) + k * spacing.
        x_coords = np.arange(np.floor(minx), int(np.ceil(maxx)), spacing)
        y_coords = np.arange(np.floor(miny), int(np.ceil(maxy)), spacing)
        mesh_x, mesh_y = np.meshgrid(x_coords, y_coords)
        xs, ys = mesh_x.flatten(), mesh_y.flatten()

        # Cheap pre-filter: a point outside the bounding box can never be within
        # the shape, so skip the expensive geometric test for those.
        in_bbox = (xs >= minx) & (xs <= maxx) & (ys >= miny) & (ys <= maxy)
        candidates = (Point(x, y) for x, y in zip(xs[in_bbox], ys[in_bbox]))

        # Finally only keep the points within the polygon
        list_of_points = [point for point in candidates if point.within(poly_in)]

    # Transform into a normal dataframe with fields: geometry, Longitude, Latitude and ID
    list_of_points_gdf = gpd.GeoDataFrame(geometry=list_of_points)
    grid_points = pd.DataFrame(list_of_points_gdf)
    grid_points['Longitude'] = grid_points.geometry.apply(lambda p: p.x)
    grid_points['Latitude'] = grid_points.geometry.apply(lambda p: p.y)
    grid_points['ID'] = list(range(grid_points.shape[0]))

    return grid_points

In [None]:
def geo_scatter_plot(df=None, location=[40.767937,-73.982155], tiles="cartodbpositron"):
  '''Draw every row of `df` as a small blue circle on a folium base map.

  Parameters:
    df: DataFrame with numeric 'Latitude' and 'Longitude' columns (required).
    location: [latitude, longitude] the map is centred on.
    tiles: folium tile set used for the base map.

  Returns the folium map. Also prints how many rows were drawn.
  Raises ValueError when `df` is not supplied.
  '''
  if df is None:
    raise ValueError("df is required: pass a DataFrame with 'Latitude' and 'Longitude' columns.")

  base_map = folium.Map(location=location, tiles=tiles, control_scale=True, zoom_start=15)
  # Iterate over the frame that was passed in (not a notebook-level global), so the
  # markers always match the count printed below.
  for _, row in df.iterrows():
    lat, lon = row['Latitude'], row['Longitude']
    folium.CircleMarker([lat, lon], radius=0.001, color='blue',
                        popup='(' + str(lon) + ', ' + str(lat) + ')',
                        fill_color='#FD8A6C').add_to(base_map)
  print('There are '+str(df.shape[0])+' dots on the map.')
  return base_map

In [None]:
def df_cleaner(df, year=None):
    """Split the "Coordinates" field of a geocoded frame into "Latitude" and "Longitude".

    Rows whose "Coordinates" value is missing are dropped. Each remaining value is
    expected to be the text of a two-item literal such as "(40.7, -73.9)" or
    "[40.7, -73.9]"; item 0 becomes "Latitude" and item 1 becomes "Longitude".

    Parameters:
        df: DataFrame with a "Coordinates" column. It is not modified.
        year: label used only in the printed summary.

    Returns a new DataFrame. Also prints the shape of the result and the share of
    rows that were kept.
    """
    import ast

    # .copy() so the new columns are written to an independent frame, not a view.
    df_clean = df.dropna(axis=0, subset=['Coordinates']).copy()

    # literal_eval only parses Python literals, so text read from a file cannot
    # execute code the way eval() would. Already-parsed pairs are passed through.
    coords = [ast.literal_eval(c) if isinstance(c, str) else c
              for c in df_clean['Coordinates']]
    df_clean['Latitude'] = [c[0] for c in coords]
    df_clean['Longitude'] = [c[1] for c in coords]

    total = df.shape[0]
    # An empty input has no meaningful proportion; report 0.0 instead of dividing by zero.
    proportion = round(df_clean.shape[0]/total, 3) if total else 0.0

    print()
    print('Number of available data in', str(year)+':', df_clean.shape)
    print('Proportion of available data in', str(year)+':', proportion)
    print()
    return df_clean

In [None]:
def assign_cluster(df, grid_points):
  '''Assign every row of `df` to its closest cluster centroid (squared Euclidean distance).

  Parameters:
    df: DataFrame with numeric 'Latitude' and 'Longitude' columns. It is not modified.
    grid_points: DataFrame of centroids with 'Latitude', 'Longitude' and 'ID' columns.

  Returns a new DataFrame: the rows of `df` plus 'Latitude_Mag' / 'Longitude_Mag'
  (the coordinates rounded to three decimals) and 'Cluster' (the 'ID' of the nearest
  centroid). 'Cluster' is empty when the coordinates are missing, when there are no
  centroids, or when the nearest centroid is not closer than a squared distance of 100.
  Ties go to the centroid that appears first in `grid_points`.
  '''
  max_sq_dist = 100   # a centroid only wins if its squared distance is below this
  chunk_rows = 2048   # bounds the size of the temporary distance matrix

  # 1: round coordinates to three decimal places --> in order to reduce the number of
  #    calculations in later steps. assign() returns a new frame, so the caller's
  #    DataFrame is left untouched.
  df = df.assign(
      Latitude_Mag=df['Latitude'].apply(lambda x: round(x, 3)),
      Longitude_Mag=df['Longitude'].apply(lambda x: round(x, 3)),
  )
  df_temp = (df[['Latitude_Mag', 'Longitude_Mag']]
             .dropna()
             .drop_duplicates()
             .reset_index(drop=True))

  # 2: for each distinct rounded coordinate, find the nearest centroid. The distance
  #    matrix is built with NumPy one chunk of rows at a time instead of a Python
  #    loop over every (point, centroid) pair.
  grid_lat = grid_points['Latitude'].to_numpy(dtype=float)
  grid_lon = grid_points['Longitude'].to_numpy(dtype=float)
  grid_ids = grid_points['ID'].to_numpy()

  clusters = np.full(df_temp.shape[0], None, dtype=object)
  if grid_ids.size:
    pts_lat = df_temp['Latitude_Mag'].to_numpy(dtype=float)
    pts_lon = df_temp['Longitude_Mag'].to_numpy(dtype=float)
    for start in range(0, pts_lat.size, chunk_rows):
      stop = start + chunk_rows
      sq = ((pts_lat[start:stop, None] - grid_lat[None, :]) ** 2
            + (pts_lon[start:stop, None] - grid_lon[None, :]) ** 2)
      # A centroid with a missing coordinate can never be the nearest one.
      sq[np.isnan(sq)] = np.inf
      nearest = sq.argmin(axis=1)   # first minimum wins, like a strict '<' scan
      close = sq[np.arange(nearest.size), nearest] < max_sq_dist
      rows = np.arange(start, start + nearest.size)[close]
      clusters[rows] = grid_ids[nearest[close]]
  df_temp['Cluster'] = clusters

  # 3: merge df_temp with df, assign the cluster id to each data point
  df_new = pd.merge(df, df_temp, how='left', on=['Latitude_Mag', 'Longitude_Mag'])

  return df_new

In [None]:
def label_cluster(df, grid_points, bins=[100,1000,10000]): 
  '''Attach a record count and a size label to every cluster centroid.

  The records in `df` are counted per 'Cluster' (non-null 'RecordId' values) and
  joined onto `grid_points` by 'ID'. Each centroid then gets a 'Size' label:
  2 for counts below bins[0], 3 for [bins[0], bins[1]), and so on up to
  len(bins)+2 for counts of at least bins[-1]. Centroids without any records get
  Size 0.01 and RecordId 0.

  Returns the labelled copy of `grid_points`, or an error string when `bins` has
  fewer than two values.
  '''
  per_cluster = df.groupby('Cluster')[['RecordId']].count().reset_index()
  labelled = pd.merge(grid_points, per_cluster, how='left', left_on='ID', right_on='Cluster')

  if len(bins)<2:
    return 'Error: Bins should contain at least two values.'

  labelled['Size'] = [None] * grid_points.shape[0]
  counts = labelled['RecordId']
  last = len(bins)-1
  label = 2
  for idx, lower in enumerate(bins):
    if idx == last:
      # open-ended top class
      labelled.loc[counts >= lower, 'Size'] = label
    else:
      if idx == 0:
        # everything below the first threshold
        labelled.loc[counts < lower, 'Size'] = label
        label += 1
      upper = bins[idx+1]
      labelled.loc[(counts >= lower) & (counts < upper), 'Size'] = label
    label += 1

  # clusters with no data points:
  unmatched = labelled['Cluster'].isnull()
  labelled.loc[unmatched, 'Size'] = 0.01
  labelled.loc[unmatched, 'RecordId'] = 0

  return labelled

In [None]:
def geo_cluster_plot(df, grid_points, bins=[100,1000,10000], colors=['aliceblue', 'lightblue', 'skyblue', 'mediumblue', 'darkblue'],
                     location=[40.767937,-73.982155], tiles="cartodbpositron", name=None):
  '''Draw one circle per cluster centroid, sized and coloured by its record count.

  The centroids are labelled with label_cluster(df, grid_points, bins); the 'Size'
  label drives both the marker radius and its colour on a linear colour scale built
  from `colors`. When `name` is given the map is also saved as cluster_<name>.html.
  Returns the folium map.
  '''
  import branca
  import branca.colormap as cm

  palette = cm.LinearColormap(colors=colors, vmin=0.01, vmax=2+len(bins))
  centroids = label_cluster(df, grid_points, bins=bins)
  base_map = folium.Map(location=location, tiles=tiles, control_scale=True, zoom_start=13)

  for _, row in centroids.iterrows():
    size = row['Size']
    shade = palette(size)
    marker = folium.CircleMarker(
        [row['Latitude'], row['Longitude']],
        radius=2**np.log2(size),
        color=shade, fill_color=shade, fill_opacity=1,
        popup="There are "+str(row['RecordId'])+" residents in this cluster.")
    marker.add_to(base_map)
  palette.add_to(base_map)

  if name is None:
    return base_map

  base_map.save(r'cluster_'+name+'.html')
  print('Map saved.')
  return base_map

In [None]:
def geo_cluster_compare(df_compare, grid_points, bins=[100,1000,10000], colors=['aliceblue','lightblue','skyblue','mediumblue','darkblue'], 
                        location=[40.767937,-73.982155], tiles="cartodbpositron", name=None):
  '''Build one map with a selectable layer of cluster centroids per dataset.

  `df_compare` maps layer names to clustered DataFrames. When it is None, the six
  notebook-level frames df_<year>_clean (1940 ... 1880) are used under the names
  '<year>_population'. When `name` is given the map is also saved as
  cluster_<name>.html. Returns the folium map.
  '''
  if df_compare is None:
    df_compare = {}
    for year in [1940,1930,1920,1910,1900,1880]:
      # NOTE(review): eval is only used here to look up a notebook-level variable by name.
      df_compare[str(year)+'_population'] = eval('df_'+str(year)+'_clean')

  import branca
  import branca.colormap as cm
  palette = cm.LinearColormap(colors=colors, vmin=0.01, vmax=2+len(bins))

  # The tile layer lives in its own always-on group so that it is not listed in
  # the layer control.
  folium_map = folium.Map(location=location, tiles=None, zoom_start=13)
  tile_group = FeatureGroup(name='Base Map', overlay=True, control=False)
  folium.TileLayer(tiles=tiles).add_to(tile_group)
  tile_group.add_to(folium_map)

  for layer_name, dataset in df_compare.items():
    layer = FeatureGroup(name=layer_name, overlay=False)
    centroids = label_cluster(dataset, grid_points, bins=bins)
    for _, row in centroids.iterrows():
      size = row['Size']
      shade = palette(size)
      marker = folium.CircleMarker(
          [row['Latitude'], row['Longitude']],
          radius=2**np.log2(size),
          color=shade, fill_color=shade, fill_opacity=1,
          popup="There are "+str(row['RecordId'])+" residents around this cluster center.")
      marker.add_to(layer)
    layer.add_to(folium_map)

  palette.add_to(folium_map)
  folium.LayerControl(collapsed=False).add_to(folium_map)

  if name is None:
    return folium_map

  folium_map.save(r'cluster_'+name+'.html')
  print('Map saved.')
  return folium_map

In [None]:
def geo_field_compare(df, field, grid_points, bins=[100,1000,2000], colors=['lavender', 'thistle', 'mediumpurple', 'blueviolet', 'indigo'], 
                      location=[40.767937, -73.982155], tiles='cartodbpositron', name=None):
  '''Compare the cluster maps of the most frequent values of `field` in one dataset.

  Rows with a missing `field` are ignored. The (at most) ten values with the most
  non-null 'RecordId' entries each become one layer, and the layers are drawn by
  geo_cluster_compare with the remaining arguments passed through unchanged.

  `df` itself is not modified. Returns whatever geo_cluster_compare returns.
  '''
  # Filter into a new frame: an in-place dropna would permanently delete rows from
  # the caller's DataFrame and change the result of every later call on it.
  known = df.dropna(axis=0, subset=[field])
  counts = (known.groupby(field)[['RecordId']].count()
            .reset_index()
            .sort_values(by='RecordId', ascending=False))
  top_values = counts[field].tolist()[:10]

  d = {value: known[known[field]==value] for value in top_values}
  return geo_cluster_compare(d, grid_points, bins, colors, location, tiles, name)

In [None]:
def geo_compare_dual(d1, d2, grid_points, bins=[100,1000,10000], colors=['aliceblue','lightblue','skyblue','mediumblue','darkblue'], 
                     location=[40.767937,-73.982155], tiles="cartodbpositron", name=None):
  '''Build a side-by-side (dual) map: the layers of `d1` on the left, `d2` on the right.

  Parameters:
    d1, d2: dicts mapping layer names to clustered DataFrames.
    grid_points: cluster centroids, as expected by label_cluster.
    bins, colors: thresholds and colour scale for the centroid markers.
    location, tiles: centre and tile set of the map.
    name: required; the map is saved as cluster_<name>.html.

  Returns the DualMap, or None (after printing a hint) when `name` is missing.
  '''
  if name is None:
    print('This map is too large to be loaded directly in notebook. Please give a file name to be saved separately.')
    return

  import branca.colormap as cm
  colormap = cm.LinearColormap(colors=colors, vmin=0.01, vmax=2+len(bins))

  # Honour the caller's location and tiles instead of hard-coded values.
  dual_map = plugins.DualMap(location=location, tiles=None, zoom_start=13)
  base_map = FeatureGroup(name='Base Map', overlay=True, control=False)
  folium.TileLayer(tiles=tiles).add_to(base_map)
  base_map.add_to(dual_map)

  # Same layer-building steps for both panes: d1 -> left (m1), d2 -> right (m2).
  for datasets, pane in ((d1, dual_map.m1), (d2, dual_map.m2)):
    for layer_name, dataset in datasets.items():
      layer = FeatureGroup(name=layer_name, overlay=False)
      centroids = label_cluster(dataset, grid_points, bins=bins)
      for _, row in centroids.iterrows():
        size = row['Size']
        shade = colormap(size)
        folium.CircleMarker([row['Latitude'], row['Longitude']], radius=2**np.log2(size),
                            color=shade, fill_color=shade, fill_opacity=1,
                            popup="There are "+str(row['RecordId'])+" residents around this cluster center.").add_to(layer)
      layer.add_to(pane)

  # The colour bar is intentionally not added to the dual map (it was commented
  # out in the original cell as well).
  folium.LayerControl(collapsed=False).add_to(dual_map)

  dual_map.save(r'cluster_'+name+'.html')
  print('Map saved.')

  return dual_map

In [None]:
def geo_value_compare(field, value, grid_points, bins=[100,1000,2000], colors=['aliceblue', 'lightblue','skyblue','mediumblue', 'darkblue'], 
                      location=[40.767937, -73.982155], tiles='cartodbpositron', name=None):
  '''Compare where records with a given value of `field` are located, year by year.

  The data come from the notebook-level frames df_<year>_clean for the years
  1940, 1930, 1920, 1910, 1900 and 1880.

  Parameters:
    field: column to filter on.
    value: a single value -> one map with one layer per year (geo_cluster_compare);
           a list of exactly two values -> a dual map, one value per pane
           (geo_compare_dual, which requires `name`).
    Remaining arguments are passed through to the plotting function.

  Returns the map, or None (after printing a message) when `value` is a list whose
  length is not 2. Raises NameError when a df_<year>_clean frame is not defined.
  '''
  year_list = [1940,1930,1920,1910,1900,1880]

  def _clean_frame(year):
    # Look the frame up by name in the notebook namespace (no eval needed).
    var = 'df_'+str(year)+'_clean'
    try:
      return globals()[var]
    except KeyError:
      raise NameError("name '"+var+"' is not defined") from None

  def _layers(single_value):
    # str() lets non-string values (e.g. numeric codes) be used in the layer name.
    layers = {}
    for year in year_list:
      df = _clean_frame(year)
      layers[str(year)+'_'+field+' is '+str(single_value)] = df[df[field]==single_value]
    return layers

  if not isinstance(value, list):
    return geo_cluster_compare(_layers(value), grid_points, bins, colors, location, tiles, name)

  if len(value) != 2:
    print('Error of field values. Only 2 inputs in a value list allowed.')
    return

  d1 = _layers(value[0])
  d2 = _layers(value[1])
  return geo_compare_dual(d1, d2, grid_points, bins, colors, location, tiles, name)

# 1. Preparation

## 1.1 Generate cluster centroids

In [None]:
# Load the NYC borough boundaries ('nybb') that ship with GeoPandas, reproject them
# to EPSG:4326 (longitude/latitude in degrees) and keep only the Manhattan borough.
# NOTE(review): `gpd.datasets` was deprecated in GeoPandas 0.14 and removed in 1.0;
# the install at the top of the notebook is unpinned, so confirm it still provides it.
df_nybb = gpd.read_file(gpd.datasets.get_path('nybb'))
df_nybb = df_nybb.to_crs(epsg=4326)
manhattan_map = df_nybb[df_nybb['BoroName']=='Manhattan']

In [None]:
# Build the cluster centroids: a lattice inside Manhattan with a spacing of 0.002
# in the layer's coordinate units (degrees, since the layer is in EPSG:4326).
grid_points = generate_grid_in_polygon(0.002, manhattan_map)
grid_points.head()

Unnamed: 0,geometry,Longitude,Latitude,ID
0,POINT (-74.02400 40.68400),-74.024,40.684,0
1,POINT (-74.01200 40.68400),-74.012,40.684,1
2,POINT (-74.02600 40.68600),-74.026,40.686,2
3,POINT (-74.02400 40.68600),-74.024,40.686,3
4,POINT (-74.02200 40.68600),-74.022,40.686,4


In [None]:
# Plot the generated centroids on a base map.
geo_scatter_plot(grid_points)

- Each blue dot on the map represents a "cluster centroid"
- There are 1574 "cluster centroids" in Manhattan
- **Notice**: the points are not perfectly evenly spaced

## 1.2 Load Raw Data

In [None]:
path = '/content/drive/MyDrive/geocoded_clean_data/'
# Read the six geocoded census extracts (one CSV per census year) from Drive.
df_1940, df_1930, df_1920, df_1910, df_1900, df_1880 = (
    pd.read_csv(path + 'geocoded_census_' + str(census_year) + '.csv')
    for census_year in (1940, 1930, 1920, 1910, 1900, 1880)
)

In [None]:
# Clean each raw frame in turn (newest first); df_cleaner prints a short summary
# for every year.
(df_1940_clean, df_1930_clean, df_1920_clean,
 df_1910_clean, df_1900_clean, df_1880_clean) = (
    df_cleaner(raw_frame, census_year)
    for raw_frame, census_year in (
        (df_1940, 1940), (df_1930, 1930), (df_1920, 1920),
        (df_1910, 1910), (df_1900, 1900), (df_1880, 1880),
    )
)

## 1.3 Assign Clusters

After customizing our cluster centroids (stored in *grid_points*), we need to assign each data point in the dataset to its closest cluster centroid.
<br>
**This step is slow (about 8–10 minutes per dataset).** Once it has finished, I recommend saving the resulting data frames to a local or cloud drive so that you do not have to wait for the computation again next time.

In [None]:
# Add the nearest-centroid ID (column 'Cluster') to every 1940 record.
df_1940_clean = assign_cluster(df_1940_clean, grid_points)

In [None]:
# Add the nearest-centroid ID (column 'Cluster') to every 1930 record.
df_1930_clean = assign_cluster(df_1930_clean, grid_points)

In [None]:
# Add the nearest-centroid ID (column 'Cluster') to every 1920 record.
df_1920_clean = assign_cluster(df_1920_clean, grid_points)

In [None]:
# Add the nearest-centroid ID (column 'Cluster') to every 1910 record.
df_1910_clean = assign_cluster(df_1910_clean, grid_points)

In [None]:
# Add the nearest-centroid ID (column 'Cluster') to every 1900 record.
df_1900_clean = assign_cluster(df_1900_clean, grid_points)

In [None]:
# Add the nearest-centroid ID (column 'Cluster') to every 1880 record.
df_1880_clean = assign_cluster(df_1880_clean, grid_points)

**Save the clustered data frames to separate files so that the computation does not have to be repeated next time**

In [None]:
#path = '/content/drive/MyDrive/Colab Notebooks/'
#year_list = [1940, 1930, 1920, 1910, 1900, 1880] 
#for year in year_list:
#  eval('df_'+str(year)+'_clean').to_csv(path + 'df_'+str(year)+'_clean.csv', index=False)

In [None]:
path = '/content/drive/MyDrive/Maps_Functions/df_clean/'
# Reload the previously saved, already clustered frames; this replaces the
# df_<year>_clean variables created by the cells above.
(df_1940_clean, df_1930_clean, df_1920_clean,
 df_1910_clean, df_1900_clean, df_1880_clean) = (
    pd.read_csv(path + 'df_' + str(census_year) + '_clean.csv')
    for census_year in (1940, 1930, 1920, 1910, 1900, 1880)
)

# 2. Use Cases

## 2.1 Cluster Plot

In [None]:
# Cluster map for 1930. Passing name='1930' also saves it as cluster_1930.html;
# the trailing `m` displays the map inline.
m = geo_cluster_plot(df_1930_clean, grid_points, bins=[100, 1000, 10000], name='1930')
m

As we can see from the map:
- Each blue dot represents a "cluster centroid"
- Sizes and colors indicate the number of data points "in" a cluster, i.e. the cluster is the closest to these points. The larger the size and the deeper the color, the more data points a cluster contains.
  - Sizes and colors are categorized into certain degrees, as is shown on the upper-right color bar.
  - You can customize the degrees by setting the parameter **bins**. Each value in **bins** is a threshold that separates the clusters by the number of data points they have. (The default value of **bins** is [100,1000,10000], which means clusters are separated into *below 100, 100-1000, 1000-10000, and 10000 or above*.)
  - You can also customize the **colors** shown on the map. The **colors** parameter should contain at least two values.
- By clicking on a cluster centroid, you will see a pop-up message showing the exact number of data points (residents) in this cluster.
- If you want to save the map as a separate file, you should give **name** parameter a value, which will be the name of a .html file.

In [None]:
# Same plot for 1880 with the default bins; also saved as cluster_1880.html.
geo_cluster_plot(df_1880_clean, grid_points, name='1880')

## 2.2 Compare Cluster Maps

In [None]:
# Layer name -> clustered frame, newest census first.
df_compare = {
    str(census_year) + '_population': clustered_frame
    for census_year, clustered_frame in zip(
        (1940, 1930, 1920, 1910, 1900, 1880),
        (df_1940_clean, df_1930_clean, df_1920_clean,
         df_1910_clean, df_1900_clean, df_1880_clean),
    )
}

In [None]:
# One selectable layer per census year; also saved as cluster_all.html.
geo_cluster_compare(df_compare, grid_points, name='all')

To compare several maps with a similar structure more easily, you can pass a dictionary to the **df_compare** parameter; its keys become the layer names and its values are the datasets.
<br>
As we can see from the map:
- From 1880 to the 1900s, it is obvious that residents were moving from the lower (southern) part to the upper (northern) part.
- **Notice**: the number of data points shown on the map cannot fully represent the real Census data, since some of the records could not be geocoded and therefore could not be mapped. Hence an "increase" or "decrease" in data points on the map across different years does not necessarily indicate the same change in the actual population.
- Besides the trend of the population moving from south to north, we can see that there are some "population centers" among the clusters, such as Chinatown in the southeast.

## 2.3 Compare Different Values in Specified Field

In [None]:
# One layer per 'Race' value in the 1930 data (at most the ten most frequent
# values); also saved as cluster_compare_race.html.
geo_field_compare(df_1930_clean, 'Race', grid_points, bins=[100, 1000, 2000], name='compare_race')

As we can see:
- White and Black residents have obvious population centers (in 1930). Since White residents make up the majority of the overall population, the population centers shown in the 'White' layer are quite similar to those of the overall 1930 dataset.
- In comparison, although Black residents are the second largest group, their population centers are mainly in the northern part.
- Chinese residents mainly live around Chinatown. For the remaining racial groups, there are no distinct population centers.

In [None]:
# One layer per 'Birth Country DV1' value in the 1930 data (at most the ten most
# frequent values); also saved as cluster_compare_birth.html.
geo_field_compare(df_1930_clean, 'Birth Country DV1', grid_points, bins=[100, 1000, 10000], name='compare_birth')

As we can see:
- Residents born in the U.S. make up the majority of the overall population, hence their population centers are similar to those of the 1930 dataset.
- Residents born in Italy have distinct population centers in the northeast part, while residents born in Ireland tend to live near the two sides of the central part. Residents from both Russia and Poland have population centers around Chinatown.

In [None]:
# Same comparison for 1880. No `name` is given, so the map is only returned for
# inline display and is not saved to a file.
geo_field_compare(df_1880_clean, 'Race', grid_points, bins=[100, 1000, 2000])

## 2.4 Compare Specified Value by Time

In [None]:
# Records with Race == 'Black', one layer per census year; also saved as
# cluster_race_black.html.
geo_value_compare('Race', 'Black', grid_points, bins=[100, 1000, 2000], name='race_black')

As we compare the population of Black residents across different times, we can see that starting from the 1920s, Black residents began to form population centers in the north.

In [None]:
# A list of two values produces a side-by-side dual map (one value per pane).
# This map is too big to be loaded in the notebook, so geo_compare_dual requires
# `name` and saves the result as cluster_race_black_asian.html.
geo_value_compare('Race', ['Black', 'Asian'], grid_points, name='race_black_asian')

Map saved.


# Appendix.