#  Find Clusters of Infected People

In [None]:
import cudf
import cuml

import cupy as cp

Begin by loading the data you've received about week 1 of the outbreak into a cuDF data frame. The data is located at `'./data/week1.csv'`. For this notebook you will only need the `'lat'`, `'long'`, and `'infected'` columns. Either drop the columns after loading, or use the `cudf.read_csv` named argument `usecols` to provide a list of only the columns you need.

In [None]:
# Read only the three columns this analysis needs from the week 1 data.
week1_df = cudf.read_csv(
    './data/week1.csv',
    usecols=['lat', 'long', 'infected'],
)
display(week1_df.head())

Make a new cuDF data frame `infected_df` that contains only the infected members of the population.

In [None]:
# Keep only the rows flagged as infected; `.copy()` gives an independent frame
# so the columns added to it later do not touch `week1_df`.
infected_df = week1_df[
    week1_df['infected'] == True
].copy()
display(infected_df.head())

Provided for you in the next cell (which you can expand by clicking on the "..." and contract again after executing by clicking on the blue left border of the cell) is the lat/long to OSGB36 grid coordinates converter you used earlier in the workshop. Use this converter to create grid coordinate values stored in `northing` and `easting` columns of the `infected_df` you created in the last step.

In [None]:
# https://www.ordnancesurvey.co.uk/docs/support/guide-coordinate-systems-great-britain.pdf

def latlong2osgbgrid_cupy(lat, long, input_degrees=True):
    '''
    Project ellipsoidal latitude/longitude onto grid northing/easting values
    with a Transverse Mercator series expansion.

    Inputs:
    lat: latitude coordinate (N)
    long: longitude coordinate (E)
    input_degrees: when True (default) the inputs are taken to be degrees and
        are converted to radians first; when False they are used as radians

    Output:
    (northing, easting) tuple, in that order
    '''

    # All trigonometry below works in radians.
    if input_degrees:
        phi = lat * cp.pi/180
        lam = long * cp.pi/180
    else:
        phi, lam = lat, long

    # Ellipsoid semi-axes and squared eccentricity derived from them.
    semi_major = 6377563.396
    semi_minor = 6356256.909
    ecc_sq = (semi_major**2 - semi_minor**2) / semi_major**2

    # Projection constants.
    northing_origin = -100000 # northing of true origin
    easting_origin = 400000 # easting of true origin
    scale = .9996012717 # scale factor on central meridian
    phi_origin = 49 * cp.pi / 180 # latitude of true origin
    lam_origin = -2 * cp.pi / 180 # longitude of true origin and central meridian

    sin_phi = cp.sin(phi)
    cos_phi = cp.cos(phi)
    tan_phi = cp.tan(phi)
    tan_sq = tan_phi ** 2
    tan_4th = tan_phi ** 4

    # Offsets from (and sum with) the true origin, reused by the series terms.
    d_phi = phi - phi_origin
    s_phi = phi + phi_origin
    d_lam = lam - lam_origin

    n = (semi_major - semi_minor) / (semi_major + semi_minor)
    curvature_base = 1 - ecc_sq * sin_phi ** 2
    nu = semi_major * scale * curvature_base ** -.5
    rho = semi_major * scale * (1 - ecc_sq) * curvature_base ** -1.5
    eta_sq = nu / rho - 1

    # M is built from four alternating-sign terms in d_phi.
    m_term1 = (1 + n + 5/4 * (n**2 + n**3)) * d_phi
    m_term2 = (3*(n+n**2) + 21/8 * n**3) * cp.sin(d_phi) * cp.cos(s_phi)
    m_term3 = 15/8 * (n**2 + n**3) * cp.sin(2*(d_phi)) * cp.cos(2*(s_phi))
    m_term4 = 35/24 * n**3 * cp.sin(3*(d_phi)) * cp.cos(3*(s_phi))
    meridional = semi_minor * scale * (m_term1 - m_term2 + m_term3 - m_term4)

    # Coefficients of the even powers of d_lam (northing series).
    north_c0 = meridional + northing_origin
    north_c2 = nu/2 * sin_phi * cos_phi
    north_c4 = nu/24 * sin_phi * cos_phi ** 3 * (5 - tan_sq + 9 * eta_sq)
    north_c6 = nu/720 * sin_phi * cos_phi ** 5 * (61-58 * tan_sq + tan_4th)

    # Coefficients of the odd powers of d_lam (easting series).
    east_c1 = nu * cos_phi
    east_c3 = nu / 6 * cos_phi**3 * (nu/rho - tan_sq)
    east_c5 = nu / 120 * cos_phi ** 5 * (5 - 18 * tan_sq + tan_4th + 14 * eta_sq - 58 * tan_sq * eta_sq)

    northing = north_c0 + north_c2 * d_lam**2 + north_c4 * d_lam**4 + north_c6 * d_lam**6
    easting = easting_origin + east_c1 * d_lam + east_c3 * d_lam**3 + east_c5 * d_lam**5

    return northing, easting

In [None]:
# Convert lat/long to grid coordinates; the converter returns (northing, easting).
week1_grid = latlong2osgbgrid_cupy(infected_df['lat'], infected_df['long'])
infected_df['northing'], infected_df['easting'] = week1_grid
display(infected_df.head())

Use DBSCAN to find clusters of at least 25 infected people where no member is more than 2000m from at least one other cluster member. Create a new column in `infected_df` which contains the cluster to which each infected person belongs.

In [None]:
from cuml.cluster import DBSCAN

# eps is the maximum distance between two samples for one to be considered as in the neighborhood of the other.
# min_samples is the number of samples in a neighborhood for a point to be considered as a core point.
dbscan = DBSCAN(eps=2000, min_samples=25)

# `infected_df` was produced by filtering `week1_df`, so it keeps that frame's
# non-contiguous index. Assigning the labels as an index-aligned Series could
# misalign them (or introduce nulls) if the returned Series carries a fresh
# 0..n-1 index, which depends on the cuML version. Taking `.values` assigns the
# labels positionally, one per input row, which is always what we want here.
cluster_labels = dbscan.fit_predict(infected_df[['easting', 'northing']])
infected_df['cluster'] = cluster_labels.values
display(infected_df.head())

Use grouping to find the mean `northing` and `easting` values for each cluster identified above.

In [None]:
# Average grid position of the members of each cluster label.
# NOTE(review): DBSCAN implementations conventionally label noise points -1;
# if that label appears here its "center" is not a real cluster - confirm.
cluster_centers = (
    infected_df
    .groupby('cluster')[['northing', 'easting']]
    .mean()
)
display(cluster_centers)

Find the number of people in each cluster by counting the number of appearances of each cluster's label in the column produced by DBSCAN.

In [None]:
# Number of infected people carrying each cluster label.
cluster_counts = (
    infected_df['cluster']
    .value_counts()
)
display(cluster_counts)

# Identify Nearest Health Facilities

Begin by loading the `lat`, `long` and `infected` columns from `'./data/week2.csv'` into a cuDF data frame called `gdf`.

In [None]:
# Week 2 data: load just the position and infection-status columns.
gdf = cudf.read_csv(
    './data/week2.csv',
    usecols=['lat', 'long', 'infected'],
)
display(gdf.head())

For this step, your goal is to create an `all_med` cuDF data frame that contains the latitudes and longitudes of all the hospitals (data found at `'./data/hospitals.csv'`) and clinics (data found at `'./data/clinics.csv'`).

In [None]:
# Load both kinds of medical facility.
hospitals_df = cudf.read_csv('./data/hospitals.csv')
clinics_df = cudf.read_csv('./data/clinics.csv')

# Stack them into one frame; ignore_index=True renumbers the combined rows.
all_med = cudf.concat(
    [hospitals_df, clinics_df],
    ignore_index=True,
)
display(all_med.head())

Since we will be using the coordinates of those facilities, keep only those rows that are non-null in both  `Latitude` and `Longitude`.

In [None]:
# Discard facilities that are missing either coordinate.
all_med = all_med.dropna(
    subset=['Latitude', 'Longitude'],
)
display(all_med.head())

Provided for you in the next cell (which you can expand by clicking on the "...", and contract again after executing by clicking on the blue left border of the cell) is the lat/long to grid coordinates converter you have used earlier in the workshop. Use this converter to create grid coordinate values stored in `northing` and `easting` columns of the `all_med` data frame you created in the last step.

In [None]:
# Fit `cuml.NearestNeighbors` with `all_med`'s `northing` and `easting` values, using the named argument `n_neighbors` set to `1`, and save the model as `knn`.
# NOTE(review): this cell contains no code. `all_med` does not have `northing`/`easting`
# columns yet at this point; they are created in the next cell, and the fit itself
# is performed in a later cell.

In [None]:
# Add grid coordinates for every facility; the converter returns (northing, easting).
facility_grid = latlong2osgbgrid_cupy(all_med['Latitude'], all_med['Longitude'])
all_med['northing'], all_med['easting'] = facility_grid
display(all_med.head())

Save every infected member in `gdf` into a new dataframe called `infected_gdf`.

In [None]:
# Week 2 infected population, as an independent copy of the filtered rows.
infected_gdf = gdf[
    gdf['infected'] == True
].copy()
display(infected_gdf.head())

Create `northing` and `easting` values for `infected_gdf`.

In [None]:
# Add grid coordinates for each infected person; the converter returns (northing, easting).
week2_grid = latlong2osgbgrid_cupy(infected_gdf['lat'], infected_gdf['long'])
infected_gdf['northing'], infected_gdf['easting'] = week2_grid
display(infected_gdf.head())

Use `knn.kneighbors` with `n_neighbors=1` on `infected_gdf`'s `northing` and `easting` values. Save the return values in `distances` and `indices`.

In [None]:
from cuml.neighbors import NearestNeighbors

# Fit the nearest-neighbour model on the facilities' grid coordinates.
knn = NearestNeighbors(n_neighbors=1)
knn.fit(all_med[['northing', 'easting']])

# Query the single nearest facility for every infected person here, so that
# `distances` and `indices` exist before the following cells inspect them
# (otherwise a fresh Restart & Run All fails with a NameError on `indices`).
distances, indices = knn.kneighbors(infected_gdf[['northing', 'easting']], n_neighbors=1)

`indices`, returned from your use of `knn.kneighbors` immediately above, should map person indices to their closest clinic/hospital indices:

In [None]:
# Preview the first rows of the nearest-facility index table returned by `knn.kneighbors`.
indices.head()

Here you can print an infected individual's coordinates from `infected_gdf`:

In [None]:
# Show the first infected person (positional row 0), including their coordinates.
infected_gdf.iloc[0]

You should be able to use the mapped index for the nearest facility to see that the nearest facility is indeed at a nearby coordinate:

In [None]:
# Look up the facility that `indices` maps the first infected person (row 0 of
# `infected_gdf`) to, rather than a hard-coded row number that need not match.
# The neighbour indices are row positions in the data `knn` was fitted on
# (`all_med`), so positional `.iloc` is the right accessor.
nearest_facility_pos = int(indices.iloc[0, 0])
all_med.iloc[nearest_facility_pos]

In [None]:
# For every infected person, find the single closest facility: `distances` holds
# how far away it is, `indices` which row of the fitted facility data it is.
distances, indices = knn.kneighbors(
    infected_gdf[['northing', 'easting']],
    n_neighbors=1,
)

#  Identify Risk Factors for Infection

Begin by loading the data you've received about week 3 of the outbreak into a cuDF data frame. The data is located at `./data/week3.csv`. For this notebook you will need all columns of the data.

In [None]:
# Week 3 data: every column is needed, so no `usecols` restriction.
week3_df = cudf.read_csv(
    './data/week3.csv',
)
display(week3_df.head())

## Calculate Infection Rates by Employment Code

Convert the `infected` column to type `float32`. For people who are not infected, the float32 `infected` value should be `0.0`, and for infected people it should be `1.0`.

In [None]:
# Recast the infection flag as float32 so that averaging it later yields a rate.
week3_df['infected'] = (
    week3_df['infected']
    .astype('float32')
)
display(week3_df.head())

Now, produce a list of employment types and their associated **rates** of infection, sorted from highest to lowest rate of infection.

**NOTE**: The infection **rate** for each employment type should be the fraction of all individuals within that employment type who are infected. Therefore, if employment type "X" has 1000 people, and 10 of them are infected, the infection **rate** would be .01 (i.e. 1%). If employment type "Z" has 10,000 people, and 50 of them are infected, the infection rate would be .005 (i.e. 0.5%), and would be **lower** than for type "X", even though more people within that employment type were infected.

In [None]:
# The mean of the `infected` column within each employment code, listed from
# highest to lowest.
infection_rates_by_employment = (
    week3_df
    .groupby('employment')['infected']
    .mean()
    .sort_values(ascending=False)
)
display(infection_rates_by_employment)

Finally, read in the employment codes guide from `./data/code_guide.csv` to interpret which employment types are seeing the highest rates of infection.

In [None]:
# Load the employment code guide from disk and preview it.
employment_code_guide = cudf.read_csv(
    './data/code_guide.csv',
)
display(employment_code_guide.head())

## Calculate Infection Rates by Employment Code and Sex

We want to see if there is an effect of `sex` on infection rate, either in addition to `employment` or confounding it. Group by both `employment` and `sex` simultaneously to get the infection rate for the intersection of those categories.

In [None]:
# Same calculation as for employment alone, but grouped on every
# (employment, sex) combination; highest values first.
infection_rates_by_employment_and_sex = (
    week3_df
    .groupby(['employment', 'sex'])['infected']
    .mean()
    .sort_values(ascending=False)
)
display(infection_rates_by_employment_and_sex)