In [1]:
import os
os.environ['PROJ_LIB'] = '/Users/Jamie/Documents/CUMC Research/'

In [2]:
import numpy as np
import pandas as pd
from pyzipcode import ZipCodeDatabase

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

from mpl_toolkits.basemap import Basemap
from matplotlib import pyplot as plt

In [3]:
zcdb = ZipCodeDatabase()

In [4]:
crit = pd.read_csv('all_criteria.csv')
zip_codes = pd.read_csv('aact_trial_info.csv') \
                .assign(zip_codes=lambda z: z.zip_codes.str.split("|"))
zip_codes = pd.DataFrame({'nct_id':np.repeat(zip_codes.nct_id.values, zip_codes.zip_codes.str.len()),
                          'zip_code':np.concatenate(zip_codes.zip_codes.values)}) \
                .assign(zip_code=lambda row: row.zip_code.astype(str).str[:5])
#     .assign(zip_codes=lambda row: [z[0:5] for z in row.zip_codes]) \
#     .dropna(axis=0, subset=['zip_codes'])
#     .assign(lat=lambda row: [get_lats(row.zip_codes)])

In [5]:
def get_lats(codes):
    lats = []
    for code in codes:
        try:
            lats.append(zcdb[code].latitude)
        except:
            continue
    return lats

def get_lat(code):
    try:
        lat = zcdb[code].latitude
    except IndexError:
        lat = np.nan
    finally:
        return lat
    
def get_long(code):
    try:
        long = zcdb[code].longitude
    except IndexError:
        long = np.nan
    finally:
        return long

def get_city(code):
    count = np.nan
    try:
        count = zcdb[code].city
    except IndexError:
        count = np.nan
    finally:
        return count

def get_state(code):
    try:
        stat = zcdb[code].state
    except IndexError:
        stat = np.nan
    finally:
        return stat

def get_nearby_codes(code, radius=10):  # codes is a single zip code, radius is a number
    try:
        zips = [z.zip for z in zcdb.get_zipcodes_around_radius(code, radius)]
        final = '|'.join(zips)
    except:
        final = np.nan
    finally:
        return final

In [69]:
X = pd.DataFrame({'nct_id':np.repeat(info.nct_id.values, info.zip_codes.str.len()),
                  'zip_code':np.concatenate(info.zip_codes.values)})
def get_metrics(df):
    return df \
            .assign(lat=lambda row: row.zip_code.map(get_lat)) \
            .assign(long=lambda row: row.zip_code.map(get_long)) \
            .assign(city=lambda row: row.zip_code.map(get_city)) \
            .assign(state=lambda row: row.zip_code.map(get_state)) \
            .dropna()

In [100]:
fzip_codes  # get nct_id -> nct_id mapping

Unnamed: 0,nct_id,zip_code
0,NCT04323761,07102
1,NCT04323761,07450
2,NCT04323761,07666
3,NCT04323761,07728
4,NCT04323761,07740
...,...,...
559,NCT03808922,90024
560,NCT03808922,91010
561,NCT03808922,97239
562,NCT03808922,98105


In [53]:
Y = X.assign(nearby_zips=lambda row: row.zip_code.map(get_nearby_codes)) \
        .assign(nearby_zips=lambda row: row.nearby_zips.str.split("|"))
Z = pd.DataFrame({'nct_id': np.repeat(Y.nct_id.values, Y.nearby_zips.str.len()),
                  'zip_code': np.repeat(Y.zip_code.values, Y.nearby_zips.str.len()),
                  'nearby_zips': np.concatenate(Y.nearby_zips.values)})

ValueError: repeats may not contain negative values.

In [27]:
# from urllib.request import urlopen
# import json
# # with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
# #     counties = json.load(response)

# # import pandas as pd
# # df = pd.read_csv("https://raw.githubusercontent.com/plotly/datasets/master/fips-unemp-16.csv",
# #                    dtype={"fips": str})

# import plotly.express as px

# fig = px.choropleth(X, locations='state', color='lat', # geojson=states, 
#                            color_continuous_scale="Viridis",
#                            range_color=(0, 12),
#                            scope="usa",
#                            labels={'unemp':'unemployment rate'}
#                           )
# fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
# fig.show()

In [83]:
# Y = pd.get_dummies(crit[['nct_id', 'concept_name', 'domain', 'is_exclusion']] \
#                         .join(zip_codes[['nct_id']].set_index('nct_id'), on='nct_id', how='left'),
#                    columns=['concept_name']) \
#       .drop(labels=['domain'], axis=1)

Y = pd.get_dummies(crit[['nct_id', 'is_exclusion', 'concept_name']], columns=['concept_name'])

def get_aggregation(df):
    return df \
            .drop(labels=['is_exclusion'], axis=1) \
            .groupby(by=['nct_id'], axis=0) \
            .aggregate(func=np.sum)

exc = get_aggregation(Y[Y['is_exclusion'] == 1])
inc = get_aggregation(Y[Y['is_exclusion'] == 0])

Z = pd.merge(exc, inc, on='nct_id', how='outer', suffixes=('_exc', '_inc')) \
        .fillna(0)

assert exc.to_numpy().sum() + inc.to_numpy().sum() == crit.shape[0]  # still have same number of criteria?
assert Z.to_numpy().sum() == crit.shape[0]

In [135]:
def compute_similarity(a, b):
    x = Z.loc[a]
    y = Z.loc[b]
    return np.sum(x * y) / np.sum(x)

def similarity_main(series_a, series_b):  # not a good way to do this; takes a while
    temp = []
    for a, b in zip(series_a, series_b):
        try:
            temp.append(compute_similarity(a, b))
        except:
            temp.append(np.nan)
    return pd.Series(temp)

index = pd.MultiIndex.from_product([pd.unique(zip_codes.nct_id), pd.unique(zip_codes.nct_id)],
                                   names = ["trial_a", "trial_b"])
index = pd.DataFrame(index=index).reset_index()
index = index.assign(similarity = similarity_main(index.trial_a, index.trial_b))
#             .assign(similarity = lambda row: compute_similarity(row.trial_a, row.trial_b))

In [160]:
index.pivot('trial_a', 'trial_b', 'similarity')  # final

trial_b,NCT03808922,NCT04280705,NCT04283461,NCT04292730,NCT04292899,NCT04305457,NCT04306393,NCT04308668,NCT04311177,NCT04311697,...,NCT04348864,NCT04349202,NCT04349371,NCT04349410,NCT04349631,NCT04350073,NCT04350450,NCT04350476,NCT04350593,NCT04351620
trial_a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NCT03808922,,,,,,,,,,,...,,,,,,,,,,
NCT04280705,,1.210526,0.210526,0.210526,0.210526,0.052632,0.105263,0.105263,0.368421,0.105263,...,0.052632,,0.105263,0.052632,0.000000,0.105263,0.052632,0.052632,0.263158,0.210526
NCT04283461,,0.025478,1.789809,0.000000,0.000000,0.012739,0.000000,0.000000,0.050955,0.000000,...,0.000000,,0.025478,0.000000,0.012739,0.000000,0.000000,0.000000,0.012739,0.006369
NCT04292730,,0.444444,0.000000,1.222222,1.000000,0.222222,0.222222,0.222222,0.222222,0.222222,...,0.222222,,0.000000,0.222222,0.000000,0.222222,0.222222,0.222222,0.333333,0.222222
NCT04292899,,0.307692,0.000000,0.692308,1.153846,0.076923,0.076923,0.076923,0.153846,0.153846,...,0.076923,,0.000000,0.076923,0.000000,0.076923,0.076923,0.076923,0.307692,0.076923
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NCT04350073,,0.285714,0.000000,0.285714,0.142857,0.142857,0.285714,0.142857,0.000000,0.285714,...,0.142857,,0.000000,0.142857,0.000000,1.000000,0.142857,0.142857,0.285714,0.142857
NCT04350450,,0.055556,0.000000,0.111111,0.055556,0.222222,0.055556,0.111111,0.111111,0.055556,...,0.055556,,0.000000,0.055556,0.055556,0.055556,1.000000,0.055556,0.222222,0.500000
NCT04350476,,1.000000,0.000000,2.000000,1.000000,1.000000,1.000000,1.000000,0.000000,1.000000,...,1.000000,,0.000000,1.000000,0.000000,1.000000,1.000000,1.000000,1.000000,1.000000
NCT04350593,,0.151515,0.060606,0.090909,0.121212,0.060606,0.030303,0.060606,0.090909,0.030303,...,0.030303,,0.000000,0.030303,0.000000,0.060606,0.121212,0.030303,1.060606,0.151515


In [54]:
# import geopandas
# import shapely
# import shapefile
# import plotly
# from plotly.figure_factory import create_choropleth
# # import xlrd

# # import plotly.figure_factory as ff
# # import geopandas

# fips = ['06021', '06023', '06027',
#         '06029', '06033', '06059',
#         '06047', '06049', '06051',
#         '06055', '06061']
# values = range(len(fips))

# fig = create_choropleth(fips=fips, values=values)
# fig.layout.template = None
# fig.show()