In [2]:
# import os
# os.environ['PROJ_LIB'] = '/Users/Jamie/Documents/CUMC Research/'

In [3]:
import numpy as np
import pandas as pd
from pyzipcode import ZipCodeDatabase

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

from mpl_toolkits.basemap import Basemap
from matplotlib import pyplot as plt

In [4]:
# Shared ZIP-code lookup handle. The get_* helpers defined later in this notebook
# index into it (zcdb[code]) and call zcdb.get_zipcodes_around_radius(...).
zcdb = ZipCodeDatabase()

In [23]:
# Load the two input tables. Paths are relative to the notebook's working directory.
#   crit      - later cells read its nct_id, is_exclusion and concept_name columns
#   zip_codes - later cells read its nct_id column
crit = pd.read_csv('data/all_criteria_for_azure.csv')
zip_codes = pd.read_csv('data/aact_trial_info_for_azure.csv')
# Unused draft, kept for reference: splits a pipe-delimited zip_codes column into one
# row per (nct_id, zip_code) and truncates each code to its first 5 characters.
# NOTE(review): if this is revived, ZIP codes with leading zeros presumably need
# dtype=str in read_csv to survive parsing - confirm against the CSV.
#                 .assign(zip_codes=lambda z: z.zip_codes.str.split("|"))
# zip_codes = pd.DataFrame({'nct_id':np.repeat(zip_codes.nct_id.values, zip_codes.zip_codes.str.len()),
#                           'zip_code':np.concatenate(zip_codes.zip_codes.values)}) \
#                 .assign(zip_code=lambda row: row.zip_code.astype(str).str[:5])
# zip_codes = pd.DataFrame({'nct_id': np.repeat(zip_codes.nct_id)})
#     .assign(zip_codes=lambda row: [z[0:5] for z in row.zip_codes]) \
#     .dropna(axis=0, subset=['zip_codes'])
#     .assign(lat=lambda row: [get_lats(row.zip_codes)])

In [24]:
def get_lats(codes):
    """Return the latitudes of every ZIP code in ``codes`` that can be looked up.

    Parameters
    ----------
    codes : iterable
        ZIP codes; each one is used as a key into the module-level ``zcdb``.

    Returns
    -------
    list
        ``zcdb[code].latitude`` for each code whose lookup succeeded, in input
        order. Codes whose lookup fails are skipped, so the result may be
        shorter than ``codes``.
    """
    lats = []
    for code in codes:
        try:
            lats.append(zcdb[code].latitude)
        except Exception:
            # Best-effort: skip codes that cannot be resolved. Catching
            # Exception rather than using a bare ``except`` lets
            # KeyboardInterrupt / SystemExit through, so the loop can still
            # be interrupted.
            continue
    return lats

def get_lat(code):
    """Return the latitude stored for a single ZIP code, or NaN if it is unknown.

    Parameters
    ----------
    code
        Key used to index the module-level ``zcdb``.

    Returns
    -------
    ``zcdb[code].latitude``, or ``np.nan`` when the lookup raises ``IndexError``.

    Any other exception propagates unchanged. (Returning from a ``finally``
    clause would replace it with an UnboundLocalError, hiding the real cause.)
    """
    try:
        return zcdb[code].latitude
    except IndexError:
        # The lookup signalled "no such ZIP code".
        return np.nan
    
def get_long(code):
    """Return the longitude stored for a single ZIP code, or NaN if it is unknown.

    Parameters
    ----------
    code
        Key used to index the module-level ``zcdb``.

    Returns
    -------
    ``zcdb[code].longitude``, or ``np.nan`` when the lookup raises ``IndexError``.

    Any other exception propagates unchanged. (Returning from a ``finally``
    clause would replace it with an UnboundLocalError, hiding the real cause.)
    """
    try:
        return zcdb[code].longitude
    except IndexError:
        # The lookup signalled "no such ZIP code".
        return np.nan

def get_city(code):
    """Return the city stored for a single ZIP code, or NaN if it cannot be resolved.

    Parameters
    ----------
    code
        Key used to index the module-level ``zcdb``.

    Returns
    -------
    ``zcdb[code].city``, or ``np.nan`` when the lookup fails with any ordinary
    exception (``IndexError`` included).

    Notes
    -----
    The lookup stays best-effort for ordinary errors, but the value is no
    longer returned from a ``finally`` clause: that form also discarded
    KeyboardInterrupt / SystemExit, which now propagate.
    """
    try:
        return zcdb[code].city
    except Exception:
        return np.nan

def get_state(code):
    """Return the state stored for a single ZIP code, or NaN if it is unknown.

    Parameters
    ----------
    code
        Key used to index the module-level ``zcdb``.

    Returns
    -------
    ``zcdb[code].state``, or ``np.nan`` when the lookup raises ``IndexError``.

    Any other exception propagates unchanged. (Returning from a ``finally``
    clause would replace it with an UnboundLocalError, hiding the real cause.)
    """
    try:
        return zcdb[code].state
    except IndexError:
        # The lookup signalled "no such ZIP code".
        return np.nan

def get_nearby_codes(code, radius=10):
    """Return the ZIP codes around ``code`` as one pipe-delimited string.

    Parameters
    ----------
    code
        A single ZIP code (not a collection of them).
    radius : number, default 10
        Passed through unchanged to ``zcdb.get_zipcodes_around_radius``.

    Returns
    -------
    str or float
        The ``zip`` attribute of every record returned by
        ``zcdb.get_zipcodes_around_radius(code, radius)``, joined with ``"|"``.
        ``np.nan`` if the lookup or the join fails with an ordinary exception.

    Notes
    -----
    Only ``Exception`` is swallowed, so KeyboardInterrupt / SystemExit still
    propagate and a long ``.map(get_nearby_codes)`` can be interrupted.
    """
    try:
        nearby = zcdb.get_zipcodes_around_radius(code, radius)
        return '|'.join(z.zip for z in nearby)
    except Exception:
        return np.nan

In [25]:
# X = pd.DataFrame({'nct_id':np.repeat(info.nct_id.values, info.zip_codes.str.len()),
#                   'zip_code':np.concatenate(info.zip_codes.values)})
# def get_metrics(df):
#     return df \
#             .assign(lat=lambda row: row.zip_code.map(get_lat)) \
#             .assign(long=lambda row: row.zip_code.map(get_long)) \
#             .assign(city=lambda row: row.zip_code.map(get_city)) \
#             .assign(state=lambda row: row.zip_code.map(get_state)) \
#             .dropna()

In [26]:
# fzip_codes  # get nct_id -> nct_id mapping

In [27]:
# Y = X.assign(nearby_zips=lambda row: row.zip_code.map(get_nearby_codes)) \
#         .assign(nearby_zips=lambda row: row.nearby_zips.str.split("|"))
# Z = pd.DataFrame({'nct_id': np.repeat(Y.nct_id.values, Y.nearby_zips.str.len()),
#                   'zip_code': np.repeat(Y.zip_code.values, Y.nearby_zips.str.len()),
#                   'nearby_zips': np.concatenate(Y.nearby_zips.values)})

In [28]:
# from urllib.request import urlopen
# import json
# # with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
# #     counties = json.load(response)

# # import pandas as pd
# # df = pd.read_csv("https://raw.githubusercontent.com/plotly/datasets/master/fips-unemp-16.csv",
# #                    dtype={"fips": str})

# import plotly.express as px

# fig = px.choropleth(X, locations='state', color='lat', # geojson=states, 
#                            color_continuous_scale="Viridis",
#                            range_color=(0, 12),
#                            scope="usa",
#                            labels={'unemp':'unemployment rate'}
#                           )
# fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
# fig.show()

In [29]:
# Y = pd.get_dummies(crit[['nct_id', 'concept_name', 'domain', 'is_exclusion']] \
#                         .join(zip_codes[['nct_id']].set_index('nct_id'), on='nct_id', how='left'),
#                    columns=['concept_name']) \
#       .drop(labels=['domain'], axis=1)

# One-hot encode concept_name (pandas names the new columns concept_name_<value>),
# keeping nct_id and is_exclusion alongside. Presumably one row per criterion - the
# sanity asserts further down compare the dummy totals against crit.shape[0].
Y = pd.get_dummies(crit[['nct_id', 'is_exclusion', 'concept_name']], columns=['concept_name'])

def get_aggregation(df):
    """Sum every remaining column per trial after dropping ``is_exclusion``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``nct_id`` and ``is_exclusion`` columns; all other columns
        are summed.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``nct_id`` (which becomes the index), holding the
        column-wise sums for that trial. ``df`` itself is not modified.
    """
    # groupby(...).sum() is the direct spelling of aggregate(np.sum); the
    # ``axis`` keyword of groupby and passing NumPy reducers to aggregate are
    # both deprecated in recent pandas.
    return (
        df
        .drop(columns=['is_exclusion'])
        .groupby(by=['nct_id'])
        .sum()
    )

# Per-trial concept totals, computed separately for exclusion and inclusion criteria.
exc = get_aggregation(Y.loc[Y['is_exclusion'] == 1])
inc = get_aggregation(Y.loc[Y['is_exclusion'] == 0])

# Side-by-side table keyed on nct_id: shared column names receive the _exc / _inc
# suffix, and a trial present on only one side is padded with zeros.
Z = exc.merge(inc, on='nct_id', how='outer', suffixes=('_exc', '_inc')).fillna(0)

# Sanity checks: the totals must add up to the number of rows in crit, both before
# and after the merge.
assert exc.to_numpy().sum() + inc.to_numpy().sum() == len(crit)
assert Z.to_numpy().sum() == len(crit)

In [30]:
def compute_similarity(a, b):
    """Fraction of trial ``a``'s non-zero entries that are also non-zero for ``b``.

    Rows ``a`` and ``b`` are looked up in the module-level table ``Z``; every
    non-zero cell counts as "present". The result is
    ``count(present in a and b) / count(present in a)``, so the measure is
    asymmetric: ``compute_similarity(a, b)`` is normalised by ``a`` only.

    Parameters
    ----------
    a, b
        Index labels of ``Z``.

    Returns
    -------
    float
        A value in [0, 1], or ``np.nan`` when row ``a`` has no non-zero entry
        (returned explicitly rather than via a 0/0 division and its
        RuntimeWarning).

    Raises
    ------
    KeyError
        Propagated from ``Z.loc`` when ``a`` or ``b`` is not a label of ``Z``.
    """
    in_a = (Z.loc[a] != 0).to_numpy()
    in_b = (Z.loc[b] != 0).to_numpy()
    n_a = in_a.sum()
    if n_a == 0:
        return np.nan
    return (in_a & in_b).sum() / n_a

def similarity_main(series_a, series_b):
    """Score each (a, b) pair with ``compute_similarity`` and collect the results.

    Parameters
    ----------
    series_a, series_b : iterable
        Trial identifiers, walked in lockstep with ``zip``.

    Returns
    -------
    pandas.Series
        Float series with a default integer index, one entry per pair. A pair
        for which ``compute_similarity`` raises ``KeyError`` (an identifier it
        cannot look up) gets ``NaN``.

    Notes
    -----
    This is a plain Python loop over every pair and is slow on large inputs.
    Only ``KeyError`` is converted to NaN: a bare ``except`` would also swallow
    KeyboardInterrupt (making the loop impossible to stop) and genuine bugs
    such as an undefined lookup table, which would silently yield all-NaN.
    """
    scores = []
    for a, b in zip(series_a, series_b):
        try:
            scores.append(compute_similarity(a, b))
        except KeyError:
            scores.append(np.nan)
    # An explicit dtype keeps an empty result float instead of object.
    return pd.Series(scores, dtype=float)

# Every ordered (trial_a, trial_b) pair of the distinct nct_id values in zip_codes,
# flattened to two ordinary columns and scored in a single chained expression.
index = (
    pd.DataFrame(
        index=pd.MultiIndex.from_product(
            [pd.unique(zip_codes.nct_id), pd.unique(zip_codes.nct_id)],
            names=["trial_a", "trial_b"],
        )
    )
    .reset_index()
    .assign(similarity=lambda pairs: similarity_main(pairs.trial_a, pairs.trial_b))
)

In [31]:
# Reshape the pair list into a trial_a x trial_b matrix and preview rows/columns 1-5.
# pivot() arguments are passed by keyword: pandas 2.0+ rejects positional ones.
index.pivot(index='trial_a', columns='trial_b', values='similarity').iloc[1:6, 1:6]  # final

trial_b,NCT03648372,NCT03808922,NCT03852537,NCT04278404,NCT04280705
trial_a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NCT03648372,1.0,0.145455,0.036364,0.0,0.018182
NCT03808922,0.137931,1.0,0.034483,0.0,0.103448
NCT03852537,0.105263,0.105263,1.0,0.0,0.0
NCT04278404,0.0,0.0,0.0,1.0,0.125
NCT04280705,0.076923,0.461538,0.0,0.076923,1.0


In [54]:
# import geopandas
# import shapely
# import shapefile
# import plotly
# from plotly.figure_factory import create_choropleth
# # import xlrd

# # import plotly.figure_factory as ff
# # import geopandas

# fips = ['06021', '06023', '06027',
#         '06029', '06033', '06059',
#         '06047', '06049', '06051',
#         '06055', '06061']
# values = range(len(fips))

# fig = create_choropleth(fips=fips, values=values)
# fig.layout.template = None
# fig.show()