In this notebook, you'll see how to connect to a Postgres database using the sqlalchemy library.

For this notebook, you'll need both the `sqlalchemy` and `psycopg2` libraries installed.

In [2]:
#!pip install psycopg2-binary

In [3]:
#!pip install psycopg2

In [4]:
import pandas as pd

In [5]:
from sqlalchemy import create_engine, text



First, we need to create a connection string. The format is

 ```<dialect(+driver)>://<username>:<password>@<hostname>:<port>/<database>```

To connect to the Lahman baseball database, you can use the following connection string.

In [7]:
database_name = 'prescribers'    # Fill this in with your prescribers database name

connection_string = f"postgresql://postgres:postgres@localhost:5432/{database_name}"

Now, we need to create an engine and use it to connect.

In [9]:
engine = create_engine(connection_string)

sqlalchemy works well with pandas to convert query results into dataframes.

In [11]:
import pandas as pd

First, let's write a meaningful query.

In [13]:
query = 'SELECT * FROM prescriber'

Now, bring it all together using the following syntax.

In [15]:
with engine.connect() as connection:
    people = pd.read_sql(text(query), con = connection)

people.head()

Unnamed: 0,npi,nppes_provider_last_org_name,nppes_provider_first_name,nppes_provider_mi,nppes_credentials,nppes_provider_gender,nppes_entity_code,nppes_provider_street1,nppes_provider_street2,nppes_provider_city,nppes_provider_zip5,nppes_provider_zip4,nppes_provider_state,nppes_provider_country,specialty_description,description_flag,medicare_prvdr_enroll_status
0,1003000000.0,BLAKEMORE,ROSIE,K,FNP,F,I,TENNESSEE PRISON FOR WOMEN,3881 STEWARTS LANE,NASHVILLE,37243,1,TN,US,Nurse Practitioner,S,N
1,1003012000.0,CUDZILO,COREY,,M.D.,M,I,2240 SUTHERLAND AVE,SUITE 103,KNOXVILLE,37919,2333,TN,US,Pulmonary Disease,S,E
2,1003013000.0,GRABENSTEIN,WILLIAM,P,M.D.,M,I,1822 MEMORIAL DR,,CLARKSVILLE,37043,4605,TN,US,Family Practice,S,E
3,1003014000.0,OTTO,ROBERT,J,M.D.,M,I,2400 PATTERSON STREET SUITE 100,,NASHVILLE,37203,2786,TN,US,Orthopedic Surgery,S,E
4,1003018000.0,TODD,JOSHUA,W,M.D.,M,I,1819 W CLINCH AVE,SUITE 108,KNOXVILLE,37916,2435,TN,US,Cardiology,S,E


In [16]:
people.groupby(['nppes_provider_last_org_name']).nunique().sort_values('npi', ascending = False)

Unnamed: 0_level_0,npi,nppes_provider_first_name,nppes_provider_mi,nppes_credentials,nppes_provider_gender,nppes_entity_code,nppes_provider_street1,nppes_provider_street2,nppes_provider_city,nppes_provider_zip5,nppes_provider_zip4,nppes_provider_state,nppes_provider_country,specialty_description,description_flag,medicare_prvdr_enroll_status
nppes_provider_last_org_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
SMITH,279,196,20,46,2,1,254,73,71,120,201,1,1,46,2,3
JOHNSON,156,124,20,39,2,1,144,48,55,90,125,1,1,34,2,3
JONES,152,120,21,39,2,1,146,47,54,90,122,1,1,28,2,3
WILLIAMS,137,104,20,39,2,1,125,47,49,75,112,1,1,32,2,2
MILLER,106,81,20,29,2,1,97,42,37,58,84,1,1,29,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HELDERMAN,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
HELLERVIK,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1
HELLGREN,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1
HELLMANN,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1


For much more information about SQLAlchemy and to see a more “Pythonic” way to execute queries, see Introduction to Databases in Python: https://www.datacamp.com/courses/introduction-to-relational-databases-in-python

In [18]:
rx_query = 'SELECT * FROM prescription'

In [19]:
with engine.connect() as connection:
    rx = pd.read_sql(text(rx_query), con = connection)

rx.head()

Unnamed: 0,npi,drug_name,bene_count,total_claim_count,total_30_day_fill_count,total_day_supply,total_drug_cost,bene_count_ge65,bene_count_ge65_suppress_flag,total_claim_count_ge65,ge65_suppress_flag,total_30_day_fill_count_ge65,total_day_supply_ge65,total_drug_cost_ge65
0,1427076000.0,RALOXIFENE HCL,,18.0,28.0,840.0,1009.66,,*,18.0,,28.0,840.0,1009.66
1,1003858000.0,GLIMEPIRIDE,,12.0,16.0,480.0,270.86,,*,,*,,,
2,1184627000.0,TAMSULOSIN HCL,,14.0,24.0,698.0,353.62,,#,,#,,,
3,1306111000.0,SPIRIVA,,13.0,13.0,390.0,4783.28,,*,,*,,,
4,1285658000.0,SPIRIVA,,13.0,13.0,390.0,4855.95,,#,,#,,,


In [20]:
rx.shape

(656058, 14)

In [21]:
drug_query = 'SELECT * FROM drug'

In [22]:
with engine.connect() as connection:
    drug_df = pd.read_sql(text(drug_query), con = connection)

drug_df.head()

Unnamed: 0,drug_name,generic_name,opioid_drug_flag,long_acting_opioid_drug_flag,antibiotic_drug_flag,antipsychotic_drug_flag
0,1ST TIER UNIFINE PENTIPS,"PEN NEEDLE, DIABETIC",N,N,N,N
1,1ST TIER UNIFINE PENTIPS PLUS,"PEN NEEDLE, DIABETIC",N,N,N,N
2,ABACAVIR,ABACAVIR SULFATE,N,N,N,N
3,ABACAVIR-LAMIVUDINE,ABACAVIR SULFATE/LAMIVUDINE,N,N,N,N
4,ABACAVIR-LAMIVUDINE-ZIDOVUDINE,ABACAVIR/LAMIVUDINE/ZIDOVUDINE,N,N,N,N


In [23]:
zip_query = 'SELECT * FROM zip_fips'

In [24]:
with engine.connect() as connection:
    zips = pd.read_sql(text(zip_query), con = connection)

zips.head()

Unnamed: 0,zip,fipscounty,res_ratio,bus_ratio,oth_ratio,tot_ratio
0,501,36103,0.0,1.0,0.0,1.0
1,601,72113,0.160724,0.20098,0.128834,0.1625
2,601,72001,0.839276,0.79902,0.871166,0.8375
3,602,72003,1.0,0.9988,1.0,0.999919
4,602,72005,0.0,0.0012,0.0,8.1e-05


In [25]:
cty_query = 'SELECT * FROM fips_county'

In [26]:
with engine.connect() as connection:
    cty_df = pd.read_sql(text(cty_query), con = connection)

cty_df.head()

Unnamed: 0,county,state,fipscounty,fipsstate
0,AUTAUGA,AL,1001,1
1,BALDWIN,AL,1003,1
2,BARBOUR,AL,1005,1
3,BIBB,AL,1007,1
4,BLOUNT,AL,1009,1


In [27]:
zips.sort_values(['zip','tot_ratio'], ascending = False)

Unnamed: 0,zip,fipscounty,res_ratio,bus_ratio,oth_ratio,tot_ratio
54180,99929,02275,0.000000,0.00000,1.000000,1.000000
54179,99928,02130,0.000000,0.00000,1.000000,1.000000
54178,99927,02198,0.000000,0.00000,1.000000,1.000000
54177,99926,02198,0.000000,0.00000,1.000000,1.000000
54176,99925,02198,0.000000,0.00000,1.000000,1.000000
...,...,...,...,...,...,...
3,00602,72003,1.000000,0.99880,1.000000,0.999919
4,00602,72005,0.000000,0.00120,0.000000,0.000081
2,00601,72001,0.839276,0.79902,0.871166,0.837500
1,00601,72113,0.160724,0.20098,0.128834,0.162500


In [28]:
true_zips = zips.sort_values(['zip','tot_ratio'], ascending = False).drop_duplicates(['zip'], keep = 'first').reset_index()
true_zips


Unnamed: 0,index,zip,fipscounty,res_ratio,bus_ratio,oth_ratio,tot_ratio
0,54180,99929,02275,0.000000,0.00000,1.000000,1.000000
1,54179,99928,02130,0.000000,0.00000,1.000000,1.000000
2,54178,99927,02198,0.000000,0.00000,1.000000,1.000000
3,54177,99926,02198,0.000000,0.00000,1.000000,1.000000
4,54176,99925,02198,0.000000,0.00000,1.000000,1.000000
...,...,...,...,...,...,...,...
39456,8,00604,72005,1.000000,1.00000,1.000000,1.000000
39457,5,00603,72005,0.997152,0.99816,1.000000,0.997290
39458,3,00602,72003,1.000000,0.99880,1.000000,0.999919
39459,2,00601,72001,0.839276,0.79902,0.871166,0.837500


In [29]:
zip_cty = pd.merge(cty_df, true_zips[['zip', 'fipscounty']], on='fipscounty', how="left")
zip_cty

Unnamed: 0,county,state,fipscounty,fipsstate,zip
0,AUTAUGA,AL,01001,01,36749
1,AUTAUGA,AL,01001,01,36068
2,AUTAUGA,AL,01001,01,36067
3,AUTAUGA,AL,01001,01,36066
4,AUTAUGA,AL,01001,01,36051
...,...,...,...,...,...
39980,VIEQUES,PR,72147,72,00765
39981,VILLALBA,PR,72149,72,00766
39982,YABUCOA,PR,72151,72,00767
39983,YAUCO,PR,72153,72,00698


In [30]:
zips['zip'].nunique()

39461

In [31]:
cbsa_query = 'SELECT * FROM cbsa'

In [32]:
with engine.connect() as connection:
    cbsa = pd.read_sql(text(cbsa_query), con = connection)

cbsa.head()

Unnamed: 0,fipscounty,cbsa,cbsaname
0,1001,33860,"Montgomery, AL"
1,1003,19300,"Daphne-Fairhope-Foley, AL"
2,1007,13820,"Birmingham-Hoover, AL"
3,1009,13820,"Birmingham-Hoover, AL"
4,1015,11500,"Anniston-Oxford-Jacksonville, AL"


In [33]:
od_query = 'SELECT * FROM overdose_deaths'

In [34]:
with engine.connect() as connection:
    od_df = pd.read_sql(text(od_query), con = connection)

od_df.head()

Unnamed: 0,overdose_deaths,year,fipscounty
0,135,2015,47157
1,150,2016,47157
2,159,2017,47157
3,123,2018,47157
4,122,2015,47093


In [35]:
od_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380 entries, 0 to 379
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   overdose_deaths  380 non-null    int64
 1   year             380 non-null    int64
 2   fipscounty       380 non-null    int64
dtypes: int64(3)
memory usage: 9.0 KB


In [36]:
df1 = pd.merge(people, rx, on = 'npi', how = 'left')
df1

Unnamed: 0,npi,nppes_provider_last_org_name,nppes_provider_first_name,nppes_provider_mi,nppes_credentials,nppes_provider_gender,nppes_entity_code,nppes_provider_street1,nppes_provider_street2,nppes_provider_city,...,total_30_day_fill_count,total_day_supply,total_drug_cost,bene_count_ge65,bene_count_ge65_suppress_flag,total_claim_count_ge65,ge65_suppress_flag,total_30_day_fill_count_ge65,total_day_supply_ge65,total_drug_cost_ge65
0,1.003000e+09,BLAKEMORE,ROSIE,K,FNP,F,I,TENNESSEE PRISON FOR WOMEN,3881 STEWARTS LANE,NASHVILLE,...,34.0,620.0,383.12,,*,32.0,,34.0,620.0,383.12
1,1.003000e+09,BLAKEMORE,ROSIE,K,FNP,F,I,TENNESSEE PRISON FOR WOMEN,3881 STEWARTS LANE,NASHVILLE,...,32.0,852.0,276.87,,*,11.0,,21.0,522.0,163.02
2,1.003012e+09,CUDZILO,COREY,,M.D.,M,I,2240 SUTHERLAND AVE,SUITE 103,KNOXVILLE,...,30.0,900.0,13195.05,,*,30.0,,30.0,900.0,13195.05
3,1.003012e+09,CUDZILO,COREY,,M.D.,M,I,2240 SUTHERLAND AVE,SUITE 103,KNOXVILLE,...,13.0,359.0,252.30,,*,,*,,,
4,1.003012e+09,CUDZILO,COREY,,M.D.,M,I,2240 SUTHERLAND AVE,SUITE 103,KNOXVILLE,...,29.0,870.0,10602.62,,*,27.0,,29.0,870.0,10602.62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
660511,1.992996e+09,GILES,WESLEY,H,MD,M,I,979 E 3RD ST STE 300,,CHATTANOOGA,...,133.0,3990.0,1508.74,,#,89.0,,97.0,2910.0,1010.50
660512,1.992998e+09,DRAPER,DAVID,G,DDS,M,I,202 SHIPLEY STREET,,COOKEVILLE,...,,,,,,,,,,
660513,1.993000e+09,THOMAS,SHELIA,K,"RN, APN",F,I,6266 POPLAR AVE,,MEMPHIS,...,42.0,1230.0,782.27,,*,19.0,,19.0,570.0,292.28
660514,1.993000e+09,THOMAS,SHELIA,K,"RN, APN",F,I,6266 POPLAR AVE,,MEMPHIS,...,12.0,360.0,474.10,0.0,,0.0,,0.0,0.0,0.00


In [37]:
people.shape

(25050, 17)

In [38]:
rx.shape

(656058, 14)

In [39]:
df2 = pd.merge(df1, drug_df, on = 'drug_name', how = 'left')
df2

Unnamed: 0,npi,nppes_provider_last_org_name,nppes_provider_first_name,nppes_provider_mi,nppes_credentials,nppes_provider_gender,nppes_entity_code,nppes_provider_street1,nppes_provider_street2,nppes_provider_city,...,total_claim_count_ge65,ge65_suppress_flag,total_30_day_fill_count_ge65,total_day_supply_ge65,total_drug_cost_ge65,generic_name,opioid_drug_flag,long_acting_opioid_drug_flag,antibiotic_drug_flag,antipsychotic_drug_flag
0,1.003000e+09,BLAKEMORE,ROSIE,K,FNP,F,I,TENNESSEE PRISON FOR WOMEN,3881 STEWARTS LANE,NASHVILLE,...,32.0,,34.0,620.0,383.12,CALCITRIOL,N,N,N,N
1,1.003000e+09,BLAKEMORE,ROSIE,K,FNP,F,I,TENNESSEE PRISON FOR WOMEN,3881 STEWARTS LANE,NASHVILLE,...,11.0,,21.0,522.0,163.02,ALLOPURINOL,N,N,N,N
2,1.003012e+09,CUDZILO,COREY,,M.D.,M,I,2240 SUTHERLAND AVE,SUITE 103,KNOXVILLE,...,30.0,,30.0,900.0,13195.05,FLUTICASONE/SALMETEROL,N,N,N,N
3,1.003012e+09,CUDZILO,COREY,,M.D.,M,I,2240 SUTHERLAND AVE,SUITE 103,KNOXVILLE,...,,*,,,,AZITHROMYCIN,N,N,Y,N
4,1.003012e+09,CUDZILO,COREY,,M.D.,M,I,2240 SUTHERLAND AVE,SUITE 103,KNOXVILLE,...,27.0,,29.0,870.0,10602.62,TIOTROPIUM BROMIDE,N,N,N,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
709468,1.992996e+09,GILES,WESLEY,H,MD,M,I,979 E 3RD ST STE 300,,CHATTANOOGA,...,89.0,,97.0,2910.0,1010.50,LEVOTHYROXINE SODIUM,N,N,N,N
709469,1.992998e+09,DRAPER,DAVID,G,DDS,M,I,202 SHIPLEY STREET,,COOKEVILLE,...,,,,,,,,,,
709470,1.993000e+09,THOMAS,SHELIA,K,"RN, APN",F,I,6266 POPLAR AVE,,MEMPHIS,...,19.0,,19.0,570.0,292.28,HYDROCODONE/ACETAMINOPHEN,Y,N,N,N
709471,1.993000e+09,THOMAS,SHELIA,K,"RN, APN",F,I,6266 POPLAR AVE,,MEMPHIS,...,0.0,,0.0,0.0,0.00,OXYCODONE HCL/ACETAMINOPHEN,Y,N,N,N


In [40]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 709473 entries, 0 to 709472
Data columns (total 35 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   npi                            709473 non-null  float64
 1   nppes_provider_last_org_name   709424 non-null  object 
 2   nppes_provider_first_name      709473 non-null  object 
 3   nppes_provider_mi              576020 non-null  object 
 4   nppes_credentials              693320 non-null  object 
 5   nppes_provider_gender          709473 non-null  object 
 6   nppes_entity_code              709473 non-null  object 
 7   nppes_provider_street1         709473 non-null  object 
 8   nppes_provider_street2         284978 non-null  object 
 9   nppes_provider_city            709473 non-null  object 
 10  nppes_provider_zip5            709473 non-null  object 
 11  nppes_provider_zip4            612940 non-null  object 
 12  nppes_provider_state          

In [41]:
zip_cty

Unnamed: 0,county,state,fipscounty,fipsstate,zip
0,AUTAUGA,AL,01001,01,36749
1,AUTAUGA,AL,01001,01,36068
2,AUTAUGA,AL,01001,01,36067
3,AUTAUGA,AL,01001,01,36066
4,AUTAUGA,AL,01001,01,36051
...,...,...,...,...,...
39980,VIEQUES,PR,72147,72,00765
39981,VILLALBA,PR,72149,72,00766
39982,YABUCOA,PR,72151,72,00767
39983,YAUCO,PR,72153,72,00698


In [42]:
df3 = pd.merge(df2, zip_cty[['county', 'fipscounty']], left_on = 'nppes_provider_zip5', right_on = 'fipscounty', how = 'inner')
df3

Unnamed: 0,npi,nppes_provider_last_org_name,nppes_provider_first_name,nppes_provider_mi,nppes_credentials,nppes_provider_gender,nppes_entity_code,nppes_provider_street1,nppes_provider_street2,nppes_provider_city,...,total_30_day_fill_count_ge65,total_day_supply_ge65,total_drug_cost_ge65,generic_name,opioid_drug_flag,long_acting_opioid_drug_flag,antibiotic_drug_flag,antipsychotic_drug_flag,county,fipscounty
0,1.003013e+09,GRABENSTEIN,WILLIAM,P,M.D.,M,I,1822 MEMORIAL DR,,CLARKSVILLE,...,84.0,2520.0,896.34,FLUTICASONE PROPIONATE,N,N,N,N,CLAY,37043
1,1.003013e+09,GRABENSTEIN,WILLIAM,P,M.D.,M,I,1822 MEMORIAL DR,,CLARKSVILLE,...,84.0,2520.0,896.34,FLUTICASONE PROPIONATE,N,N,N,N,CLAY,37043
2,1.003013e+09,GRABENSTEIN,WILLIAM,P,M.D.,M,I,1822 MEMORIAL DR,,CLARKSVILLE,...,84.0,2520.0,896.34,FLUTICASONE PROPIONATE,N,N,N,N,CLAY,37043
3,1.003013e+09,GRABENSTEIN,WILLIAM,P,M.D.,M,I,1822 MEMORIAL DR,,CLARKSVILLE,...,12.0,360.0,239.21,CELECOXIB,N,N,N,N,CLAY,37043
4,1.003013e+09,GRABENSTEIN,WILLIAM,P,M.D.,M,I,1822 MEMORIAL DR,,CLARKSVILLE,...,12.0,360.0,239.21,CELECOXIB,N,N,N,N,CLAY,37043
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
783162,1.992979e+09,CHILDS,ADAM,M,M.D.,M,I,2933 MEDICAL CENTER PKWY,SUITE A,MURFREESBORO,...,,,,CITALOPRAM HYDROBROMIDE,N,N,N,N,NEW HANOVER,37129
783163,1.992979e+09,CHILDS,ADAM,M,M.D.,M,I,2933 MEDICAL CENTER PKWY,SUITE A,MURFREESBORO,...,,,,CITALOPRAM HYDROBROMIDE,N,N,N,N,NEW HANOVER,37129
783164,1.992979e+09,CHILDS,ADAM,M,M.D.,M,I,2933 MEDICAL CENTER PKWY,SUITE A,MURFREESBORO,...,,,,CITALOPRAM HYDROBROMIDE,N,N,N,N,NEW HANOVER,37129
783165,1.992979e+09,CHILDS,ADAM,M,M.D.,M,I,2933 MEDICAL CENTER PKWY,SUITE A,MURFREESBORO,...,,,,CITALOPRAM HYDROBROMIDE,N,N,N,N,NEW HANOVER,37129


In [43]:
df4 = df3[['npi', 'county', 'opioid_drug_flag', 'drug_name']]
df4

Unnamed: 0,npi,county,opioid_drug_flag,drug_name
0,1.003013e+09,CLAY,N,FLUTICASONE PROPIONATE
1,1.003013e+09,CLAY,N,FLUTICASONE PROPIONATE
2,1.003013e+09,CLAY,N,FLUTICASONE PROPIONATE
3,1.003013e+09,CLAY,N,CELECOXIB
4,1.003013e+09,CLAY,N,CELECOXIB
...,...,...,...,...
783162,1.992979e+09,NEW HANOVER,N,CITALOPRAM HBR
783163,1.992979e+09,NEW HANOVER,N,CITALOPRAM HBR
783164,1.992979e+09,NEW HANOVER,N,CITALOPRAM HBR
783165,1.992979e+09,NEW HANOVER,N,CITALOPRAM HBR


In [44]:
df5 = df4.groupby('county')['opioid_drug_flag'].value_counts('npi')
df5

county      opioid_drug_flag
ADAMS       N                   0.976125
            Y                   0.023875
BEAUFORT    N                   0.810526
            Y                   0.189474
BERTIE      N                   0.961437
                                  ...   
WASHINGTON  Y                   0.020833
WELLS       N                   0.935144
            Y                   0.064856
WILLIAMS    N                   0.968127
            Y                   0.031873
Name: proportion, Length: 90, dtype: float64

In [45]:
npi_claims = pd.read_csv('..\data\claims_npi.csv')

  npi_claims = pd.read_csv('..\data\claims_npi.csv')


In [46]:
npi_claims

Unnamed: 0,npi,opioid_claims,total_claims,opioid_percentage
0,1912011792,9275,52345,17.718980
1,1891915047,8405,11286,74.472798
2,1447608211,7274,10396,69.969219
3,1538428230,5709,8208,69.554094
4,1962406793,5607,8839,63.434778
...,...,...,...,...
20587,1518998152,0,45,0.000000
20588,1538283007,0,12,0.000000
20589,1518995208,0,887,0.000000
20590,1518991298,0,26,0.000000


In [47]:
zip_claims = pd.read_csv('..\data\zip_claims.csv')
zip_claims

  zip_claims = pd.read_csv('..\data\zip_claims.csv')


Unnamed: 0,nppes_provider_zip5,opioid_claims,total_claims,opioid_percentage
0,37802,586,803,72.976339
1,37391,80,117,68.376068
2,37088,1378,2146,64.212488
3,38137,22,35,62.857143
4,37318,101,168,60.119048
...,...,...,...,...
417,75035,0,12,0.000000
418,75080,0,15,0.000000
419,76904,0,1527,0.000000
420,77079,0,39,0.000000


In [48]:
zip_claims.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 422 entries, 0 to 421
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   nppes_provider_zip5  422 non-null    int64  
 1   opioid_claims        422 non-null    int64  
 2   total_claims         422 non-null    int64  
 3   opioid_percentage    422 non-null    float64
dtypes: float64(1), int64(3)
memory usage: 13.3 KB


In [49]:
zip_cty.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39985 entries, 0 to 39984
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   county      39985 non-null  object
 1   state       39985 non-null  object
 2   fipscounty  39985 non-null  object
 3   fipsstate   39985 non-null  object
 4   zip         39932 non-null  object
dtypes: object(5)
memory usage: 1.5+ MB


In [50]:
zip_cty = zip_cty.dropna(how='any',axis=0)
zip_cty['zip'] = zip_cty['zip'].astype(int)
zip_cty.info()

<class 'pandas.core.frame.DataFrame'>
Index: 39932 entries, 0 to 39983
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   county      39932 non-null  object
 1   state       39932 non-null  object
 2   fipscounty  39932 non-null  object
 3   fipsstate   39932 non-null  object
 4   zip         39932 non-null  int32 
dtypes: int32(1), object(4)
memory usage: 1.7+ MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  zip_cty['zip'] = zip_cty['zip'].astype(int)


In [51]:
zip_claims2 = pd.merge(zip_claims, zip_cty, left_on = 'nppes_provider_zip5', right_on = 'zip', how = 'left')

In [52]:
zip_claims2

Unnamed: 0,nppes_provider_zip5,opioid_claims,total_claims,opioid_percentage,county,state,fipscounty,fipsstate,zip
0,37802,586,803,72.976339,BLOUNT,TN,47009,47,37802.0
1,37391,80,117,68.376068,POLK,TN,47139,47,37391.0
2,37088,1378,2146,64.212488,WILSON,TN,47189,47,37088.0
3,38137,22,35,62.857143,SHELBY,TN,47157,47,38137.0
4,37318,101,168,60.119048,FRANKLIN,TN,47051,47,37318.0
...,...,...,...,...,...,...,...,...,...
417,75035,0,12,0.000000,COLLIN,TX,48085,48,75035.0
418,75080,0,15,0.000000,DALLAS,TX,48113,48,75080.0
419,76904,0,1527,0.000000,TOM GREEN,TX,48451,48,76904.0
420,77079,0,39,0.000000,HARRIS,TX,48201,48,77079.0


In [53]:
people['npi'] = people['npi'].astype(int)

In [54]:
npi_claims2 = pd.merge(npi_claims, people[['npi', 'nppes_provider_last_org_name',	'nppes_provider_first_name']], on = 'npi', how = 'left')
npi_claims2


Unnamed: 0,npi,opioid_claims,total_claims,opioid_percentage,nppes_provider_last_org_name,nppes_provider_first_name
0,1912011792,9275,52345,17.718980,COFFEY,DAVID
1,1891915047,8405,11286,74.472798,KINDRICK,JUSTIN
2,1447608211,7274,10396,69.969219,CATHERS,SHARON
3,1538428230,5709,8208,69.554094,PAINTER,MICHELLE
4,1962406793,5607,8839,63.434778,CLARK,RICHARD
...,...,...,...,...,...,...
20587,1518998152,0,45,0.000000,GOENKA,SEEMA
20588,1538283007,0,12,0.000000,CROSHAW,RANDAL
20589,1518995208,0,887,0.000000,TURNER,KEVIN
20590,1518991298,0,26,0.000000,GURNEY,INGRID


In [55]:
npi_claims.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20592 entries, 0 to 20591
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   npi                20592 non-null  int64  
 1   opioid_claims      20592 non-null  int64  
 2   total_claims       20592 non-null  int64  
 3   opioid_percentage  20592 non-null  float64
dtypes: float64(1), int64(3)
memory usage: 643.6 KB


In [56]:
od_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380 entries, 0 to 379
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   overdose_deaths  380 non-null    int64
 1   year             380 non-null    int64
 2   fipscounty       380 non-null    int64
dtypes: int64(3)
memory usage: 9.0 KB


In [68]:
od_df['fipscounty'] = od_df['fipscounty'].astype(int)
zip_cty['fipscounty'] = zip_cty['fipscounty'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  zip_cty['fipscounty'] = zip_cty['fipscounty'].astype(int)


In [70]:
zip_cty.info()

<class 'pandas.core.frame.DataFrame'>
Index: 39932 entries, 0 to 39983
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   county      39932 non-null  object
 1   state       39932 non-null  object
 2   fipscounty  39932 non-null  int32 
 3   fipsstate   39932 non-null  object
 4   zip         39932 non-null  int32 
dtypes: int32(2), object(3)
memory usage: 1.5+ MB


In [72]:
od_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380 entries, 0 to 379
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   overdose_deaths  380 non-null    int64
 1   year             380 non-null    int64
 2   fipscounty       380 non-null    int32
dtypes: int32(1), int64(2)
memory usage: 7.6 KB


In [120]:
od_df2 = pd.merge(od_df, zip_cty[['fipscounty','state']], on = 'fipscounty', how = 'left')
od_df2

Unnamed: 0,overdose_deaths,year,fipscounty,state
0,135,2015,47157,TN
1,135,2015,47157,TN
2,135,2015,47157,TN
3,135,2015,47157,TN
4,135,2015,47157,TN
...,...,...,...,...
3035,2,2016,47007,TN
3036,2,2017,47007,TN
3037,2,2017,47007,TN
3038,3,2018,47007,TN


In [134]:
od_df2 = od_df2.drop_duplicates()
od_df2

Unnamed: 0,overdose_deaths,year,fipscounty,state
0,135,2015,47157,TN
68,150,2016,47157,TN
136,159,2017,47157,TN
204,123,2018,47157,TN
272,122,2015,47093,TN
...,...,...,...,...
3019,0,2018,47017,TN
3032,1,2015,47007,TN
3034,2,2016,47007,TN
3036,2,2017,47007,TN


In [160]:
od_df3 = od_df2.groupby('year')['overdose_deaths'].sum().reset_index()
od_df3

Unnamed: 0,year,overdose_deaths
0,2015,1033
1,2016,1186
2,2017,1267
3,2018,1304
