# Final Prep for MSA Data

# Setup

## Imports

In [1]:
import pandas as pd
import numpy as np
import re

## Parameters

In [2]:
MSA_CRIME_DATASET = "../../../data/RQ3/processed/msa_crime_df.csv"
MSA_POP_DEN_DATASET = "../../../data/RQ3/processed/msa_pop_den_df.csv"
DIABETES_COUNTYCITY_DATASET = "../../../data/RQ3/processed/diabetes_by_countyCity_df.csv"

MSA_FINAL_DATASET = "../../../data/RQ3/processed/msa_final_df.csv"


# Loading Datasets

In [3]:
msa_crime_df = pd.read_csv(MSA_CRIME_DATASET)
msa_pop_den_df = pd.read_csv(MSA_POP_DEN_DATASET)
diabetes_by_countyCity_df = pd.read_csv(DIABETES_COUNTYCITY_DATASET)

In [4]:
msa_crime_df.shape

(2175, 4)

# Create first major dataset

### Map MSA's to Population, Violent Crime rate, and murder and nonnegligent manslaughter

In [5]:
msa_crime_df.head(20)

Unnamed: 0,MSA,County/City,Population,Murder and nonnegligent manslaughter
0,"Abilene, TX M.S.A.",,170417.0,
1,,"Includes Callahan, Jones, and Taylor Counties",,
2,,City of Abilene,122480.0,8.0
3,,Total area actually reporting,1.0,9.0
4,,"Rate per 100,000 inhabitants",,5.3
5,"Akron, OH M.S.A.",,704283.0,
6,,Includes Portage3 and Summit Counties,,
7,,City of Akron,197690.0,38.0
8,,Total area actually reporting,0.962,45.0
9,,Estimated total,1.0,45.0


In [6]:
msa_crime_df['County/City']

0                                                 NaN
1       Includes Callahan, Jones, and Taylor Counties
2                                     City of Abilene
3                       Total area actually reporting
4                        Rate per 100,000 inhabitants
                            ...                      
2170                                              NaN
2171                             Includes Yuma County
2172                                     City of Yuma
2173                    Total area actually reporting
2174                     Rate per 100,000 inhabitants
Name: County/City, Length: 2175, dtype: object

In [7]:
#Should only be one null gap for every MSA
msa_crime_df['County/City'].isnull().sum()

364

In [8]:
rates_df = msa_crime_df[msa_crime_df['County/City'].notnull()]
rates_df = rates_df.drop(['MSA','Population'], axis=1)
rates_df = rates_df[rates_df['County/City'].str.contains("Rate per")]


In [9]:
rates_df.head()

Unnamed: 0,County/City,Murder and nonnegligent manslaughter
4,"Rate per 100,000 inhabitants",5.3
10,"Rate per 100,000 inhabitants",6.4
16,"Rate per 100,000 inhabitants",11.6
24,"Rate per 100,000 inhabitants",2.2
29,"Rate per 100,000 inhabitants",9.5


In [10]:
msa_pop_df = msa_crime_df[msa_crime_df['MSA'].notnull()]
msa_pop_df = msa_pop_df.drop(['County/City', 'Murder and nonnegligent manslaughter'], axis=1)
msa_pop_df = msa_pop_df.reset_index(drop=True)
msa_pop_df

Unnamed: 0,MSA,Population
0,"Abilene, TX M.S.A.",170417
1,"Akron, OH M.S.A.",704283
2,"Albany, GA M.S.A.",147142
3,"Albany-Schenectady-Troy, NY M.S.A.",871741
4,"Albuquerque, NM M.S.A.",915468
...,...,...
359,"Worcester, MA-CT M.S.A.",871779
360,"Yakima, WA M.S.A.",252019
361,"York-Hanover, PA M.S.A.",447168
362,"Yuba City, CA M.S.A.",173299


In [11]:
rates_df = msa_crime_df[msa_crime_df['County/City'].notnull()]
rates_df = rates_df.drop(['MSA','Population'], axis=1)
rates_df = rates_df[rates_df['County/City'].str.contains("Total area")]
rates_df = rates_df.reset_index(drop=True)
rates_df

Unnamed: 0,County/City,Murder and nonnegligent manslaughter
0,Total area actually reporting,9
1,Total area actually reporting,45
2,Total area actually reporting,17
3,Total area actually reporting,19
4,Total area actually reporting,87
...,...,...
359,Total area actually reporting,11
360,Total area actually reporting,25
361,Total area actually reporting,24
362,Total area actually reporting,8


In [12]:
#Now we need to merge datasets
msa_pop_rates_df = pd.DataFrame([msa_pop_df['MSA'], msa_pop_df['Population'], rates_df['Murder and nonnegligent manslaughter']]).transpose()
msa_pop_rates_df

Unnamed: 0,MSA,Population,Murder and nonnegligent manslaughter
0,"Abilene, TX M.S.A.",170417,9
1,"Akron, OH M.S.A.",704283,45
2,"Albany, GA M.S.A.",147142,17
3,"Albany-Schenectady-Troy, NY M.S.A.",871741,19
4,"Albuquerque, NM M.S.A.",915468,87
...,...,...,...
359,"Worcester, MA-CT M.S.A.",871779,11
360,"Yakima, WA M.S.A.",252019,25
361,"York-Hanover, PA M.S.A.",447168,24
362,"Yuba City, CA M.S.A.",173299,8


Checking for missing/bad data

In [13]:
msa_pop_rates_df.isnull().sum()

MSA                                     0
Population                              0
Murder and nonnegligent manslaughter    0
dtype: int64

# Create second major dataset

### Map MSA's to Counties

In [14]:
#create msa series/list to create new dataframe
MSA_list = msa_crime_df['MSA'].dropna().copy().tolist()
MSA_list

['Abilene, TX M.S.A.',
 'Akron, OH M.S.A.',
 'Albany, GA M.S.A.',
 'Albany-Schenectady-Troy, NY M.S.A.',
 'Albuquerque, NM M.S.A.',
 'Alexandria, LA M.S.A.',
 'Allentown-Bethlehem-Easton, PA-NJ M.S.A.',
 'Altoona, PA M.S.A.',
 'Amarillo, TX M.S.A.',
 'Anchorage, AK M.S.A.',
 'Ann Arbor, MI M.S.A.',
 'Anniston-Oxford, AL M.S.A.',
 'Appleton, WI M.S.A.',
 'Asheville, NC M.S.A.',
 'Atlanta-Sandy Springs-Alpharetta, GA M.S.A.',
 'Atlantic City-Hammonton, NJ M.S.A.',
 'Auburn-Opelika, AL M.S.A',
 'Augusta-Richmond County, GA-SC M.S.A.',
 'Austin-Round Rock-Georgetown, TX M.S.A.',
 'Bakersfield, CA M.S.A.',
 'Baltimore-Columbia-Towson, MD M.S.A.',
 'Bangor, ME M.S.A.',
 'Barnstable Town, MA M.S.A.',
 'Baton Rouge, LA M.S.A.',
 'Battle Creek, MI M.S.A',
 'Bay City, MI M.S.A.',
 'Beaumont-Port Arthur, TX M.S.A.',
 'Beckley, WV M.S.A',
 'Bellingham, WA M.S.A',
 'Bend, OR M.S.A.',
 'Billings, MT M.S.A.',
 'Binghamton, NY M.S.A.',
 'Bismarck, ND M.S.A.',
 'Blacksburg-Christiansburg, VA M.S.A.',
 

In [15]:
#Here we can take a look at the counties column to see what corrections need to be made
county_list = msa_crime_df['County/City'].copy().dropna().tolist()
county_list

['Includes Callahan, Jones, and Taylor Counties',
 'City of Abilene',
 'Total area actually reporting',
 'Rate per 100,000 inhabitants',
 'Includes Portage3 and Summit Counties',
 'City of Akron',
 'Total area actually reporting',
 'Estimated total',
 'Rate per 100,000 inhabitants',
 'Includes Dougherty, Lee, Terrell, and Worth Counties',
 'City of Albany',
 'Total area actually reporting',
 'Estimated total',
 'Rate per 100,000 inhabitants',
 'Includes Albany, Rensselaer, Saratoga, Schenectady, and Schoharie Counties',
 'City of Albany',
 'City of Schenectady',
 'City of Troy',
 'Total area actually reporting',
 'Estimated total',
 'Rate per 100,000 inhabitants',
 'Includes Bernalillo, Sandoval, Torrance, and Valencia Counties',
 'City of Albuquerque2',
 'Total area actually reporting',
 'Rate per 100,000 inhabitants',
 'Includes Grant and Rapides Parishes',
 'City of Alexandria',
 'Total area actually reporting',
 'Estimated total',
 'Rate per 100,000 inhabitants',
 'Includes Warren 

Here we can see many rows not related to counties of MSA's. We can also see the remenants of superscripts that need to be removed.

In [16]:
#Here we remove any information not related to counties
# Here show that the 'Includes' values have their superscripts removed
msa_crime_df['County/City'] = msa_crime_df['County/City'].str.replace('\d+', '', regex=True)

county_list = msa_crime_df['County/City'].copy().dropna().tolist()

tmp = []
for i in county_list:
    if "City of" not in i and "Total area" not in i and "Rate per" not in i and "Estimated total" not in i:
        tmp.append(i)

county_list = tmp
county_list

['Includes Callahan, Jones, and Taylor Counties',
 'Includes Portage and Summit Counties',
 'Includes Dougherty, Lee, Terrell, and Worth Counties',
 'Includes Albany, Rensselaer, Saratoga, Schenectady, and Schoharie Counties',
 'Includes Bernalillo, Sandoval, Torrance, and Valencia Counties',
 'Includes Grant and Rapides Parishes',
 'Includes Warren County, NJ and Carbon, Lehigh, and Northampton Counties, PA',
 'Includes Blair County',
 'Includes Armstrong, Carson, Oldham, Potter, and Randall Counties',
 'Includes Anchorage Municipality and Matanuska-Susitna Borough',
 'Includes Washtenaw County',
 'Includes Calhoun County',
 'Includes Calumet and Outagamie Counties',
 'Includes Buncombe, Haywood, Henderson, and Madison Counties',
 'Includes Barrow, Bartow, Butts, Carroll, Cherokee, Clayton, Cobb, Coweta, Dawson, DeKalb, Douglas, Fayette, Forsyth, Fulton, Gwinnett, Haralson, Heard, Henry, Jasper, Lamar, Meriwether, Morgan, Newton, Paulding, Pickens, Pike, Rockdale, Spalding, and Walton

In [17]:
#Check if the number of remaining items in the list the same as the number of metropolitan areas?
print(len(MSA_list), len(county_list))

364 364


In [18]:
#Create basic dataframe from MSA_list and county_list
MSA_to_counties_cities_df = pd.DataFrame(list(zip(MSA_list, county_list)), columns = ['MSA', 'County/City'])
MSA_to_counties_cities_df

Unnamed: 0,MSA,County/City
0,"Abilene, TX M.S.A.","Includes Callahan, Jones, and Taylor Counties"
1,"Akron, OH M.S.A.",Includes Portage and Summit Counties
2,"Albany, GA M.S.A.","Includes Dougherty, Lee, Terrell, and Worth Co..."
3,"Albany-Schenectady-Troy, NY M.S.A.","Includes Albany, Rensselaer, Saratoga, Schenec..."
4,"Albuquerque, NM M.S.A.","Includes Bernalillo, Sandoval, Torrance, and V..."
...,...,...
359,"Worcester, MA-CT M.S.A.","Includes Windham County, CT and Worcester Coun..."
360,"Yakima, WA M.S.A.",Includes Yakima County
361,"York-Hanover, PA M.S.A.",Includes York County
362,"Yuba City, CA M.S.A.",Includes Sutter and Yuba Counties


### Process Counties/Cities to isolate counties and cities for each MSA

#### Single State MSA's

In [19]:
#Identify single state MSA's
singleState_countiesCities_MSA_df = MSA_to_counties_cities_df[~MSA_to_counties_cities_df['MSA'].str.contains("[A-Z][A-Z]-[A-Z][A-Z]")]
singleState_countiesCities_MSA_df['MSA'].tolist()

['Abilene, TX M.S.A.',
 'Akron, OH M.S.A.',
 'Albany, GA M.S.A.',
 'Albany-Schenectady-Troy, NY M.S.A.',
 'Albuquerque, NM M.S.A.',
 'Alexandria, LA M.S.A.',
 'Altoona, PA M.S.A.',
 'Amarillo, TX M.S.A.',
 'Anchorage, AK M.S.A.',
 'Ann Arbor, MI M.S.A.',
 'Anniston-Oxford, AL M.S.A.',
 'Appleton, WI M.S.A.',
 'Asheville, NC M.S.A.',
 'Atlanta-Sandy Springs-Alpharetta, GA M.S.A.',
 'Atlantic City-Hammonton, NJ M.S.A.',
 'Auburn-Opelika, AL M.S.A',
 'Austin-Round Rock-Georgetown, TX M.S.A.',
 'Bakersfield, CA M.S.A.',
 'Baltimore-Columbia-Towson, MD M.S.A.',
 'Bangor, ME M.S.A.',
 'Barnstable Town, MA M.S.A.',
 'Baton Rouge, LA M.S.A.',
 'Battle Creek, MI M.S.A',
 'Bay City, MI M.S.A.',
 'Beaumont-Port Arthur, TX M.S.A.',
 'Beckley, WV M.S.A',
 'Bellingham, WA M.S.A',
 'Bend, OR M.S.A.',
 'Billings, MT M.S.A.',
 'Binghamton, NY M.S.A.',
 'Bismarck, ND M.S.A.',
 'Blacksburg-Christiansburg, VA M.S.A.',
 'Bloomington, IL M.S.A.',
 'Bloomington, IN M.S.A.',
 'Bloomsburg-Berwick, PA M.S.A',
 

Now we need to map state codes to states and make a new column with states for both single state MSA's and then multi state MSA's

#### Divide between counties only and cities and process separately

##### Single State Counties only

In [20]:
#Process single state MSA
singleState_countiesCities_MSA_df

Unnamed: 0,MSA,County/City
0,"Abilene, TX M.S.A.","Includes Callahan, Jones, and Taylor Counties"
1,"Akron, OH M.S.A.",Includes Portage and Summit Counties
2,"Albany, GA M.S.A.","Includes Dougherty, Lee, Terrell, and Worth Co..."
3,"Albany-Schenectady-Troy, NY M.S.A.","Includes Albany, Rensselaer, Saratoga, Schenec..."
4,"Albuquerque, NM M.S.A.","Includes Bernalillo, Sandoval, Torrance, and V..."
...,...,...
357,"Wilmington, NC M.S.A.",Includes New Hanover and Pender Counties
360,"Yakima, WA M.S.A.",Includes Yakima County
361,"York-Hanover, PA M.S.A.",Includes York County
362,"Yuba City, CA M.S.A.",Includes Sutter and Yuba Counties


We need to find the rows with cities first. Then we can identify the county-only rows.

In [21]:
#Searching for keyword 'city'
singleState_cities_MSA_df = singleState_countiesCities_MSA_df[singleState_countiesCities_MSA_df['County/City'].str.contains("City")]
singleState_cities_MSA_df

Unnamed: 0,MSA,County/City
20,"Baltimore-Columbia-Towson, MD M.S.A.","Includes Anne Arundel, Baltimore, Carroll, Har..."
33,"Blacksburg-Christiansburg, VA M.S.A.","Includes Giles, Montgomery, and Pulaski Counti..."
54,"Carson City, NV M.S.A.",Includes Carson City
61,"Charlottesville, VA M.S.A.","Includes Albemarle, Buckingham, Fluvanna, Gree..."
140,"Harrisonburg, VA M.S.A.",Includes Rockingham County and Harrisonburg City
194,"Lynchburg, VA M.S.A.","Includes Amherst, Appomattox, Bedford, and Cam..."
272,"Richmond, VA M.S.A.","Includes Amelia, Charles City, Chesterfield, D..."
288,"San Francisco-Oakland-Berkeley, CA M.S.A.",Includes the Metropolitan Divisions of Oakland...


In [22]:
#Searching for keyword 'cities'
singleState_cities_MSA_df2 = singleState_countiesCities_MSA_df[singleState_countiesCities_MSA_df['County/City'].str.contains("Cities")]
singleState_cities_MSA_df2

Unnamed: 0,MSA,County/City
272,"Richmond, VA M.S.A.","Includes Amelia, Charles City, Chesterfield, D..."
274,"Roanoke, VA M.S.A.","Includes Botetourt, Craig, Franklin, and Roano..."
316,"Staunton, VA M.S.A.",Includes Augusta County and Staunton and Wayne...


In [23]:
#Join the two 'city' dataframes to get all non county-only entries
singleState_cities_MSA_df = pd.concat([singleState_cities_MSA_df, singleState_cities_MSA_df2]).drop_duplicates()
singleState_cities_MSA_df

Unnamed: 0,MSA,County/City
20,"Baltimore-Columbia-Towson, MD M.S.A.","Includes Anne Arundel, Baltimore, Carroll, Har..."
33,"Blacksburg-Christiansburg, VA M.S.A.","Includes Giles, Montgomery, and Pulaski Counti..."
54,"Carson City, NV M.S.A.",Includes Carson City
61,"Charlottesville, VA M.S.A.","Includes Albemarle, Buckingham, Fluvanna, Gree..."
140,"Harrisonburg, VA M.S.A.",Includes Rockingham County and Harrisonburg City
194,"Lynchburg, VA M.S.A.","Includes Amherst, Appomattox, Bedford, and Cam..."
272,"Richmond, VA M.S.A.","Includes Amelia, Charles City, Chesterfield, D..."
288,"San Francisco-Oakland-Berkeley, CA M.S.A.",Includes the Metropolitan Divisions of Oakland...
274,"Roanoke, VA M.S.A.","Includes Botetourt, Craig, Franklin, and Roano..."
316,"Staunton, VA M.S.A.",Includes Augusta County and Staunton and Wayne...


In [24]:
#Get a dataframe with just the single state counties
cities_df = singleState_countiesCities_MSA_df.index.isin([20,33,54,61,140,194,272,274,288,316])

singleState_counties_df = singleState_countiesCities_MSA_df[~cities_df]
singleState_counties_df

Unnamed: 0,MSA,County/City
0,"Abilene, TX M.S.A.","Includes Callahan, Jones, and Taylor Counties"
1,"Akron, OH M.S.A.",Includes Portage and Summit Counties
2,"Albany, GA M.S.A.","Includes Dougherty, Lee, Terrell, and Worth Co..."
3,"Albany-Schenectady-Troy, NY M.S.A.","Includes Albany, Rensselaer, Saratoga, Schenec..."
4,"Albuquerque, NM M.S.A.","Includes Bernalillo, Sandoval, Torrance, and V..."
...,...,...
357,"Wilmington, NC M.S.A.",Includes New Hanover and Pender Counties
360,"Yakima, WA M.S.A.",Includes Yakima County
361,"York-Hanover, PA M.S.A.",Includes York County
362,"Yuba City, CA M.S.A.",Includes Sutter and Yuba Counties


In [25]:
singleState_counties_df['County/City'] = singleState_counties_df['County/City'].str.replace("Includes", "")
singleState_counties_df['County/City'] = singleState_counties_df['County/City'].str.replace("Counties", "")
singleState_counties_df['County/City'] = singleState_counties_df['County/City'].str.replace("County", "")
singleState_counties_df['County/City'] = singleState_counties_df['County/City'].str.replace(" and ", ",")

singleState_counties_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  singleState_counties_df['County/City'] = singleState_counties_df['County/City'].str.replace("Includes", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  singleState_counties_df['County/City'] = singleState_counties_df['County/City'].str.replace("Counties", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versu

Unnamed: 0,MSA,County/City
0,"Abilene, TX M.S.A.","Callahan, Jones,,Taylor"
1,"Akron, OH M.S.A.","Portage,Summit"
2,"Albany, GA M.S.A.","Dougherty, Lee, Terrell,,Worth"
3,"Albany-Schenectady-Troy, NY M.S.A.","Albany, Rensselaer, Saratoga, Schenectady,,Sc..."
4,"Albuquerque, NM M.S.A.","Bernalillo, Sandoval, Torrance,,Valencia"
...,...,...
357,"Wilmington, NC M.S.A.","New Hanover,Pender"
360,"Yakima, WA M.S.A.",Yakima
361,"York-Hanover, PA M.S.A.",York
362,"Yuba City, CA M.S.A.","Sutter,Yuba"


In [26]:
singleState_counties_list = singleState_counties_df['County/City'].tolist()
singleState_counties_list

[' Callahan, Jones,,Taylor ',
 ' Portage,Summit ',
 ' Dougherty, Lee, Terrell,,Worth ',
 ' Albany, Rensselaer, Saratoga, Schenectady,,Schoharie ',
 ' Bernalillo, Sandoval, Torrance,,Valencia ',
 ' Grant,Rapides Parishes',
 ' Blair ',
 ' Armstrong, Carson, Oldham, Potter,,Randall ',
 ' Anchorage Municipality,Matanuska-Susitna Borough',
 ' Washtenaw ',
 ' Calhoun ',
 ' Calumet,Outagamie ',
 ' Buncombe, Haywood, Henderson,,Madison ',
 ' Barrow, Bartow, Butts, Carroll, Cherokee, Clayton, Cobb, Coweta, Dawson, DeKalb, Douglas, Fayette, Forsyth, Fulton, Gwinnett, Haralson, Heard, Henry, Jasper, Lamar, Meriwether, Morgan, Newton, Paulding, Pickens, Pike, Rockdale, Spalding,,Walton ',
 ' Atlantic ',
 ' Lee ',
 ' Bastrop, Caldwell, Hays, Travis,,Williamson ',
 ' Kern ',
 ' Penobscot ',
 ' Barnstable ',
 ' Ascension, Assumption, East Baton Rouge, East Feliciana, Iberville, Livingston, Pointe Coupee, St. Helena, West Baton Rouge,,West Feliciana Parishes',
 ' Calhoun ',
 ' Bay ',
 ' Hardin, Jeffer

So now for every entry in countiesOnlyList, put each list item in a new list as a list of items and get rid of empty values

In [27]:
#Here we clean singleState_counties_list and put all the counties into a lists within a list

i = 0
lenghtOfList = len(singleState_counties_list)

while i < lenghtOfList:
    #Convert list entries into lists themselves split by ',' and remove empty entries
    singleState_counties_list[i] = singleState_counties_list[i].split(",") 
    singleState_counties_list[i] = list(filter(None, singleState_counties_list[i]))
    
    #remove whitespaces before and after individual county names
    n = 0
    while n < len(singleState_counties_list[i]):
        singleState_counties_list[i][n] = singleState_counties_list[i][n].strip()
        n = n + 1
        
    i = i + 1
    
singleState_counties_list

[['Callahan', 'Jones', 'Taylor'],
 ['Portage', 'Summit'],
 ['Dougherty', 'Lee', 'Terrell', 'Worth'],
 ['Albany', 'Rensselaer', 'Saratoga', 'Schenectady', 'Schoharie'],
 ['Bernalillo', 'Sandoval', 'Torrance', 'Valencia'],
 ['Grant', 'Rapides Parishes'],
 ['Blair'],
 ['Armstrong', 'Carson', 'Oldham', 'Potter', 'Randall'],
 ['Anchorage Municipality', 'Matanuska-Susitna Borough'],
 ['Washtenaw'],
 ['Calhoun'],
 ['Calumet', 'Outagamie'],
 ['Buncombe', 'Haywood', 'Henderson', 'Madison'],
 ['Barrow',
  'Bartow',
  'Butts',
  'Carroll',
  'Cherokee',
  'Clayton',
  'Cobb',
  'Coweta',
  'Dawson',
  'DeKalb',
  'Douglas',
  'Fayette',
  'Forsyth',
  'Fulton',
  'Gwinnett',
  'Haralson',
  'Heard',
  'Henry',
  'Jasper',
  'Lamar',
  'Meriwether',
  'Morgan',
  'Newton',
  'Paulding',
  'Pickens',
  'Pike',
  'Rockdale',
  'Spalding',
  'Walton'],
 ['Atlantic'],
 ['Lee'],
 ['Bastrop', 'Caldwell', 'Hays', 'Travis', 'Williamson'],
 ['Kern'],
 ['Penobscot'],
 ['Barnstable'],
 ['Ascension',
  'Assum

In [28]:
singleState_counties_MSA_list = singleState_counties_df['MSA'].tolist()
singleState_counties_MSA_list

['Abilene, TX M.S.A.',
 'Akron, OH M.S.A.',
 'Albany, GA M.S.A.',
 'Albany-Schenectady-Troy, NY M.S.A.',
 'Albuquerque, NM M.S.A.',
 'Alexandria, LA M.S.A.',
 'Altoona, PA M.S.A.',
 'Amarillo, TX M.S.A.',
 'Anchorage, AK M.S.A.',
 'Ann Arbor, MI M.S.A.',
 'Anniston-Oxford, AL M.S.A.',
 'Appleton, WI M.S.A.',
 'Asheville, NC M.S.A.',
 'Atlanta-Sandy Springs-Alpharetta, GA M.S.A.',
 'Atlantic City-Hammonton, NJ M.S.A.',
 'Auburn-Opelika, AL M.S.A',
 'Austin-Round Rock-Georgetown, TX M.S.A.',
 'Bakersfield, CA M.S.A.',
 'Bangor, ME M.S.A.',
 'Barnstable Town, MA M.S.A.',
 'Baton Rouge, LA M.S.A.',
 'Battle Creek, MI M.S.A',
 'Bay City, MI M.S.A.',
 'Beaumont-Port Arthur, TX M.S.A.',
 'Beckley, WV M.S.A',
 'Bellingham, WA M.S.A',
 'Bend, OR M.S.A.',
 'Billings, MT M.S.A.',
 'Binghamton, NY M.S.A.',
 'Bismarck, ND M.S.A.',
 'Bloomington, IL M.S.A.',
 'Bloomington, IN M.S.A.',
 'Bloomsburg-Berwick, PA M.S.A',
 'Boise City, ID M.S.A.',
 'Boston, MA M.D.',
 'Cambridge-Newton-Framingham, MA M.D

In [29]:
print(len(singleState_counties_list), len(singleState_counties_MSA_list))

314 314


In [30]:
#We create two lists with corresponding MSA's and counties

singleState_counties_MSA_list_processed = []
singleState_counties_list_processed = []

i = 0
lenghtOfList = len(singleState_counties_MSA_list)

while i < lenghtOfList:
    n = 0
    while n < len(singleState_counties_list[i]):
        singleState_counties_MSA_list_processed.append(singleState_counties_MSA_list[i])
        singleState_counties_list_processed.append(singleState_counties_list[i][n])
        n = n + 1
        
    i = i + 1


In [31]:
#Create dataframe that joins MSA and individual Counties for single state counties only
singleState_counties_df = pd.DataFrame(columns = ['MSA', 'County/City'])

singleState_counties_df['MSA'] = pd.Series(singleState_counties_MSA_list_processed)

singleState_counties_df['County/City'] = pd.Series(singleState_counties_list_processed)

singleState_counties_df

Unnamed: 0,MSA,County/City
0,"Abilene, TX M.S.A.",Callahan
1,"Abilene, TX M.S.A.",Jones
2,"Abilene, TX M.S.A.",Taylor
3,"Akron, OH M.S.A.",Portage
4,"Akron, OH M.S.A.",Summit
...,...,...
739,"Yakima, WA M.S.A.",Yakima
740,"York-Hanover, PA M.S.A.",York
741,"Yuba City, CA M.S.A.",Sutter
742,"Yuba City, CA M.S.A.",Yuba


In [32]:
#Add 'County' to each county'
singleState_counties_df['County/City'] = singleState_counties_df['County/City'].astype(str) + ' County'
singleState_counties_df

Unnamed: 0,MSA,County/City
0,"Abilene, TX M.S.A.",Callahan County
1,"Abilene, TX M.S.A.",Jones County
2,"Abilene, TX M.S.A.",Taylor County
3,"Akron, OH M.S.A.",Portage County
4,"Akron, OH M.S.A.",Summit County
...,...,...
739,"Yakima, WA M.S.A.",Yakima County
740,"York-Hanover, PA M.S.A.",York County
741,"Yuba City, CA M.S.A.",Sutter County
742,"Yuba City, CA M.S.A.",Yuba County


### Now get states

In [33]:
#Store state codes for singleStates 
state_code_list = []

for MSA_address in singleState_counties_MSA_list_processed:
    state_code = re.findall("[A-Z][A-Z]", MSA_address)
    state_code_list.append(state_code[0])
    
state_code_list

['TX',
 'TX',
 'TX',
 'OH',
 'OH',
 'GA',
 'GA',
 'GA',
 'GA',
 'NY',
 'NY',
 'NY',
 'NY',
 'NY',
 'NM',
 'NM',
 'NM',
 'NM',
 'LA',
 'LA',
 'PA',
 'TX',
 'TX',
 'TX',
 'TX',
 'TX',
 'AK',
 'AK',
 'MI',
 'AL',
 'WI',
 'WI',
 'NC',
 'NC',
 'NC',
 'NC',
 'GA',
 'GA',
 'GA',
 'GA',
 'GA',
 'GA',
 'GA',
 'GA',
 'GA',
 'GA',
 'GA',
 'GA',
 'GA',
 'GA',
 'GA',
 'GA',
 'GA',
 'GA',
 'GA',
 'GA',
 'GA',
 'GA',
 'GA',
 'GA',
 'GA',
 'GA',
 'GA',
 'GA',
 'GA',
 'NJ',
 'AL',
 'TX',
 'TX',
 'TX',
 'TX',
 'TX',
 'CA',
 'ME',
 'MA',
 'LA',
 'LA',
 'LA',
 'LA',
 'LA',
 'LA',
 'LA',
 'LA',
 'LA',
 'LA',
 'MI',
 'MI',
 'TX',
 'TX',
 'TX',
 'WV',
 'WV',
 'WA',
 'OR',
 'MT',
 'MT',
 'MT',
 'NY',
 'NY',
 'ND',
 'ND',
 'ND',
 'IL',
 'IN',
 'IN',
 'PA',
 'PA',
 'ID',
 'ID',
 'ID',
 'ID',
 'ID',
 'MA',
 'MA',
 'MA',
 'MA',
 'MA',
 'NH',
 'NH',
 'CO',
 'KY',
 'KY',
 'KY',
 'KY',
 'CT',
 'TX',
 'GA',
 'GA',
 'GA',
 'NY',
 'NY',
 'NC',
 'VT',
 'VT',
 'VT',
 'MD',
 'OH',
 'OH',
 'FL',
 'WY',
 'PA',
 'IL',
 'IL',

In [34]:
singleState_counties_df['state code'] = pd.Series(state_code_list)
singleState_counties_df

Unnamed: 0,MSA,County/City,state code
0,"Abilene, TX M.S.A.",Callahan County,TX
1,"Abilene, TX M.S.A.",Jones County,TX
2,"Abilene, TX M.S.A.",Taylor County,TX
3,"Akron, OH M.S.A.",Portage County,OH
4,"Akron, OH M.S.A.",Summit County,OH
...,...,...,...
739,"Yakima, WA M.S.A.",Yakima County,WA
740,"York-Hanover, PA M.S.A.",York County,PA
741,"Yuba City, CA M.S.A.",Sutter County,CA
742,"Yuba City, CA M.S.A.",Yuba County,CA


In [35]:
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}

"""
Code Reference
Roger Allen (Mar 31 2021).Github Gist. 
Available at: https://gist.github.com/rogerallen/1583593 (Accessed 8/4/21).
"""




'\nCode Reference\nRoger Allen (Mar 31 2021).Github Gist. \nAvailable at: https://gist.github.com/rogerallen/1583593 (Accessed 8/4/21).\n'

In [36]:
#We need to invert this dictionary
state_dict = dict(map(reversed, us_state_abbrev.items()))
state_dict

{'AL': 'Alabama',
 'AK': 'Alaska',
 'AS': 'American Samoa',
 'AZ': 'Arizona',
 'AR': 'Arkansas',
 'CA': 'California',
 'CO': 'Colorado',
 'CT': 'Connecticut',
 'DE': 'Delaware',
 'DC': 'District of Columbia',
 'FL': 'Florida',
 'GA': 'Georgia',
 'GU': 'Guam',
 'HI': 'Hawaii',
 'ID': 'Idaho',
 'IL': 'Illinois',
 'IN': 'Indiana',
 'IA': 'Iowa',
 'KS': 'Kansas',
 'KY': 'Kentucky',
 'LA': 'Louisiana',
 'ME': 'Maine',
 'MD': 'Maryland',
 'MA': 'Massachusetts',
 'MI': 'Michigan',
 'MN': 'Minnesota',
 'MS': 'Mississippi',
 'MO': 'Missouri',
 'MT': 'Montana',
 'NE': 'Nebraska',
 'NV': 'Nevada',
 'NH': 'New Hampshire',
 'NJ': 'New Jersey',
 'NM': 'New Mexico',
 'NY': 'New York',
 'NC': 'North Carolina',
 'ND': 'North Dakota',
 'MP': 'Northern Mariana Islands',
 'OH': 'Ohio',
 'OK': 'Oklahoma',
 'OR': 'Oregon',
 'PA': 'Pennsylvania',
 'PR': 'Puerto Rico',
 'RI': 'Rhode Island',
 'SC': 'South Carolina',
 'SD': 'South Dakota',
 'TN': 'Tennessee',
 'TX': 'Texas',
 'UT': 'Utah',
 'VT': 'Vermont',
 '

In [37]:
state_list = []

for state_code in state_code_list:
    state = state_dict[state_code]
    state_list.append(state)
    
state_list

['Texas',
 'Texas',
 'Texas',
 'Ohio',
 'Ohio',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georgia',
 'New York',
 'New York',
 'New York',
 'New York',
 'New York',
 'New Mexico',
 'New Mexico',
 'New Mexico',
 'New Mexico',
 'Louisiana',
 'Louisiana',
 'Pennsylvania',
 'Texas',
 'Texas',
 'Texas',
 'Texas',
 'Texas',
 'Alaska',
 'Alaska',
 'Michigan',
 'Alabama',
 'Wisconsin',
 'Wisconsin',
 'North Carolina',
 'North Carolina',
 'North Carolina',
 'North Carolina',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georgia',
 'New Jersey',
 'Alabama',
 'Texas',
 'Texas',
 'Texas',
 'Texas',
 'Texas',
 'California',
 'Maine',
 'Massachusetts',
 'Louisiana',
 'Louisiana',
 'Louisiana',
 'Louisiana',
 'Louisia

In [38]:
singleState_counties_df['State'] = pd.Series(state_list)
singleState_counties_df = singleState_counties_df.drop(['state code'], axis=1)

singleState_counties_df

Unnamed: 0,MSA,County/City,State
0,"Abilene, TX M.S.A.",Callahan County,Texas
1,"Abilene, TX M.S.A.",Jones County,Texas
2,"Abilene, TX M.S.A.",Taylor County,Texas
3,"Akron, OH M.S.A.",Portage County,Ohio
4,"Akron, OH M.S.A.",Summit County,Ohio
...,...,...,...
739,"Yakima, WA M.S.A.",Yakima County,Washington
740,"York-Hanover, PA M.S.A.",York County,Pennsylvania
741,"Yuba City, CA M.S.A.",Sutter County,California
742,"Yuba City, CA M.S.A.",Yuba County,California


#### Single State Cities

In [39]:
singleState_cities_MSA_df

Unnamed: 0,MSA,County/City
20,"Baltimore-Columbia-Towson, MD M.S.A.","Includes Anne Arundel, Baltimore, Carroll, Har..."
33,"Blacksburg-Christiansburg, VA M.S.A.","Includes Giles, Montgomery, and Pulaski Counti..."
54,"Carson City, NV M.S.A.",Includes Carson City
61,"Charlottesville, VA M.S.A.","Includes Albemarle, Buckingham, Fluvanna, Gree..."
140,"Harrisonburg, VA M.S.A.",Includes Rockingham County and Harrisonburg City
194,"Lynchburg, VA M.S.A.","Includes Amherst, Appomattox, Bedford, and Cam..."
272,"Richmond, VA M.S.A.","Includes Amelia, Charles City, Chesterfield, D..."
288,"San Francisco-Oakland-Berkeley, CA M.S.A.",Includes the Metropolitan Divisions of Oakland...
274,"Roanoke, VA M.S.A.","Includes Botetourt, Craig, Franklin, and Roano..."
316,"Staunton, VA M.S.A.",Includes Augusta County and Staunton and Wayne...


In [40]:
singleState_cities_list = singleState_cities_MSA_df['County/City'].tolist()
singleState_cities_list

["Includes Anne Arundel, Baltimore, Carroll, Harford, Howard, and Queen Anne's Counties and Baltimore City",
 'Includes Giles, Montgomery, and Pulaski Counties and Radford City',
 'Includes Carson City',
 'Includes Albemarle, Buckingham, Fluvanna, Greene, and Nelson Counties and Charlottesville City',
 'Includes Rockingham County and Harrisonburg City',
 'Includes Amherst, Appomattox, Bedford, and Campbell Counties and Lynchburg City',
 'Includes Amelia, Charles City, Chesterfield, Dinwiddie, Goochland, Hanover, Henrico, King and Queen, King William, New Kent, Powhatan, Prince George, and Sussex Counties and Colonial Heights, Hopewell, Petersburg, and Richmond Cities',
 'Includes the Metropolitan Divisions of Oakland-Berkeley-Livermore, San Francisco-San Mateo-Redwood City, and San Rafael',
 'Includes Botetourt, Craig, Franklin, and Roanoke Counties and Roanoke and Salem Cities',
 'Includes Augusta County and Staunton and Waynesboro Cities ']

In [41]:
singleState_cities_MSA_df['County/City'] = singleState_cities_MSA_df['County/City'].str.replace("Includes", "")
singleState_cities_MSA_df['County/City'] = singleState_cities_MSA_df['County/City'].str.replace(" and ", ",")
singleState_cities_MSA_df['County/City'].tolist()

[" Anne Arundel, Baltimore, Carroll, Harford, Howard,,Queen Anne's Counties,Baltimore City",
 ' Giles, Montgomery,,Pulaski Counties,Radford City',
 ' Carson City',
 ' Albemarle, Buckingham, Fluvanna, Greene,,Nelson Counties,Charlottesville City',
 ' Rockingham County,Harrisonburg City',
 ' Amherst, Appomattox, Bedford,,Campbell Counties,Lynchburg City',
 ' Amelia, Charles City, Chesterfield, Dinwiddie, Goochland, Hanover, Henrico, King,Queen, King William, New Kent, Powhatan, Prince George,,Sussex Counties,Colonial Heights, Hopewell, Petersburg,,Richmond Cities',
 ' the Metropolitan Divisions of Oakland-Berkeley-Livermore, San Francisco-San Mateo-Redwood City,,San Rafael',
 ' Botetourt, Craig, Franklin,,Roanoke Counties,Roanoke,Salem Cities',
 ' Augusta County,Staunton,Waynesboro Cities ']

In [42]:
singleState_cities_MSA_list = singleState_cities_MSA_df['MSA'].tolist()
singleState_cities_list = singleState_cities_MSA_df['County/City'].tolist()
singleState_cities_list

[" Anne Arundel, Baltimore, Carroll, Harford, Howard,,Queen Anne's Counties,Baltimore City",
 ' Giles, Montgomery,,Pulaski Counties,Radford City',
 ' Carson City',
 ' Albemarle, Buckingham, Fluvanna, Greene,,Nelson Counties,Charlottesville City',
 ' Rockingham County,Harrisonburg City',
 ' Amherst, Appomattox, Bedford,,Campbell Counties,Lynchburg City',
 ' Amelia, Charles City, Chesterfield, Dinwiddie, Goochland, Hanover, Henrico, King,Queen, King William, New Kent, Powhatan, Prince George,,Sussex Counties,Colonial Heights, Hopewell, Petersburg,,Richmond Cities',
 ' the Metropolitan Divisions of Oakland-Berkeley-Livermore, San Francisco-San Mateo-Redwood City,,San Rafael',
 ' Botetourt, Craig, Franklin,,Roanoke Counties,Roanoke,Salem Cities',
 ' Augusta County,Staunton,Waynesboro Cities ']

In [43]:
#Here we clean singleState_cities_list and put all the counties and cities into a lists within a list

i = 0
lenghtOfList = len(singleState_cities_list)

while i < lenghtOfList:
    #Convert list entries into lists themselves split by ',' and remove empty entries
    singleState_cities_list[i] = singleState_cities_list[i].split(",") 
    singleState_cities_list[i] = list(filter(None, singleState_cities_list[i]))
    
    #remove whitespaces before and after individual county names
    n = 0
    while n < len(singleState_cities_list[i]):
        singleState_cities_list[i][n] = singleState_cities_list[i][n].strip()
        n = n + 1
        
    i = i + 1
    
singleState_cities_list

[['Anne Arundel',
  'Baltimore',
  'Carroll',
  'Harford',
  'Howard',
  "Queen Anne's Counties",
  'Baltimore City'],
 ['Giles', 'Montgomery', 'Pulaski Counties', 'Radford City'],
 ['Carson City'],
 ['Albemarle',
  'Buckingham',
  'Fluvanna',
  'Greene',
  'Nelson Counties',
  'Charlottesville City'],
 ['Rockingham County', 'Harrisonburg City'],
 ['Amherst', 'Appomattox', 'Bedford', 'Campbell Counties', 'Lynchburg City'],
 ['Amelia',
  'Charles City',
  'Chesterfield',
  'Dinwiddie',
  'Goochland',
  'Hanover',
  'Henrico',
  'King',
  'Queen',
  'King William',
  'New Kent',
  'Powhatan',
  'Prince George',
  'Sussex Counties',
  'Colonial Heights',
  'Hopewell',
  'Petersburg',
  'Richmond Cities'],
 ['the Metropolitan Divisions of Oakland-Berkeley-Livermore',
  'San Francisco-San Mateo-Redwood City',
  'San Rafael'],
 ['Botetourt',
  'Craig',
  'Franklin',
  'Roanoke Counties',
  'Roanoke',
  'Salem Cities'],
 ['Augusta County', 'Staunton', 'Waynesboro Cities']]

In [44]:
#Now have to remove 'counties' text by putting 'county' by every individual county

singleState_cities_list

for item in singleState_cities_list:
    i = 0
    count = 0
    while i < len(item):
        if 'Counties' in item[i]:
            count = i + 1
            pos = item[i].find('Counties')
            item[i] = item[i][0: pos - 1]
            
            for n in range(count):
                item[n] = item[n] + ' County'
                
        if 'Cities' in item[i]:
            pos = item[i].find('Cities')
            item[i] = item[i][0: pos - 1]
            for n in range(count, i+1):
                if 'County' not in item[n]:
                    item[n] = item[n] + ' City'
        
        i = i + 1

singleState_cities_list

[['Anne Arundel County',
  'Baltimore County',
  'Carroll County',
  'Harford County',
  'Howard County',
  "Queen Anne's County",
  'Baltimore City'],
 ['Giles County', 'Montgomery County', 'Pulaski County', 'Radford City'],
 ['Carson City'],
 ['Albemarle County',
  'Buckingham County',
  'Fluvanna County',
  'Greene County',
  'Nelson County',
  'Charlottesville City'],
 ['Rockingham County', 'Harrisonburg City'],
 ['Amherst County',
  'Appomattox County',
  'Bedford County',
  'Campbell County',
  'Lynchburg City'],
 ['Amelia County',
  'Charles City County',
  'Chesterfield County',
  'Dinwiddie County',
  'Goochland County',
  'Hanover County',
  'Henrico County',
  'King County',
  'Queen County',
  'King William County',
  'New Kent County',
  'Powhatan County',
  'Prince George County',
  'Sussex County',
  'Colonial Heights City',
  'Hopewell City',
  'Petersburg City',
  'Richmond City'],
 ['the Metropolitan Divisions of Oakland-Berkeley-Livermore',
  'San Francisco-San Mateo

In [45]:
#We create two lists with corresponding MSA's and counties

singleState_cities_MSA_list_processed = []
singleState_cities_list_processed = []

i = 0
lenghtOfList = len(singleState_cities_MSA_list)

while i < lenghtOfList:
    n = 0
    while n < len(singleState_cities_list[i]):
        singleState_cities_MSA_list_processed.append(singleState_cities_MSA_list[i])
        singleState_cities_list_processed.append(singleState_cities_list[i][n])
        n = n + 1
        
    i = i + 1

In [46]:
singleState_cities_list_processed

['Anne Arundel County',
 'Baltimore County',
 'Carroll County',
 'Harford County',
 'Howard County',
 "Queen Anne's County",
 'Baltimore City',
 'Giles County',
 'Montgomery County',
 'Pulaski County',
 'Radford City',
 'Carson City',
 'Albemarle County',
 'Buckingham County',
 'Fluvanna County',
 'Greene County',
 'Nelson County',
 'Charlottesville City',
 'Rockingham County',
 'Harrisonburg City',
 'Amherst County',
 'Appomattox County',
 'Bedford County',
 'Campbell County',
 'Lynchburg City',
 'Amelia County',
 'Charles City County',
 'Chesterfield County',
 'Dinwiddie County',
 'Goochland County',
 'Hanover County',
 'Henrico County',
 'King County',
 'Queen County',
 'King William County',
 'New Kent County',
 'Powhatan County',
 'Prince George County',
 'Sussex County',
 'Colonial Heights City',
 'Hopewell City',
 'Petersburg City',
 'Richmond City',
 'the Metropolitan Divisions of Oakland-Berkeley-Livermore',
 'San Francisco-San Mateo-Redwood City',
 'San Rafael',
 'Botetourt C

In [47]:
#Create dataframe that joins MSA and individual Counties for single state counties only
singleState_cities_df = pd.DataFrame(columns = ['MSA', 'County/City'])

singleState_cities_df['MSA'] = pd.Series(singleState_cities_MSA_list_processed)

singleState_cities_df['County/City'] = pd.Series(singleState_cities_list_processed)

singleState_cities_df

Unnamed: 0,MSA,County/City
0,"Baltimore-Columbia-Towson, MD M.S.A.",Anne Arundel County
1,"Baltimore-Columbia-Towson, MD M.S.A.",Baltimore County
2,"Baltimore-Columbia-Towson, MD M.S.A.",Carroll County
3,"Baltimore-Columbia-Towson, MD M.S.A.",Harford County
4,"Baltimore-Columbia-Towson, MD M.S.A.",Howard County
5,"Baltimore-Columbia-Towson, MD M.S.A.",Queen Anne's County
6,"Baltimore-Columbia-Towson, MD M.S.A.",Baltimore City
7,"Blacksburg-Christiansburg, VA M.S.A.",Giles County
8,"Blacksburg-Christiansburg, VA M.S.A.",Montgomery County
9,"Blacksburg-Christiansburg, VA M.S.A.",Pulaski County


In [48]:
#Store state codes for singleStates 
state_code_list = []

for MSA_address in singleState_cities_MSA_list_processed:
    state_code = re.findall("[A-Z][A-Z]", MSA_address)
    state_code_list.append(state_code[0])
    
state_code_list

['MD',
 'MD',
 'MD',
 'MD',
 'MD',
 'MD',
 'MD',
 'VA',
 'VA',
 'VA',
 'VA',
 'NV',
 'VA',
 'VA',
 'VA',
 'VA',
 'VA',
 'VA',
 'VA',
 'VA',
 'VA',
 'VA',
 'VA',
 'VA',
 'VA',
 'VA',
 'VA',
 'VA',
 'VA',
 'VA',
 'VA',
 'VA',
 'VA',
 'VA',
 'VA',
 'VA',
 'VA',
 'VA',
 'VA',
 'VA',
 'VA',
 'VA',
 'VA',
 'CA',
 'CA',
 'CA',
 'VA',
 'VA',
 'VA',
 'VA',
 'VA',
 'VA',
 'VA',
 'VA',
 'VA']

In [49]:
singleState_cities_df['state code'] = pd.Series(state_code_list)
singleState_cities_df

Unnamed: 0,MSA,County/City,state code
0,"Baltimore-Columbia-Towson, MD M.S.A.",Anne Arundel County,MD
1,"Baltimore-Columbia-Towson, MD M.S.A.",Baltimore County,MD
2,"Baltimore-Columbia-Towson, MD M.S.A.",Carroll County,MD
3,"Baltimore-Columbia-Towson, MD M.S.A.",Harford County,MD
4,"Baltimore-Columbia-Towson, MD M.S.A.",Howard County,MD
5,"Baltimore-Columbia-Towson, MD M.S.A.",Queen Anne's County,MD
6,"Baltimore-Columbia-Towson, MD M.S.A.",Baltimore City,MD
7,"Blacksburg-Christiansburg, VA M.S.A.",Giles County,VA
8,"Blacksburg-Christiansburg, VA M.S.A.",Montgomery County,VA
9,"Blacksburg-Christiansburg, VA M.S.A.",Pulaski County,VA


In [50]:
state_list = []

for state_code in state_code_list:
    state = state_dict[state_code]
    state_list.append(state)
    
state_list

['Maryland',
 'Maryland',
 'Maryland',
 'Maryland',
 'Maryland',
 'Maryland',
 'Maryland',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'Nevada',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'California',
 'California',
 'California',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia',
 'Virginia']

In [51]:
singleState_cities_df['State'] = pd.Series(state_list)
singleState_cities_df = singleState_cities_df.drop(['state code'], axis=1)
singleState_cities_df

Unnamed: 0,MSA,County/City,State
0,"Baltimore-Columbia-Towson, MD M.S.A.",Anne Arundel County,Maryland
1,"Baltimore-Columbia-Towson, MD M.S.A.",Baltimore County,Maryland
2,"Baltimore-Columbia-Towson, MD M.S.A.",Carroll County,Maryland
3,"Baltimore-Columbia-Towson, MD M.S.A.",Harford County,Maryland
4,"Baltimore-Columbia-Towson, MD M.S.A.",Howard County,Maryland
5,"Baltimore-Columbia-Towson, MD M.S.A.",Queen Anne's County,Maryland
6,"Baltimore-Columbia-Towson, MD M.S.A.",Baltimore City,Maryland
7,"Blacksburg-Christiansburg, VA M.S.A.",Giles County,Virginia
8,"Blacksburg-Christiansburg, VA M.S.A.",Montgomery County,Virginia
9,"Blacksburg-Christiansburg, VA M.S.A.",Pulaski County,Virginia


### Multi States

In [52]:
multiState_countiesCities_MSA_df = MSA_to_counties_cities_df[MSA_to_counties_cities_df['MSA'].str.contains("[A-Z][A-Z]-[A-Z][A-Z]")]
multiState_MSA_list = multiState_countiesCities_MSA_df['MSA'].tolist()
multiState_MSA_list

['Allentown-Bethlehem-Easton, PA-NJ M.S.A.',
 'Augusta-Richmond County, GA-SC M.S.A.',
 'Boston-Cambridge-Newton, MA-NH M.S.A.',
 'Cape Girardeau, MO-IL M.S.A.',
 'Charlotte-Concord-Gastonia, NC-SC M.S.A.',
 'Chattanooga, TN-GA M.S.A.',
 'Chicago-Naperville-Elgin, IL-IN-WI M.S.A., ',
 'Lake County-Kenosha County, IL-WI M.D., ',
 'Cincinnati, OH-KY-IN M.S.A.',
 'Clarksville, TN-KY M.S.A.',
 'Cumberland, MD-WV M.S.A.',
 'Duluth, MN-WI M.S.A.',
 'Evansville, IN-KY M.S.A.',
 'Fargo, ND-MN M.S.A.',
 'Fort Smith, AR-OK M.S.A.',
 'Grand Forks, ND-MN M.S.A.',
 'Hagerstown-Martinsburg, MD-WV M.S.A.',
 'Kingsport-Bristol, TN-VA M.S.A.',
 'La Crosse-Onalaska, WI-MN M.S.A.',
 'Lewiston, ID-WA M.S.A.',
 'Logan, UT-ID M.S.A.',
 'Memphis, TN-MS-AR M.S.A., ',
 'Minneapolis-St. Paul-Bloomington, MN-WI M.S.A.',
 'Myrtle Beach-Conway-North Myrtle Beach, SC-NC M.S.A., ',
 'Philadelphia-Camden-Wilmington, PA-NJ-DE-MD M.S.A.',
 'Wilmington, DE-MD-NJ M.D.',
 'Portland-Vancouver-Hillsboro, OR-WA M.S.A.',
 'Pr

#### Mulit-state counties only

In [53]:
multiState_countiesCities_MSA_df['County/City'] = multiState_countiesCities_MSA_df['County/City'].str.replace("Includes", "")
multiState_countiesCities_MSA_df['County/City'] = multiState_countiesCities_MSA_df['County/City'].str.replace(" and ", ",")
multiState_countiesCities_MSA_df['County/City'] = multiState_countiesCities_MSA_df['County/City'].str.replace(";", ",")
multiState_countiesCities_MSA_df['County/City'].tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multiState_countiesCities_MSA_df['County/City'] = multiState_countiesCities_MSA_df['County/City'].str.replace("Includes", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multiState_countiesCities_MSA_df['County/City'] = multiState_countiesCities_MSA_df['County/City'].str.replace(" and ", ",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/in

[' Warren County, NJ,Carbon, Lehigh,,Northampton Counties, PA',
 ' Burke, Columbia, Lincoln, McDuffie,,Richmond Counties, GA,Aiken,Edgefield Counties, SC',
 ' the Metropolitan Divisions of Boston, MA, Cambridge-Newton-Framingham, MA,,Rockingham County-Strafford County, NH',
 ' Alexander County, IL,Bollinger,Cape Girardeau Counties, MO',
 ' Anson, Cabarrus, Gaston, Iredell, Lincoln, Mecklenburg, Rowan,,Union Counties, NC,Chester, Lancaster,,York Counties, SC',
 ' Catoosa, Dade,,Walker Counties, GA,Hamilton, Marion,,Sequatchie Counties, TN',
 ' the Metropolitan Divisions of Chicago-Naperville-Evanston, IL, Elgin, IL, Gary, IN,,Lake County-Kenosha County, IL-WI',
 ' Lake County, IL,Kenosha County, WI',
 ' Dearborn, Franklin, Ohio,,Union Counties, IN, Boone, Bracken, Campbell, Gallatin, Grant, Kenton,,Pendleton Counties, KY,,Brown, Butler, Clermont, Hamilton,,Warren Counties, OH',
 ' Christian,Trigg Counties, KY,Montgomery,Stewart Counties, TN',
 ' Allegany County, MD,Mineral County, WV',


In [54]:
multiState_countiesCities_list = multiState_countiesCities_MSA_df['County/City'].tolist()

i = 0
lenghtOfList = len(multiState_countiesCities_list)

while i < lenghtOfList:
    #Convert list entries into lists themselves split by ',' and remove empty entries
    multiState_countiesCities_list[i] = multiState_countiesCities_list[i].split(",") 
    multiState_countiesCities_list[i] = list(filter(None, multiState_countiesCities_list[i]))
    
    #remove whitespaces before and after individual county names
    n = 0
    while n < len(multiState_countiesCities_list[i]):
        multiState_countiesCities_list[i][n] = multiState_countiesCities_list[i][n].strip()
        n = n + 1
        
    i = i + 1
    
multiState_countiesCities_list

[['Warren County', 'NJ', 'Carbon', 'Lehigh', 'Northampton Counties', 'PA'],
 ['Burke',
  'Columbia',
  'Lincoln',
  'McDuffie',
  'Richmond Counties',
  'GA',
  'Aiken',
  'Edgefield Counties',
  'SC'],
 ['the Metropolitan Divisions of Boston',
  'MA',
  'Cambridge-Newton-Framingham',
  'MA',
  'Rockingham County-Strafford County',
  'NH'],
 ['Alexander County', 'IL', 'Bollinger', 'Cape Girardeau Counties', 'MO'],
 ['Anson',
  'Cabarrus',
  'Gaston',
  'Iredell',
  'Lincoln',
  'Mecklenburg',
  'Rowan',
  'Union Counties',
  'NC',
  'Chester',
  'Lancaster',
  'York Counties',
  'SC'],
 ['Catoosa',
  'Dade',
  'Walker Counties',
  'GA',
  'Hamilton',
  'Marion',
  'Sequatchie Counties',
  'TN'],
 ['the Metropolitan Divisions of Chicago-Naperville-Evanston',
  'IL',
  'Elgin',
  'IL',
  'Gary',
  'IN',
  'Lake County-Kenosha County',
  'IL-WI'],
 ['Lake County', 'IL', 'Kenosha County', 'WI'],
 ['Dearborn',
  'Franklin',
  'Ohio',
  'Union Counties',
  'IN',
  'Boone',
  'Bracken',
  'Ca

In [55]:
new_multiState_MSA_list = []
new_multi_countiesCities_list = []

i = 0
while i < len(multiState_countiesCities_list):
    
    last_state_pos = 0
    n = 0
    while n < len(multiState_countiesCities_list[i]):
            if multiState_countiesCities_list[i][n] in state_dict:
                list_entry = multiState_countiesCities_list[i][last_state_pos : n + 1]
                new_multi_countiesCities_list.append(list_entry)
                for m in range(len(list_entry)-1):
                    new_multiState_MSA_list.append(multiState_MSA_list[i])
                last_state_pos = n + 1
            n = n + 1     
    i = i + 1


print(len(new_multiState_MSA_list))

217


In [56]:
new_multi_countiesCities_list

[['Warren County', 'NJ'],
 ['Carbon', 'Lehigh', 'Northampton Counties', 'PA'],
 ['Burke', 'Columbia', 'Lincoln', 'McDuffie', 'Richmond Counties', 'GA'],
 ['Aiken', 'Edgefield Counties', 'SC'],
 ['the Metropolitan Divisions of Boston', 'MA'],
 ['Cambridge-Newton-Framingham', 'MA'],
 ['Rockingham County-Strafford County', 'NH'],
 ['Alexander County', 'IL'],
 ['Bollinger', 'Cape Girardeau Counties', 'MO'],
 ['Anson',
  'Cabarrus',
  'Gaston',
  'Iredell',
  'Lincoln',
  'Mecklenburg',
  'Rowan',
  'Union Counties',
  'NC'],
 ['Chester', 'Lancaster', 'York Counties', 'SC'],
 ['Catoosa', 'Dade', 'Walker Counties', 'GA'],
 ['Hamilton', 'Marion', 'Sequatchie Counties', 'TN'],
 ['the Metropolitan Divisions of Chicago-Naperville-Evanston', 'IL'],
 ['Elgin', 'IL'],
 ['Gary', 'IN'],
 ['Lake County', 'IL'],
 ['Kenosha County', 'WI'],
 ['Dearborn', 'Franklin', 'Ohio', 'Union Counties', 'IN'],
 ['Boone',
  'Bracken',
  'Campbell',
  'Gallatin',
  'Grant',
  'Kenton',
  'Pendleton Counties',
  'KY'],

In [57]:
state_list = []

for item in new_multi_countiesCities_list:
    state_list.append(state_dict[item[-1]])
    item.pop(-1)
    
state_list

['New Jersey',
 'Pennsylvania',
 'Georgia',
 'South Carolina',
 'Massachusetts',
 'Massachusetts',
 'New Hampshire',
 'Illinois',
 'Missouri',
 'North Carolina',
 'South Carolina',
 'Georgia',
 'Tennessee',
 'Illinois',
 'Illinois',
 'Indiana',
 'Illinois',
 'Wisconsin',
 'Indiana',
 'Kentucky',
 'Ohio',
 'Kentucky',
 'Tennessee',
 'Maryland',
 'West Virginia',
 'Minnesota',
 'Wisconsin',
 'Indiana',
 'Kentucky',
 'Minnesota',
 'North Dakota',
 'Arkansas',
 'Oklahoma',
 'Minnesota',
 'North Dakota',
 'Maryland',
 'West Virginia',
 'Tennessee',
 'Virginia',
 'Minnesota',
 'Wisconsin',
 'Idaho',
 'Washington',
 'Idaho',
 'Utah',
 'Arkansas',
 'Mississippi',
 'Tennessee',
 'Minnesota',
 'Wisconsin',
 'North Carolina',
 'South Carolina',
 'New Jersey',
 'Pennsylvania',
 'Pennsylvania',
 'Delaware',
 'Maryland',
 'New Jersey',
 'Oregon',
 'Washington',
 'Massachusetts',
 'Rhode Island',
 'Delaware',
 'Maryland',
 'Indiana',
 'Michigan',
 'Kansas',
 'Missouri',
 'Illinois',
 'Missouri',
 'Ar

In [58]:
new_multi_countiesCities_list

[['Warren County'],
 ['Carbon', 'Lehigh', 'Northampton Counties'],
 ['Burke', 'Columbia', 'Lincoln', 'McDuffie', 'Richmond Counties'],
 ['Aiken', 'Edgefield Counties'],
 ['the Metropolitan Divisions of Boston'],
 ['Cambridge-Newton-Framingham'],
 ['Rockingham County-Strafford County'],
 ['Alexander County'],
 ['Bollinger', 'Cape Girardeau Counties'],
 ['Anson',
  'Cabarrus',
  'Gaston',
  'Iredell',
  'Lincoln',
  'Mecklenburg',
  'Rowan',
  'Union Counties'],
 ['Chester', 'Lancaster', 'York Counties'],
 ['Catoosa', 'Dade', 'Walker Counties'],
 ['Hamilton', 'Marion', 'Sequatchie Counties'],
 ['the Metropolitan Divisions of Chicago-Naperville-Evanston'],
 ['Elgin'],
 ['Gary'],
 ['Lake County'],
 ['Kenosha County'],
 ['Dearborn', 'Franklin', 'Ohio', 'Union Counties'],
 ['Boone',
  'Bracken',
  'Campbell',
  'Gallatin',
  'Grant',
  'Kenton',
  'Pendleton Counties'],
 ['Brown', 'Butler', 'Clermont', 'Hamilton', 'Warren Counties'],
 ['Christian', 'Trigg Counties'],
 ['Montgomery', 'Stewart

In [59]:
#Now have to remove 'counties' text by putting 'county' by every individual county

for item in new_multi_countiesCities_list:
    i = 0
    count = 0
    while i < len(item):
        if 'Counties' in item[i]:
            count = i + 1
            pos = item[i].find('Counties')
            item[i] = item[i][0: pos - 1]
            
            for n in range(count):
                item[n] = item[n] + ' County'
                
        if 'Cities' in item[i]:
            pos = item[i].find('Cities')
            item[i] = item[i][0: pos - 1]
            for n in range(count, i+1):
                if 'County' not in item[n]:
                    item[n] = item[n] + ' City'
        
        i = i + 1

new_multi_countiesCities_list

[['Warren County'],
 ['Carbon County', 'Lehigh County', 'Northampton County'],
 ['Burke County',
  'Columbia County',
  'Lincoln County',
  'McDuffie County',
  'Richmond County'],
 ['Aiken County', 'Edgefield County'],
 ['the Metropolitan Divisions of Boston'],
 ['Cambridge-Newton-Framingham'],
 ['Rockingham County-Strafford County'],
 ['Alexander County'],
 ['Bollinger County', 'Cape Girardeau County'],
 ['Anson County',
  'Cabarrus County',
  'Gaston County',
  'Iredell County',
  'Lincoln County',
  'Mecklenburg County',
  'Rowan County',
  'Union County'],
 ['Chester County', 'Lancaster County', 'York County'],
 ['Catoosa County', 'Dade County', 'Walker County'],
 ['Hamilton County', 'Marion County', 'Sequatchie County'],
 ['the Metropolitan Divisions of Chicago-Naperville-Evanston'],
 ['Elgin'],
 ['Gary'],
 ['Lake County'],
 ['Kenosha County'],
 ['Dearborn County', 'Franklin County', 'Ohio County', 'Union County'],
 ['Boone County',
  'Bracken County',
  'Campbell County',
  'Gal

In [60]:
#We create two lists with corresponding MSA's and counties

multiState_countiesCities_MSA_list_processed = new_multiState_MSA_list
multiState_countiesCities_list_processed = []
newState_list = []

i = 0
lenghtOfList = len(state_list)

while i < lenghtOfList:
    n = 0
    while n < len(new_multi_countiesCities_list[i]):
        multiState_countiesCities_list_processed.append(new_multi_countiesCities_list[i][n])
        newState_list.append(state_list[i])
        n = n + 1
        
    i = i + 1

In [61]:
print(len(multiState_countiesCities_MSA_list_processed))

217


In [62]:
#multiState_countiesCities_MSA_list_processed

In [63]:
#Create dataframe 
multiState_countiesCities_df = pd.DataFrame(columns = ['MSA', 'County/City', 'State'])
multiState_countiesCities_df['MSA'] = pd.Series(multiState_countiesCities_MSA_list_processed)
multiState_countiesCities_df['County/City'] = pd.Series(multiState_countiesCities_list_processed)
multiState_countiesCities_df['State'] = pd.Series(newState_list)
multiState_countiesCities_df.head(50)

Unnamed: 0,MSA,County/City,State
0,"Allentown-Bethlehem-Easton, PA-NJ M.S.A.",Warren County,New Jersey
1,"Allentown-Bethlehem-Easton, PA-NJ M.S.A.",Carbon County,Pennsylvania
2,"Allentown-Bethlehem-Easton, PA-NJ M.S.A.",Lehigh County,Pennsylvania
3,"Allentown-Bethlehem-Easton, PA-NJ M.S.A.",Northampton County,Pennsylvania
4,"Augusta-Richmond County, GA-SC M.S.A.",Burke County,Georgia
5,"Augusta-Richmond County, GA-SC M.S.A.",Columbia County,Georgia
6,"Augusta-Richmond County, GA-SC M.S.A.",Lincoln County,Georgia
7,"Augusta-Richmond County, GA-SC M.S.A.",McDuffie County,Georgia
8,"Augusta-Richmond County, GA-SC M.S.A.",Richmond County,Georgia
9,"Augusta-Richmond County, GA-SC M.S.A.",Aiken County,South Carolina


In [64]:
multiState_countiesCities_df['County/City'].iloc[34]

'the Metropolitan Divisions of Chicago-Naperville-Evanston'

### Create main dataframe

In [65]:
"""
Joining the following DataFrames
singleState_counties_df
singleState_cities_df
multiState_countiesCities_df
"""


MSA_to_countyCity_df = singleState_counties_df.append(singleState_cities_df.append(multiState_countiesCities_df))

MSA_to_countyCity_df = MSA_to_countyCity_df.reset_index(drop=True)
MSA_to_countyCity_df

Unnamed: 0,MSA,County/City,State
0,"Abilene, TX M.S.A.",Callahan County,Texas
1,"Abilene, TX M.S.A.",Jones County,Texas
2,"Abilene, TX M.S.A.",Taylor County,Texas
3,"Akron, OH M.S.A.",Portage County,Ohio
4,"Akron, OH M.S.A.",Summit County,Ohio
...,...,...,...
1011,"Winchester, VA-WV M.S.A.",Frederick County,Virginia
1012,"Winchester, VA-WV M.S.A.",Winchester City,Virginia
1013,"Winchester, VA-WV M.S.A.",Hampshire County,West Virginia
1014,"Worcester, MA-CT M.S.A.",Windham County,Connecticut


# Join two major datasets 

In [66]:
msa_pop_df

Unnamed: 0,MSA,Population
0,"Abilene, TX M.S.A.",170417
1,"Akron, OH M.S.A.",704283
2,"Albany, GA M.S.A.",147142
3,"Albany-Schenectady-Troy, NY M.S.A.",871741
4,"Albuquerque, NM M.S.A.",915468
...,...,...
359,"Worcester, MA-CT M.S.A.",871779
360,"Yakima, WA M.S.A.",252019
361,"York-Hanover, PA M.S.A.",447168
362,"Yuba City, CA M.S.A.",173299


In [67]:
diabetes_by_countyCity_df

Unnamed: 0,County/City,State,Diagnosed Diabetes Percentage
0,Autauga County,Alabama,9.5
1,Baldwin County,Alabama,8.4
2,Barbour County,Alabama,13.5
3,Bibb County,Alabama,10.2
4,Blount County,Alabama,10.5
...,...,...,...
3136,Sweetwater County,Wyoming,7.8
3137,Teton County,Wyoming,3.8
3138,Uinta County,Wyoming,8.4
3139,Washakie County,Wyoming,7.4


In [68]:
#merge MSA_to_countyCity_df and diabetes_by_countyCity_df
msa_diabetes_df = pd.merge(MSA_to_countyCity_df, diabetes_by_countyCity_df, on=["County/City","State"])
msa_diabetes_df

Unnamed: 0,MSA,County/City,State,Diagnosed Diabetes Percentage
0,"Abilene, TX M.S.A.",Callahan County,Texas,7.6
1,"Abilene, TX M.S.A.",Jones County,Texas,7.2
2,"Abilene, TX M.S.A.",Taylor County,Texas,7.7
3,"Akron, OH M.S.A.",Portage County,Ohio,8.4
4,"Akron, OH M.S.A.",Summit County,Ohio,10.3
...,...,...,...,...
928,"Winchester, VA-WV M.S.A.",Frederick County,Virginia,9.1
929,"Winchester, VA-WV M.S.A.",Winchester City,Virginia,6.9
930,"Winchester, VA-WV M.S.A.",Hampshire County,West Virginia,8.3
931,"Worcester, MA-CT M.S.A.",Windham County,Connecticut,9.6


In [69]:
b = msa_diabetes_df.groupby(['MSA']).sum()
b

Unnamed: 0_level_0,Diagnosed Diabetes Percentage
MSA,Unnamed: 1_level_1
"Abilene, TX M.S.A.",22.5
"Akron, OH M.S.A.",18.7
"Albany, GA M.S.A.",38.7
"Albany-Schenectady-Troy, NY M.S.A.",42.6
"Albuquerque, NM M.S.A.",34.3
...,...
"Worcester, MA-CT M.S.A.",17.6
"Yakima, WA M.S.A.",10.9
"York-Hanover, PA M.S.A.",9.3
"Yuba City, CA M.S.A.",17.6


In [70]:
#Here we create dictions of the frequency counts of both data sets for the rows for each MSA. If these numbers do not match
#then we know that not all the counties/cities for that MSA are not included so we know to get rid off that MSA

msaDiabetes_frequencyCounts = msa_diabetes_df['MSA'].value_counts().to_dict()
msaCountyCity_frequencyCounts = MSA_to_countyCity_df['MSA'].value_counts().to_dict()

In [71]:
#Here we check if the frequencies are the same and gather a list of MSA's that do not have the same frequencie
missing_list = []

for key, value in msaDiabetes_frequencyCounts.items():
    if msaDiabetes_frequencyCounts[key] != msaCountyCity_frequencyCounts[key]:
        missing_list.append(key)

missing_list

['Atlanta-Sandy Springs-Alpharetta, GA M.S.A.',
 'Washington-Arlington-Alexandria, DC-VA-MD-WV M.D.',
 'Virginia Beach-Norfolk-Newport News, VA-NC M.S.A., ',
 'Richmond, VA M.S.A.',
 'Memphis, TN-MS-AR M.S.A., ',
 'Oklahoma City, OK M.S.A.',
 'Augusta-Richmond County, GA-SC M.S.A.',
 'Sioux Falls, SD M.S.A.',
 'Chicago-Naperville-Evanston, IL M.D., ',
 'St. Joseph, MO-KS M.S.A., ',
 'Owensboro, KY M.S.A.',
 'Elgin, IL M.D.',
 'Brunswick, GA M.S.A.',
 'Seattle-Bellevue-Kent, WA M.D., ',
 'Waco, TX M.S.A.']

In [72]:
msa_diabetes_df = msa_diabetes_df[~msa_diabetes_df['MSA'].isin(missing_list)]
msa_diabetes_df

Unnamed: 0,MSA,County/City,State,Diagnosed Diabetes Percentage
0,"Abilene, TX M.S.A.",Callahan County,Texas,7.6
1,"Abilene, TX M.S.A.",Jones County,Texas,7.2
2,"Abilene, TX M.S.A.",Taylor County,Texas,7.7
3,"Akron, OH M.S.A.",Portage County,Ohio,8.4
4,"Akron, OH M.S.A.",Summit County,Ohio,10.3
...,...,...,...,...
928,"Winchester, VA-WV M.S.A.",Frederick County,Virginia,9.1
929,"Winchester, VA-WV M.S.A.",Winchester City,Virginia,6.9
930,"Winchester, VA-WV M.S.A.",Hampshire County,West Virginia,8.3
931,"Worcester, MA-CT M.S.A.",Windham County,Connecticut,9.6


In [73]:
#We need to also remove MSA's from msa_pop_rates_df that are in missing_list
msa_pop_rates_df = msa_pop_rates_df[~msa_pop_rates_df['MSA'].isin(missing_list)]

In [74]:
msa_pop_rates_df = msa_pop_rates_df.set_index('MSA')
msa_pop_rates_df.head()

Unnamed: 0_level_0,Population,Murder and nonnegligent manslaughter
MSA,Unnamed: 1_level_1,Unnamed: 2_level_1
"Abilene, TX M.S.A.",170417,9
"Akron, OH M.S.A.",704283,45
"Albany, GA M.S.A.",147142,17
"Albany-Schenectady-Troy, NY M.S.A.",871741,19
"Albuquerque, NM M.S.A.",915468,87


### Now add pop density

In [75]:
msa_pop_den_df.head()

Unnamed: 0,County/City,State,Land Area,Population,Population Density
0,Autauga County,Alabama,594.44,55533.0,93.420698
1,Baldwin County,Alabama,1589.78,217855.0,137.034684
2,Barbour County,Alabama,884.88,24872.0,28.107766
3,Bibb County,Alabama,622.58,22367.0,35.926307
4,Blount County,Alabama,644.78,57771.0,89.598002


In [76]:
msa_pop_den_df.shape

(2954, 5)

In [77]:
msa_pop_den_df = msa_pop_den_df.drop(['Land Area', 'Population'], axis = 1)
msa_pop_den_df

Unnamed: 0,County/City,State,Population Density
0,Autauga County,Alabama,93.420698
1,Baldwin County,Alabama,137.034684
2,Barbour County,Alabama,28.107766
3,Bibb County,Alabama,35.926307
4,Blount County,Alabama,89.598002
...,...,...,...
2949,Sweetwater County,Wyoming,4.110429
2950,Teton County,Wyoming,5.823977
2951,Uinta County,Wyoming,9.749863
2952,Washakie County,Wyoming,3.518796


In [78]:
msa_diabetes_df.shape

(812, 4)

In [79]:
#Now merge
new_df = pd.merge(msa_diabetes_df, msa_pop_den_df, on=["County/City","State"])
new_df

Unnamed: 0,MSA,County/City,State,Diagnosed Diabetes Percentage,Population Density
0,"Abilene, TX M.S.A.",Callahan County,Texas,7.6,15.555333
1,"Abilene, TX M.S.A.",Jones County,Texas,7.2,21.394648
2,"Abilene, TX M.S.A.",Taylor County,Texas,7.7,149.957949
3,"Akron, OH M.S.A.",Portage County,Ohio,8.4,333.419508
4,"Akron, OH M.S.A.",Summit County,Ohio,10.3,1311.576015
...,...,...,...,...,...
743,"Wheeling, WV-OH M.S.A.",Ohio County,West Virginia,8.4,394.112644
744,"Winchester, VA-WV M.S.A.",Frederick County,Virginia,9.1,213.235792
745,"Winchester, VA-WV M.S.A.",Hampshire County,West Virginia,8.3,36.481062
746,"Worcester, MA-CT M.S.A.",Windham County,Connecticut,9.6,228.225225


In [80]:
new_df = new_df.drop(['County/City', 'State'], axis=1)
new_df = new_df.groupby('MSA').agg({'Diagnosed Diabetes Percentage':'mean','Population Density': 'mean'})
new_df

Unnamed: 0_level_0,Diagnosed Diabetes Percentage,Population Density
MSA,Unnamed: 1_level_1,Unnamed: 2_level_1
"Abilene, TX M.S.A.",7.500,62.302643
"Akron, OH M.S.A.",9.350,822.497761
"Albany, GA M.S.A.",9.675,104.043825
"Albany-Schenectady-Troy, NY M.S.A.",8.520,384.616247
"Albuquerque, NM M.S.A.",8.575,174.850723
...,...,...
"Worcester, MA-CT M.S.A.",8.800,388.493888
"Yakima, WA M.S.A.",10.900,58.332635
"York-Hanover, PA M.S.A.",9.300,495.307350
"Yuba City, CA M.S.A.",8.800,141.342716


### Now join msa_pop_rates_df and msa_diabetes_df

In [81]:
new_df = new_df.merge(msa_pop_rates_df, on='MSA', how='left')
new_df

Unnamed: 0_level_0,Diagnosed Diabetes Percentage,Population Density,Population,Murder and nonnegligent manslaughter
MSA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Abilene, TX M.S.A.",7.500,62.302643,170417,9
"Akron, OH M.S.A.",9.350,822.497761,704283,45
"Albany, GA M.S.A.",9.675,104.043825,147142,17
"Albany-Schenectady-Troy, NY M.S.A.",8.520,384.616247,871741,19
"Albuquerque, NM M.S.A.",8.575,174.850723,915468,87
...,...,...,...,...
"Worcester, MA-CT M.S.A.",8.800,388.493888,871779,11
"Yakima, WA M.S.A.",10.900,58.332635,252019,25
"York-Hanover, PA M.S.A.",9.300,495.307350,447168,24
"Yuba City, CA M.S.A.",8.800,141.342716,173299,8


In [82]:
new_df.isnull().sum()

Diagnosed Diabetes Percentage           0
Population Density                      0
Population                              0
Murder and nonnegligent manslaughter    0
dtype: int64

In [83]:
new_df = new_df[['Population', 'Population Density', 'Diagnosed Diabetes Percentage', 'Murder and nonnegligent manslaughter']]
new_df.head()

Unnamed: 0_level_0,Population,Population Density,Diagnosed Diabetes Percentage,Murder and nonnegligent manslaughter
MSA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Abilene, TX M.S.A.",170417,62.302643,7.5,9
"Akron, OH M.S.A.",704283,822.497761,9.35,45
"Albany, GA M.S.A.",147142,104.043825,9.675,17
"Albany-Schenectady-Troy, NY M.S.A.",871741,384.616247,8.52,19
"Albuquerque, NM M.S.A.",915468,174.850723,8.575,87


In [84]:
msa_final_df = new_df.copy()

# Saving the Dataset

In [85]:
msa_final_df.to_csv(MSA_FINAL_DATASET)