In [1]:
## google drive to google colab
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('/content/gdrive/MyDrive/500_Cities__Local_Data_for_Better_Health__2018_release (1).csv')

Since there are many measures in the Measures column after the EDA, we have grouped them into 2 different lists.

In [4]:
prevention_cols = [
    'ACCESS2', 'BPMED', 'CHECKUP', 'CHOLSCREEN',
    'COLON_SCREEN', 'COREM', 'COREW', 'DENTAL', 'MAMMOUSE', 'PAPTEST']

behavior_cols = ['BINGE', 'CSMOKING', 'LPA', 'OBESITY', 'SLEEP']

outcome_cols = [
    'ARTHRITIS', 'BPHIGH', 'CANCER',
    'CASTHMA', 'CHD', 'COPD',
    'DIABETES', 'HIGHCHOL', 'KIDNEY',
    'MHLTH', 'PHLTH', 'STROKE', 'TEETHLOST']

Also we stating which columns to drop and columns to use in the dataset as a list.

In [5]:
columns_to_drop = [ 'Year', 'StateAbbr', 'DataSource', 'Measure','Data_Value_Unit',
                    'Data_Value_Footnote', 'Data_Value_Type', 'Low_Confidence_Limit',
                    'High_Confidence_Limit', 'Data_Value_Footnote_Symbol',
                    'CategoryID', 'Short_Question_Text']

columns_to_keep = ['StateDesc', 'Category', 'CityName', 'UniqueID', 'GeographicLevel', 'DataValueTypeID',
                   'PopulationCount', 'CityFIPS', 'TractFIPS', 'GeoLocation']

random_state = 10

In [6]:
def get_data(path):
    data = pd.read_csv(path)

    data = data.drop(columns=columns_to_drop)

    data = data.drop(data[data.DataValueTypeID == 'AgeAdjPrv'].index)

    census_tract_data = data[data['GeographicLevel'] == 'Census Tract']
    city_data = data[data['GeographicLevel'] == 'City']

    tract_pv = census_tract_data.pivot_table(index=['StateDesc', 'CityName', 'UniqueID'], columns='MeasureId',
                                             values='Data_Value',
                                             aggfunc='sum')
    print("Size of census tract data:", len(tract_pv))  
    tract_pv = tract_pv.fillna(tract_pv.mean())

    city_pv = city_data.pivot_table(index=['StateDesc', 'CityName', 'UniqueID'], columns='MeasureId',
                                    values='Data_Value', aggfunc='sum')
    print("Size of city data:", len(city_pv))  
    city_pv = city_pv.fillna(city_pv.mean())

    city_pv.reset_index(level=0, inplace=True)
    tract_pv.reset_index(level=0, inplace=True)

    return data, city_pv, tract_pv

Importing all the necessary libraries

In [10]:
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

In [17]:
def multi_model_analysis(clean_data):
    # split data to training and testing set
    x_data = clean_data.loc[:, clean_data.columns.isin(prevention_cols + behavior_cols)]
    i = 0
    for outcome_col in outcome_cols:
        print("-----------------------------------")
        print("#", i)

        print("Health Outcome to analyze: ", outcome_col)
        i += 1

        y_data = clean_data.loc[:, outcome_col]
        # split data to training and testing set
        x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, train_size=0.7,
                                                            random_state=random_state)

        # Linear Regression
        reg = LinearRegression().fit(x_train, y_train)
        # print(reg.intercept_)
        print("lr coef:", reg.coef_)
        print("lr train score:", round(reg.score(x_train, y_train), 4))
        print("lr test score:", round(reg.score(x_test, y_test), 4))

        # Ridge Regression
        reg = Ridge().fit(x_train, y_train)
        print(reg.intercept_)
        print(reg.coef_)
        print(reg)
        print("Ridge train score:", reg.score(x_train, y_train))
        print("Ridge test score:", reg.score(x_test, y_test))

        # Lasso Regression
        reg = Lasso().fit(x_train, y_train)
        print("Lasso coef:", reg.coef_)
        print("Lasso train score:", round(reg.score(x_train, y_train), 4))
        print("Lasso test score:", round(reg.score(x_test, y_test), 4))


In [12]:
import logging
import sys
import warnings

Let us define the main function.

The main function is executed twice to show state wise results and National level results.

In [19]:
def initial_setup():
    warnings.filterwarnings("ignore")  # To ignore warnings from sklean
    global logger
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    logger.addHandler(ch)
    logger.info("Logger ready")

def main():
    initial_setup()
    print("Accepted mode: \n1. Run multiple models comparision analysis.\n")
    try:
        mode = int(input('Enter the number of your selected mode:\n'))
    except ValueError:
        print("Not a number")

    path = "/content/gdrive/MyDrive/500_Cities__Local_Data_for_Better_Health__2018_release (1).csv"
    data, city_pv, tract_pv = get_data(path)
    tract_pv.reset_index(level=0, inplace=True)
    logging.info("Get formatted pivot table data by city and census track from file: " + path)

    if mode == 1:
        print("You have choose to run multiple models comparision analysis.\n")
        level = input("Enter the level of analysis you want to perform (State/US):\n")
        if level == 'US':
            print("About to analysis on national level")
            multi_model_analysis(city_pv)
        elif level == 'State':
            show_state = input("Do you wish to see all the state names? (Y/N)")
            if show_state == 'Y':
                print( np.unique(tract_pv['StateDesc']) )

            state = input("Enter the name of the state you want to analyze:\n")
            state_data = tract_pv[tract_pv['StateDesc'] == state]
            if state_data.shape[0] == 0:
                print("Bad state name, abort!")
                sys.exit()
            print("About to run model selection for state: ", state, " with ", len(state_data), " census track.")
            multi_model_analysis(state_data)
        else:
            print("Unacceptable input, abort!")


main()


INFO:root:Logger ready
Logger ready
Logger ready
Logger ready
Logger ready
Logger ready
Logger ready


Accepted mode: 
1. Run multiple models comparision analysis.

Enter the number of your selected mode:
1


INFO:root:Get formatted pivot table data by city and census track from file: /content/gdrive/MyDrive/500_Cities__Local_Data_for_Better_Health__2018_release (1).csv
Get formatted pivot table data by city and census track from file: /content/gdrive/MyDrive/500_Cities__Local_Data_for_Better_Health__2018_release (1).csv
Get formatted pivot table data by city and census track from file: /content/gdrive/MyDrive/500_Cities__Local_Data_for_Better_Health__2018_release (1).csv
Get formatted pivot table data by city and census track from file: /content/gdrive/MyDrive/500_Cities__Local_Data_for_Better_Health__2018_release (1).csv
Get formatted pivot table data by city and census track from file: /content/gdrive/MyDrive/500_Cities__Local_Data_for_Better_Health__2018_release (1).csv
Get formatted pivot table data by city and census track from file: /content/gdrive/MyDrive/500_Cities__Local_Data_for_Better_Health__2018_release (1).csv
Get formatted pivot table data by city and census track from file:

Size of census tract data: 28004
Size of city data: 500
You have choose to run multiple models comparision analysis.

Enter the level of analysis you want to perform (State/US):
State
Do you wish to see all the state names? (Y/N)Y
['Alabama' 'Alaska' 'Arizona' 'Arkansas' 'California' 'Colorado'
 'Connecticut' 'Delaware' 'District of C' 'Florida' 'Georgia' 'Hawaii'
 'Idaho' 'Illinois' 'Indiana' 'Iowa' 'Kansas' 'Kentucky' 'Louisiana'
 'Maine' 'Maryland' 'Massachusetts' 'Michigan' 'Minnesota' 'Mississippi'
 'Missouri' 'Montana' 'Nebraska' 'Nevada' 'New Hampshire' 'New Jersey'
 'New Mexico' 'New York' 'North Carolin' 'North Dakota' 'Ohio' 'Oklahoma'
 'Oregon' 'Pennsylvania' 'Rhode Island' 'South Carolin' 'South Dakota'
 'Tennessee' 'Texas' 'Utah' 'Vermont' 'Virginia' 'Washington'
 'West Virginia' 'Wisconsin' 'Wyoming']
Enter the name of the state you want to analyze:
Alaska
About to run model selection for state:  Alaska  with  55  census track.
-----------------------------------
# 0
Heal

Now let us execute the main function again to see National level results

In [20]:
def initial_setup():
    warnings.filterwarnings("ignore")  # To ignore warnings from sklean
    global logger
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    logger.addHandler(ch)
    logger.info("Logger ready")

def main():
    initial_setup()
    print("Accepted mode: \n1. Run multiple models comparision analysis.\n")
    try:
        mode = int(input('Enter the number of your selected mode:\n'))
    except ValueError:
        print("Not a number")

    path = "/content/gdrive/MyDrive/500_Cities__Local_Data_for_Better_Health__2018_release (1).csv"
    data, city_pv, tract_pv = get_data(path)
    tract_pv.reset_index(level=0, inplace=True)
    logging.info("Get formatted pivot table data by city and census track from file: " + path)

    if mode == 1:
        print("You have choose to run multiple models comparision analysis.\n")
        level = input("Enter the level of analysis you want to perform (State/US):\n")
        if level == 'US':
            print("About to analysis on national level")
            multi_model_analysis(city_pv)
        elif level == 'State':
            show_state = input("Do you wish to see all the state names? (Y/N)")
            if show_state == 'Y':
                print( np.unique(tract_pv['StateDesc']) )

            state = input("Enter the name of the state you want to analyze:\n")
            state_data = tract_pv[tract_pv['StateDesc'] == state]
            if state_data.shape[0] == 0:
                print("Bad state name, abort!")
                sys.exit()
            print("About to run model selection for state: ", state, " with ", len(state_data), " census track.")
            multi_model_analysis(state_data)
        else:
            print("Unacceptable input, abort!")


main()

INFO:root:Logger ready
Logger ready
Logger ready
Logger ready
Logger ready
Logger ready
Logger ready
Logger ready


Accepted mode: 
1. Run multiple models comparision analysis.

Enter the number of your selected mode:
1


INFO:root:Get formatted pivot table data by city and census track from file: /content/gdrive/MyDrive/500_Cities__Local_Data_for_Better_Health__2018_release (1).csv
Get formatted pivot table data by city and census track from file: /content/gdrive/MyDrive/500_Cities__Local_Data_for_Better_Health__2018_release (1).csv
Get formatted pivot table data by city and census track from file: /content/gdrive/MyDrive/500_Cities__Local_Data_for_Better_Health__2018_release (1).csv
Get formatted pivot table data by city and census track from file: /content/gdrive/MyDrive/500_Cities__Local_Data_for_Better_Health__2018_release (1).csv
Get formatted pivot table data by city and census track from file: /content/gdrive/MyDrive/500_Cities__Local_Data_for_Better_Health__2018_release (1).csv
Get formatted pivot table data by city and census track from file: /content/gdrive/MyDrive/500_Cities__Local_Data_for_Better_Health__2018_release (1).csv
Get formatted pivot table data by city and census track from file:

Size of census tract data: 28004
Size of city data: 500
You have choose to run multiple models comparision analysis.

Enter the level of analysis you want to perform (State/US):
US
About to analysis on national level
-----------------------------------
# 0
Health Outcome to analyze:  ARTHRITIS
lr coef: [-0.04951063 -0.29566126  0.49716485 -0.17651153  0.18102031  0.12261768
 -0.00839581 -0.11227574  0.62315696  0.09225053 -0.03787054 -0.17382469
  0.15760971 -0.01171096  0.02264826]
lr train score: 0.8455
lr test score: 0.8268
-19.662916069664067
[-0.04978606 -0.29530243  0.49697395 -0.17619328  0.18093421  0.12258824
 -0.00839234 -0.11218978  0.62245362  0.09202337 -0.03763863 -0.17398862
  0.15778882 -0.01171708  0.0225762 ]
Ridge()
Ridge train score: 0.8455311571160908
Ridge test score: 0.8268958877709699
Lasso coef: [-0.12130705 -0.0303183   0.466717   -0.          0.05109257  0.
 -0.         -0.          0.34877278  0.          0.         -0.10402164
  0.14695428 -0.01634848  0.  