In [None]:
## FILE FOR JOSH TO WORK IN
from sklearn.preprocessing import StandardScaler
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

In [None]:
# def get_column_indexes(dataframe, column_list):
#     column_indexes = []
#     for column_name in column_list:
#         column_indexes.append(dataframe.columns.get_loc(column_name))
#     return column_indexes

def build_score_column(label, dataframe, columns, column_coefficients):
    """Add a weighted-sum score column to ``dataframe`` in place.

    Each column named in ``columns`` is multiplied by the coefficient at the
    same position in ``column_coefficients``; the row-wise sum of those
    products is stored in ``dataframe[label]``. The input columns themselves
    are not modified.

    Args:
        label: Name of the score column to create (or overwrite).
        dataframe: DataFrame holding the input columns; mutated in place.
        columns: Names of the columns that feed the score.
        column_coefficients: One numeric weight per entry in ``columns``,
            in the same order.

    Returns:
        None. The result is written to ``dataframe[label]``.

    Raises:
        ValueError: If ``columns`` and ``column_coefficients`` differ in
            length. Without this check a short coefficient list would leave
            the trailing columns with an implicit weight of 1.
    """
    if len(columns) != len(column_coefficients):
        raise ValueError(
            f"Expected one coefficient per column: got {len(columns)} columns "
            f"and {len(column_coefficients)} coefficients."
        )

    # Multiplying into a new frame (rather than assigning back through iloc)
    # lets integer columns be weighted by fractional coefficients without an
    # in-place dtype change.
    weighted = dataframe[columns].mul(list(column_coefficients), axis="columns")
    dataframe[label] = weighted.sum(axis=1)
    

In [None]:
medical_data = pd.read_csv("MedicalData/medical_data.csv")

# NOTE(review): these names presumably mirror the CSV header verbatim (misspellings included);
# check the file before "correcting" any of them.
columns_to_standardize = ['imfant mortality 100k', 'homicides per 100k', 'overdose deaths per 100k', 'suicide per 100k', 'lifeExpectancey',
                          'percent use cigerrets', 'Gallons of Ethanol per Capita', 'Driving Fatalities Involving Alcohol (Percentage)',
                          'Excessive Drinking rate (Percentage)', 'cancer casees per 100k', '% deppressed']

# Weights of the scoring model, one per entry in columns_to_standardize (same order).
# Healthy states should end up with high scores and unhealthy states with low scores, so a metric
# where a lower value is healthier (e.g. infant mortality) gets a negative coefficient, and a metric
# where a higher value is healthier (life expectancy) gets a positive one. The magnitude of each
# coefficient controls how heavily that column influences the final health score.
column_coefficients = [-2, -1, -1, -2, 2, -1, -.5, -1, -1, -1, -1]
assert len(column_coefficients) == len(columns_to_standardize), \
    "column_coefficients must contain exactly one weight per standardized column"

# Coerce anything that is not numeric (e.g. placeholder strings) to NaN.
medical_data[columns_to_standardize] = medical_data[columns_to_standardize].apply(pd.to_numeric, errors='coerce')

# Replace NaN with the median of the respective column.
for column in columns_to_standardize:
    medical_data[column] = medical_data[column].fillna(medical_data[column].median())

# fit_transform returns a bare array, so the index and column labels are re-attached explicitly.
# This keeps the write-back below aligned with medical_data regardless of its index.
scaler = StandardScaler()
scaled_df = pd.DataFrame(
    scaler.fit_transform(medical_data[columns_to_standardize]),
    index=medical_data.index,
    columns=columns_to_standardize,
)

# Replace the raw columns in medical_data with their standardized versions.
medical_data[columns_to_standardize] = scaled_df

build_score_column("Health_Score", medical_data, columns_to_standardize, column_coefficients)
display(medical_data)
print(medical_data.columns)


In [None]:
# Show the 20 rows with the highest Health_Score. An expression in the middle of a cell is
# not rendered by the notebook, so the preview is displayed explicitly.
highest = medical_data.nlargest(20, 'Health_Score')
display(highest)

# Keep only the year, state and score columns and write them out for reuse.
health_score = medical_data[['YEAR',"STATE","Health_Score"]]
health_score.to_csv('MedicalData/HealthScore.csv', index=False)



In [None]:
# Load the state boundary shapefile and reproject it to EPSG:4326.
states = gpd.read_file("tl_2023_us_state/tl_2023_us_state.shp").to_crs("EPSG:4326")

# STUSPS codes of the states/territories that are excluded from the map.
non_continental = ['HI','VI','MP','GU','AK','AS','PR']

# Keep every row whose STUSPS code is not in the exclusion list.
us49 = states[~states.STUSPS.isin(non_continental)]
states = us49

us49.head()

In [None]:
# Restrict the scores to the rows recorded for 2020.
is_2020 = health_score['YEAR'] == 2020
health_2020 = health_score[is_2020]

# Left join on the state code: every row of `states` is kept, and rows without a
# matching 2020 score get NaN in the columns coming from health_2020.
merged = pd.merge(
    states,
    health_2020,
    how='left',
    left_on='STUSPS',
    right_on="STATE",
)

merged.head(100)



In [None]:
# Draw the plain state shapes first, then colour them by Health_Score on the same axes.
# The axes handle is deliberately not called `map`: that name would shadow Python's
# built-in map() for every cell executed afterwards.
# NOTE(review): the base layer presumably exists so states with no score still appear — confirm.
base_ax = states.plot()
merged.plot(column='Health_Score', cmap='coolwarm', legend=True, ax=base_ax)
base_ax.set_title("Health Score by State (2020)")
plt.show()