In [25]:
import pandas as pd
import numpy as np
import us
#from geopy.geocoders import Nominatim
from sklearn.neighbors import NearestNeighbors

In [26]:
#import all of the data we're going to use
demographicData = pd.read_csv("Data/Demographics/ACSDP1Y2019.csv", skiprows=[0])
incomeData = pd.read_csv("Data/Income/ACSST1Y2019.csv", skiprows=[0])
urbanData = pd.read_csv("2010CensusUrbanRural.csv", skiprows=[0])
latLongData = pd.read_csv("latitudeAndLongitude.csv")
educationData = pd.read_csv("Data/ACSST1Y2019.S1501_data_with_overlays_2020-10-22T140417.csv",skiprows=([0]))
urbanData = pd.read_csv("2010CensusUrbanRural.csv", skiprows = [0])
unemploymentData = pd.read_csv("Data/State unemployment - Sheet1.csv")

In [28]:
#delete "margin of error" data, which we will not be using
margins = [column for column in demographicData.columns if "Margin" in column]
demographicData = demographicData.drop(columns = margins)

In [29]:
margins = [column for column in educationData.columns if "Margin" in column]
educationData = educationData.drop(columns = margins)

In [30]:
#This took forever to run and the geographic positions of the states don't change enough for fresh data to
#matter,so I saved the lat/long data and loaded it separately
#geolocator = Nominatim(user_agent = "MDST-election-prediction")
#lat = {}
#long = {}
#for index, state in demographicData["Geographic Area Name"].iteritems():
#    loc = geolocator.geocode(state)
#    lat[state] = loc.latitude
#    long[state] = loc.longitude
#demographicData["Longitude"] = demographicData["Geographic Area Name"].map(long)
#demographicData["Latitude"] = demographicData["Geographic Area Name"].map(lat)

In [31]:
#latlongCols = ["Longitude", "Latitude"]

In [32]:
#latLongData = demographicData[latlongCols]
#latLongData.to_csv("latitudeAndLongitude.csv", index = False)

In [33]:
#isolate the features from the Demographic chart
feat_columns = ["Estimate!!SEX AND AGE!!Total population!!Median age (years)",
                "Percent!!RACE!!Total population!!One race!!Black or African American",
               "Percent!!RACE!!Total population!!One race!!American Indian and Alaska Native",
               "Percent!!RACE!!Total population!!One race!!Asian",
               "Percent!!HISPANIC OR LATINO AND RACE!!Total population!!Hispanic or Latino (of any race)"]
edu_columns = ["Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT!!Population 18 to 24 years","Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT!!Population 18 to 24 years!!Bachelor's degree or higher","Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT!!Population 25 years and over","Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT!!Population 25 years and over!!Bachelor's degree or higher","Estimate!!Percent!!RACE AND HISPANIC OR LATINO ORIGIN BY EDUCATIONAL ATTAINMENT!!White alone!!Bachelor's degree or higher"]

In [34]:
demoFeatData = demographicData[feat_columns]
educationData = educationData[edu_columns]

In [40]:
#Find the total number of people with a bachelors degree or higher, then find what percentage that is of the total population
educationData['total_with_college'] = educationData["Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT!!Population 18 to 24 years!!Bachelor's degree or higher"] + educationData["Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT!!Population 25 years and over!!Bachelor's degree or higher"]
educationDataeducationData['total_pop'] = educationData["Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT!!Population 18 to 24 years"] + educationData["Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT!!Population 25 years and over"]
educationData['percent_with_degree'] = educationData['total_with_college'] / educationData['total_pop'] * 100

In [41]:
edu_feats = ["percent_with_degree", "Estimate!!Percent!!RACE AND HISPANIC OR LATINO ORIGIN BY EDUCATIONAL ATTAINMENT!!White alone!!Bachelor's degree or higher"]

In [42]:
eduFeatData = educationData[edu_feats]

In [45]:
demofeat_array = demoFeatData.to_numpy()
latLong_array = latLongData.to_numpy()

In [46]:
edufeat_array = eduFeatData.to_numpy()

In [47]:
#calculate the percentage of the rural-urban split, ie what percentage of people in a state live in a rural area
#we believe these areas will be more likely to vote for Trump, so it is important to know prominent rural voters
#are in a given state
urbanData["percentRural"] = (urbanData["Total!!Rural"]/(urbanData["Total!!Rural"] + urbanData["Total!!Urban"])) * 100
urbanRural_array = urbanData["percentRural"].to_numpy()

array([40.9631829 , 33.98021207, 10.19017941, 43.83967588,  5.04738342,
       26.81374924, 41.61952653, 13.84783969, 12.00736857, 16.70334345,
        0.        ,  8.83696934, 24.9338204 ,  8.0726251 , 29.42187394,
       11.51212972, 27.55639361, 35.98067198, 25.80184205, 61.34017786,
       12.80357395,  8.02794416, 25.43276566, 26.72764038, 50.65461934,
       29.56382671, 44.1069723 , 26.86546488,  5.80451915, 39.69691675,
        5.32044631, 22.5729769 , 12.12707519, 33.91256636, 40.10148813,
       22.07609862, 33.75642535, 18.9683624 , 21.34318304,  9.26534843,
       33.67421461, 43.34827679, 33.60896172, 15.30099885,  9.41754089,
       61.10451449, 24.54598311, 15.95158925, 51.27830959, 29.84617863,
       35.24198671,  6.24117469])

In [49]:
#clean up unemployment percentages
unemploymentData = unemploymentData.sort_values(by = "State")
unemploymentData = unemploymentData["Unemployment Percentage"]
unemployment_array = unemploymentData.to_numpy()

In [51]:
#combine all the features into a single array to prepare for a KNN regression that will enable us to determine
#which states are most alike
feat_array = np.concatenate((demofeat_array[0:51],edufeat_array[0:51], latLong_array[0:51],unemployment_array[:,np.newaxis], urbanRural_array[0:51,np.newaxis]), axis = 1)

In [53]:
#run the k nearest neighbors regression, get the 5 nearest neighbors for each state
nbrs = NearestNeighbors().fit(feat_array)
distances, indices = nbrs.kneighbors(feat_array)

In [57]:
#this loop converts the integer index of a state to a legible name, so we can easily which states are
# like each other
demographicData['Abbreviation'] = demographicData["Geographic Area Name"].map(us.states.mapping('name', 'abbr'))
for state in indices:
    print(demographicData.iloc[state[0]]['Geographic Area Name'] + ': ' + demographicData.iloc[state[1]]['Geographic Area Name']
         + ', ' + demographicData.iloc[state[2]]['Geographic Area Name'] + ', ' + demographicData.iloc[state[3]]['Geographic Area Name'] + ', ' 
          + demographicData.iloc[state[4]]['Geographic Area Name'])

Alabama: South Carolina, Tennessee, Arkansas, Mississippi
Alaska: Oregon, Idaho, Montana, Wyoming
Arizona: Nevada, Texas, California, Utah
Arkansas: Kentucky, Tennessee, Alabama, Oklahoma
California: Nevada, Arizona, Texas, New Mexico
Colorado: Kansas, Nebraska, Minnesota, Oregon
Connecticut: New Hampshire, North Carolina, Virginia, Pennsylvania
Delaware: Pennsylvania, Virginia, Maryland, New York
District of Columbia: Maryland, Virginia, Massachusetts, New York
Florida: Illinois, New York, Delaware, Rhode Island
Georgia: Delaware, Maryland, Illinois, New York
Hawaii: California, Oregon, Nevada, Utah
Idaho: Wyoming, Oregon, Nebraska, Kansas
Illinois: New York, New Jersey, Washington, Rhode Island
Indiana: Kentucky, Michigan, Missouri, Wisconsin
Iowa: Ohio, Nebraska, Wisconsin, Michigan
Kansas: Nebraska, Minnesota, Missouri, Wisconsin
Kentucky: Indiana, Tennessee, Missouri, Arkansas
Louisiana: Alabama, South Carolina, Tennessee, North Carolina
Maine: Vermont, New Hampshire, West Virgini

In [58]:
# the rest of this code exports the KNN into the format that the rest of the model
# expects
states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA", 
          "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", 
          "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", 
          "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", 
          "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]

In [59]:
knn_names = np.array([['NN']*indices.shape[1]]*indices.shape[0])

In [60]:
knn_names[0][0] = state[1]

In [61]:
for i, row in enumerate(indices):
    for j, column in enumerate(row):
        knn_names[i][j] = states[column]

In [65]:
knn_names = pd.DataFrame(knn_names)

In [66]:
knn_names.to_csv("knnResults.csv", index = False)