In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# ML stuff
from sklearn.model_selection import train_test_split
from sklearn import linear_model as lm



In [123]:
# what is inmigshare?
data = pd.read_excel("data/Master file Jan23.xls")


In [132]:
def make_dict(data):
    df = data
    df = data[["StateCODE_orig", "StateABRV_orig"]].drop_duplicates()
    return zip(df.StateCODE_orig, df.StateABRV_orig)

In [133]:
def clean(data):
    temp_df = data
    # aggregate % adults in original state
    temp_df["Adults_orig"] = temp_df.loc[:, ["Adults_1925_orig", "Adults_2634_orig", "Adults_3554_orig",
                                            "Adults_5564_orig", "Adults_65_orig"]].sum(axis=1)
    temp_df = temp_df.drop(columns = ["Adults_1925_orig", "Adults_2634_orig", "Adults_3554_orig",
                                            "Adults_5564_orig", "Adults_65_orig"])
    # aggregate % adults in destination state
    temp_df["Adults_dest"] = temp_df.loc[:, ["Adults_1925_dest", "Adults_2634_dest", "Adults_3554_dest",
                                            "Adults_5564_dest", "Adults_65_dest"]].sum(axis=1)
    temp_df = temp_df.drop(columns = ["Adults_1925_dest", "Adults_2634_dest", "Adults_3554_dest",
                                            "Adults_5564_dest", "Adults_65_dest"])

    # drop unneccesary columns
    temp_df = temp_df.drop(columns = ["Homic_death_orig","Homic_death_dest","Unemp_tot_orig","Unemp_tot_dest",
                    "StateNAME_orig","StateNAME_dest","outmigsharepercent", "inmigsharepercent",
                    "PDSI_Rank_orig","PDSI_Rank_dest","PDSI_Anomaly_orig", "PDSI_Anomaly_dest",
                    "Urban_rural_orig","Urban_rural_dest", 'MHV_Nom_orig','MGR_Nom_orig','AGR_Nom_orig',
                    'Average_orig', 'MHV_Nom_dest','MGR_Nom_dest','AGR_Nom_dest','Average_dest',
                    'Dem_Share_President_Two_orig', 'Rep_Share_President_Two_orig',
                    'Dem_Share_President_Two_dest', 'Rep_Share_President_Two_dest',
                    'Health_White_orig','Health_Black_orig','Health_Hispanic_orig','Health_Asian_orig',
                    'Health_Alaska_orig','Other_orig', 'Health_White_dest','Health_Black_dest',
                    'Health_Hispanic_dest','Health_Asian_dest','Health_Alaska_dest','Other_dest'])
    
    # rename poorly named columns
    temp_df = temp_df.rename(columns = {"Children_018_orig": "Children_orig", "Children_018_dest": "Children_dest",
                                       "Econ_Free_Sum_dest": "Econ_Freedom_Score_dest",
                                        "Econ_Free_Sum_orig": "Econ_Freedom_Score_orig",
                                       "Reg_Pri_Par_orig": "Price_Parity_orig", "Reg_Pri_Par_orig": "Price_Parity_dest",
                                       "cdd_orig": "cool_deg_day_orig", "cdd_dest": "cool_deg_day_dest",
                                       "hdd_orig": "heat_deg_day_orig", "hdd_dest": "heat_deg_day_dest"})
    
    # deal with nulls as we go?
    temp_df["Children_orig"] = temp_df["Children_orig"].fillna(0)
    temp_df["Children_dest"] = temp_df["Children_dest"].fillna(0)

    
    return temp_df

In [134]:
# https://towardsdatascience.com/simplest-way-of-creating-a-choropleth-map-by-u-s-states-in-python-f359ada7735e
def make_choropleth(StateABRV):
        temp_df = data[data["StateABRV_orig"] == StateABRV]
        temp_df = temp_df[["Year", "StateABRV_dest", "StateNAME_orig", "outmigshare"]]
        # log transform
        temp_df["outmigshare"] = temp_df["outmigshare"].mask(temp_df["outmigshare"] == 0, np.inf)
        temp_df["Out-Migration Share (Log10 Scale)"] = (np.log10(temp_df["outmigshare"]))
                
        fig = px.choropleth(temp_df,
                    locations='StateABRV_dest', 
                    locationmode="USA-states", 
                    scope="usa",
                    color='Out-Migration Share (Log10 Scale)',
                    color_continuous_scale="Viridis",
                    animation_frame = "Year",
                    range_color = (-6, -1.5),
                    title = temp_df["StateNAME_orig"].unique()[0] + " Out-Migration Share per Year (Log10 Scale)")
        fig.show()


In [159]:

def select_columns(data, *columns):
    """Select only columns passed as arguments."""
    return data.loc[:, columns]

def process_data(data):
    """Process the data for a guided model."""
    
    # Select Features
    data = select_columns(data, 
                          'outmigshare', 
                          'Distance',
                          'Year',
                          'heat_deg_day_orig',
                          'heat_deg_day_dest',
                         )
    

    
    # Return predictors and response variables separately
    X = data.drop(['outmigshare'], axis = 1)
    y = data.loc[:, 'outmigshare']
    
    return X, y


def test(data):
    # Replace infinite updated data with nan
    data.replace([np.inf, -np.inf], 0, inplace=True)
    # Drop rows with NaN
    data.fillna(0, inplace=True)
    """Creates a model to predict XXX with YYY"""
    train, val = train_test_split(data, random_state=42, train_size=0.80)
    X_train, y_train = process_data(train)
    X_val, y_val = process_data(val)
    
    return X_train, y_train, X_val, y_val


In [160]:
cleaned_data = clean(data)
x.columns

Index(['Year', 'StateCODE_orig', 'StateCODE_dest', 'Mig_Estimate',
       'outmigshare', 'inmigshare', 'Distance', 'StateABRV_orig',
       'StateABRV_dest', 'Pop_orig', 'Children_orig', 'Health_Adults_orig',
       'Homicide_Rate_orig', 'Kauf_rne_orig', 'Kauf_ose_orig', 'Kauf_sjc_orig',
       'Kauf_ssr_orig', 'Kauf_zindex_orig', 'Unemp_rate_orig',
       'dems_share_state_leg_orig', 'reps_share_state_leg_orig',
       'dems_control_state_leg_orig', 'reps_control_state_leg_orig',
       'Dem_Share_Prez_orig', 'Rep_Share_Prez_orig',
       'Median_House_Value_adj_orig', 'Median_Gross_Rent_adj_orig',
       'Average_Gross_Rent_adj_orig', 'GoodDays_orig', 'Inc_Corp_Tax_orig',
       'StateGDP_Millions_orig', 'PDSI_Value_orig', 'Price_Parity_dest',
       'Econ_Freedom_Score_orig', 'Homeown_Perc_orig', 'Median_HHI_orig',
       'Median_HHI_adj_orig', 'Coll_Educ_orig', 'heat_deg_day_orig',
       'cool_deg_day_orig', 'Pop_dest', 'Children_dest', 'Health_Adults_dest',
       'Homicide_Rate_

In [162]:
# Attempt at modeling stuff

X_train, y_train, X_val, y_val = test(cleaned_data)


linear_model = lm.LinearRegression()
linear_model.fit(X_train, y_train)
# print(X_train)
# print("")
# print(y_train)

print(y_train.shape)
print(X_train.shape)


training_accuracy = linear_model.score(X_train, Y_train)
print("Training Accuracy: ", training_accuracy)


(33320,)
(33320, 4)


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').