In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import matplotlib.pyplot as plt

In [None]:
# Load the raw workbook into a DataFrame. No sheet_name is given, so pandas
# reads the first worksheet of the file.
data_df = pd.read_excel('FoodAccessResearchAtlasData2019.xlsx')
# Preview the first rows to confirm the load worked.
data_df.head()

In [None]:
# Restrict the full dataset to the rows whose State is Minnesota.
is_minnesota = data_df['State'].eq('Minnesota')
mn_df = data_df.loc[is_minnesota]
mn_df

In [None]:
# Columns carried forward for modelling. "County" is kept only so missing
# incomes can be imputed per county later; "LAhalfand10" is the target.
model_columns = [
    "County", "Urban", "Pop2010", "OHU2010",
    "GroupQuartersFlag", "NUMGQTRS", "PCTGQTRS", "HUNVFlag", "LowIncomeTracts",
    "PovertyRate", "MedianFamilyIncome", "TractLOWI", "TractKids", "TractSeniors",
    "TractWhite", "TractBlack", "TractAsian", "TractNHOPI", "TractAIAN",
    "TractOMultir", "TractHispanic", "TractHUNV", "TractSNAP", "LAhalfand10",
]

# Take an explicit copy: later cells assign into grouped_df, and writing into
# a slice of mn_df (itself a slice of data_df) raises SettingWithCopyWarning
# and is not guaranteed to behave consistently across pandas versions.
grouped_df = mn_df[model_columns].copy()
grouped_df.head()

In [None]:
# Count missing values per column to see which features need imputing.
grouped_df.isnull().sum()

In [None]:
# Show dtype and non-null count for every selected column.
grouped_df.info()

In [None]:
# List the rows that have no median family income, together with their county.
income_missing = grouped_df["MedianFamilyIncome"].isna()
grouped_df.loc[income_missing, ["County", "MedianFamilyIncome"]]


In [None]:
# Median family income across Hennepin County rows (NaN values are skipped
# by Series.median); used as the fill value for that county.
hennepin_rows = grouped_df["County"] == "Hennepin County"
median = grouped_df.loc[hennepin_rows, "MedianFamilyIncome"].median()
median

In [None]:
# Fill Hennepin County rows that lack an income with the Hennepin median.
hennepin_needs_fill = (
    grouped_df["MedianFamilyIncome"].isna()
    & grouped_df["County"].eq("Hennepin County")
)
grouped_df.loc[hennepin_needs_fill, "MedianFamilyIncome"] = median

In [None]:
# Sanity check: show every Hennepin County row plus any row, in any county,
# that is still missing an income.
income_still_missing = grouped_df["MedianFamilyIncome"].isna()
hennepin_mask = grouped_df["County"].eq("Hennepin County")
grouped_df.loc[income_still_missing | hennepin_mask, ["County", "MedianFamilyIncome"]]


In [None]:
# Median family income across Washington County rows (NaN values skipped);
# used as the fill value for that county.
washington_rows = grouped_df["County"] == "Washington County"
median2 = grouped_df.loc[washington_rows, "MedianFamilyIncome"].median()
median2

In [None]:
# Fill Washington County rows that lack an income with the Washington median.
washington_needs_fill = (
    grouped_df["MedianFamilyIncome"].isna()
    & grouped_df["County"].eq("Washington County")
)
grouped_df.loc[washington_needs_fill, "MedianFamilyIncome"] = median2

In [None]:
# Median family income across Ramsey County rows (NaN values skipped);
# used as the fill value for that county.
ramsey_rows = grouped_df["County"] == "Ramsey County"
median3 = grouped_df.loc[ramsey_rows, "MedianFamilyIncome"].median()
median3

In [None]:
# Fill Ramsey County rows that lack an income with the Ramsey median.
ramsey_needs_fill = (
    grouped_df["MedianFamilyIncome"].isna()
    & grouped_df["County"].eq("Ramsey County")
)
grouped_df.loc[ramsey_needs_fill, "MedianFamilyIncome"] = median3

In [None]:
# Re-check non-null counts now that the county medians have been filled in.
grouped_df.info()

In [None]:
# Print the dtype of each column to spot any non-numeric features.
print(grouped_df.dtypes)

In [None]:
# Number of distinct values per column (helps tell flags from continuous features).
grouped_df.nunique()

In [None]:
# Convert the tract-level counts into shares of the tract's 2010 population
# (fractions, i.e. count / Pop2010), stored as Pct<Group> next to the
# original Tract<Group> columns.
# NOTE(review): TractHUNV and TractSNAP look like housing-unit counts rather
# than person counts, so OHU2010 may be the more appropriate denominator for
# those two - confirm against the dataset's variable documentation.
demographic_groups = [
    "LOWI", "Kids", "Seniors", "White", "Black", "Asian",
    "NHOPI", "AIAN", "OMultir", "Hispanic", "HUNV", "SNAP",
]

population = grouped_df["Pop2010"]
has_residents = population > 0

for group in demographic_groups:
    share = grouped_df[f"Tract{group}"] / population
    # A tract with zero population would give NaN/inf here, which would then
    # flow into scaling and training; record its share as 0.0 instead.
    grouped_df[f"Pct{group}"] = share.where(has_residents, 0.0)



In [None]:
# Drop the raw count columns; the Pct* share columns take their place.
# Reassigning (rather than inplace=True) and passing errors="ignore" keeps
# this cell safe to re-run after the columns are already gone.
count_columns = [
    'TractLOWI', 'TractKids', 'TractSeniors', 'TractWhite', 'TractBlack',
    'TractAsian', 'TractNHOPI', 'TractAIAN', 'TractOMultir', 'TractHispanic',
    'TractHUNV', 'TractSNAP',
]
grouped_df = grouped_df.drop(columns=count_columns, errors="ignore")

In [None]:
# Confirm which columns remain after dropping the raw counts.
list(grouped_df.columns)

In [None]:
# "County" is not used as a model feature; remove it before encoding so it is
# not one-hot encoded. Reassigning (rather than inplace=True) with
# errors="ignore" lets this cell be re-run without raising a KeyError.
grouped_df = grouped_df.drop(columns=["County"], errors="ignore")

# Convert any remaining categorical (object/category) columns to numeric
# indicator columns with `pd.get_dummies`; numeric columns pass through as-is.
dummies_df = pd.get_dummies(grouped_df)
dummies_df

In [None]:
# Split the preprocessed data into the target vector (y) and feature matrix (X).
y = dummies_df["LAhalfand10"].to_numpy()
X = dummies_df.drop(columns="LAhalfand10").to_numpy()

# Split into training and testing sets (scikit-learn's default test share).
# random_state pins the shuffle so the reported accuracy is repeatable from
# run to run; stratify=y keeps the target's class ratio the same in both
# splits, which matters because the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

In [None]:
# Build the scaler and learn its mean/variance from the training features
# only, so no information from the test split leaks into the scaling.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the same fitted transformation to both splits.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

#### When the model below was first created and trained, "Accuracy" results were coming back at only around 30%, so something had to be incorrect somewhere. We first checked the dataset and discovered 8 rows where "MedianFamilyIncome" was NaN, spread across 3 different Minnesota counties. We filled each missing value with the median "MedianFamilyIncome" of its own county, using the `.median()` calls above.

#### Also, when looking at the `.value_counts()` of the target from dummies, we would typically expect the "0" count to be higher than the "1" count. Even so, the classes did not appear to be well balanced, so we weren't sure whether that was going to be a challenge.

#### Next, we created "First" and "Second" hidden layers, with hidden_nodes_layer1 at 800 and hidden_nodes_layer2 at 500, and set epochs below to 100. After running everything, "Accuracy" was around 75%-76%! During training the model did seem to be overfitting, so we bumped the epochs down to 10. We also tried adding a "Third" hidden layer with 50 nodes, and "Accuracy" still came out at 76%-77%.

#### We then removed the first hidden layer below and also removed the "County" column, and saw "Accuracy" jump up. Lastly, we converted the population figures from "counts" to "percents", and that brought the "Accuracy" up to 85.3%.

#### When reviewing with the group, we decided that the population counts would be better shown as percentages instead of totals. Although unsure how this would affect the model, we decided to forge ahead, and we got our best results yet: 85.3%!

In [None]:
# Define the model - a small feed-forward neural net for binary classification:
# two ReLU hidden layers followed by a single sigmoid output unit.

# Seed TensorFlow so weight initialisation is repeatable between runs.
tf.random.set_seed(42)

# One input per feature column of the training matrix.
number_input_features = X_train.shape[1]
hidden_nodes_layer2 = 100
hidden_nodes_layer3 = 50

nn = tf.keras.models.Sequential()

# Declare the input width explicitly; passing input_dim to a Dense layer is
# deprecated in recent Keras releases.
nn.add(tf.keras.Input(shape=(number_input_features,)))

# (The original "first" hidden layer was removed during tuning; the layer
# numbering below is kept so it still matches the notes above.)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Third hidden layer - sized by hidden_nodes_layer3. It previously reused
# hidden_nodes_layer2 by mistake, leaving hidden_nodes_layer3 unused.
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))

# Output layer: one sigmoid unit giving the probability of the positive class.
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

In [None]:
# Compile the model: binary cross-entropy loss (the network ends in a single
# sigmoid unit), the Adam optimizer, and accuracy tracked as the metric.
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# Check the class balance of the target column.
dummies_df["LAhalfand10"].value_counts() # dataset is imbalanced; the 0 class would normally be expected to outnumber the 1 class

In [None]:
# Fit the network on the scaled training features for 10 epochs and keep the
# returned History object for later inspection.
fit_model = nn.fit(
    x=X_train_scaled,
    y=y_train,
    epochs=10,
)

In [None]:
# Exploratory check of what nn.fit returned.
type(fit_model)

In [None]:
# Per-epoch metrics recorded during training; the 'loss' series is plotted in a later cell.
fit_model.history  #plot loss function

In [None]:
# Score the trained network on the held-out (scaled) test split and report
# the resulting loss and accuracy.
test_metrics = nn.evaluate(x=X_test_scaled, y=y_test, verbose=2)
model_loss, model_accuracy = test_metrics
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# List the column names currently held in grouped_df (this still includes the target column).
grouped_df.columns.tolist()

In [None]:
# Plot the training loss recorded for each epoch.
loss_values = fit_model.history['loss']
epochs_range = range(1, len(loss_values) + 1)

# Draw on an explicit Axes rather than the implicit pyplot state.
fig, ax = plt.subplots()
ax.plot(epochs_range, loss_values, marker='o', linestyle='-')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Training Loss over Epochs')
ax.grid(True)
plt.show()