In [1]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Load the data
df = pd.read_csv('../Resources_Tables/FINAL_opioid_data_UNCLEAN.csv)
df.dropna(inplace=True)

In [3]:
# Split into two groups: one with non-suppressed data, and another with only suppressed data

df_non_suppressed = df[df['deaths'] != 'Suppressed']
df_suppressed = df[df['deaths'] == 'Suppressed']

In [4]:
# Calculate mean and standard deviation for non-suppressed group

mean_deaths = df_non_suppressed['deaths'].astype(float).mean()
std_deaths = df_non_suppressed['deaths'].astype(float).std()
mean_cruderate = df_non_suppressed['cruderate'].astype(float).mean()
std_cruderate = df_non_suppressed['cruderate'].astype(float).std()

In [5]:
# Impute suppressed values by sampling from normal distribution

imputed_deaths = np.random.normal(loc=mean_deaths, scale=std_deaths, size=len(df_suppressed))
imputed_cruderate = np.random.normal(loc=mean_cruderate, scale=std_cruderate, size=len(df_suppressed))

In [6]:
# Replace suppressed values with imputed values

df_suppressed.loc[:, 'deaths'] = imputed_deaths
df_suppressed.loc[:, 'cruderate'] = imputed_cruderate

In [7]:
# Merge the two groups back together

df_imputed = pd.concat([df_non_suppressed, df_suppressed])

In [8]:
# Convert 'deaths' and 'cruderate' columns to float type

df_imputed['deaths'] = df_imputed['deaths'].astype(float)
df_imputed['cruderate'] = df_imputed['cruderate'].astype(float)

In [9]:
# Drop unnecessary columns

df_imputed.drop(['PovertyRate', 'UnemployedRate', 'cruderate'], axis=1, inplace=True)

In [10]:
# Df

df_imputed.head()

Unnamed: 0,Year,countycode,State,county,population,deaths,DispenseRate,PersonalIncome,PerCapitaPersonalIncome,LessThanHS,HSDiplomaOnly,SomeCollege,BachelorOrHigher,PovertyPop,UnemployedPop
1,2020,1003,AL,"Baldwin County, AL",229287,41.0,65.0,51348.0,233140.0,20635.83,62824.64,71537.54,74518.28,20189.0,6159.0
27,2020,1055,AL,"Etowah County, AL",102371,13.0,102.8,39795.0,103393.0,14127.2,34601.4,35215.62,18426.78,15650.0,3505.0
36,2020,1073,AL,"Jefferson County, AL",655342,227.0,139.0,58053.0,673341.0,58980.78,173010.29,195947.26,227403.67,92310.0,21947.0
44,2020,1089,AL,"Madison County, AL",379453,68.0,87.3,56499.0,389696.0,30356.24,72854.98,108144.11,168097.68,39100.0,9761.0
48,2020,1097,AL,"Mobile County, AL",412716,46.0,98.0,42837.0,414250.0,49938.64,141148.87,122576.65,98639.12,71398.0,16931.0


In [11]:
# Apply one-hot encoding to 'State' and 'county' columns

df_encoded = pd.get_dummies(df_imputed, columns=['State', 'county'])

In [12]:
# Split the dataset into input features (X) and target variable (y)

X = df_encoded.drop('deaths', axis=1)
y = df_encoded['deaths']

In [13]:
# Split the data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Scale the features using StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [15]:
# Initialize the MLPRegressor with max_iter=500

mlp = MLPRegressor(hidden_layer_sizes=(64, 64), activation='relu', solver='adam', random_state=42, max_iter=500)


# Fit the MLPRegressor to the training data
mlp.fit(X_train_scaled, y_train)

# Predict the target variable using the trained MLPRegressor
y_pred_mlp = mlp.predict(X_test_scaled)

In [16]:
# Calculate R-squared score, Mean Squared Error, and Root Mean Squared Error for the MLPRegressor
mlp_r2 = r2_score(y_test, y_pred_mlp)
mlp_mse = mean_squared_error(y_test, y_pred_mlp)
mlp_rmse = np.sqrt(mlp_mse)

print(f"MLPRegressor R-squared score: {mlp_r2:.3f}")
print(f"MLPRegressor Mean Squared Error: {mlp_mse:.3f}")
print(f"MLPRegressor Root Mean Squared Error: {mlp_rmse:.3f}")

MLPRegressor R-squared score: -0.593
MLPRegressor Mean Squared Error: 16012.284
MLPRegressor Root Mean Squared Error: 126.540
