In [7]:
!pip install numpy pandas scikit-learn pykrige matplotlib

Defaulting to user installation because normal site-packages is not writeable


In [8]:
import numpy as np
import pandas as pd
import libpysal as lps
from spreg import ML_Lag
import folium
from folium.plugins import MarkerCluster
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np
import libpysal as lps
from spreg import GM_Lag
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [9]:
# Prepare Dataset
data = pd.read_excel('Dataset.xlsx')

row_count = data.shape[0]
print("Number of rows:", row_count)

print("\n", data.head(5))

y = data['LungCancerRate'].values
X = data[['Smoking', 'Poverty', 'Insurance', 'Income', 'PM 2.5']].values
coords = data[['Latitude', 'Longitude']].values
parish_names = data['Parish']

Number of rows: 3143

    FIPS   Parish   Latitude  Longitude    Income  Insurance    PM 2.5  \
0  1001  Autauga  32.535142 -86.642900  0.308876   0.828652  0.619048   
1  1003  Baldwin  30.727825 -87.722745  0.343421   0.772472  0.455782   
2  1005  Barbour  31.870090 -85.391068  0.151268   0.719101  0.578231   
3  1007     Bibb  32.998376 -87.126814  0.231939   0.755618  0.605442   
4  1009   Blount  33.980871 -86.567006  0.251355   0.707865  0.591837   

    Poverty   Smoking  LungCancerRate  
0  0.161165  0.424242        0.311271  
1  0.135922  0.478788        0.288991  
2  0.403883  0.515152        0.338794  
3  0.316505  0.590909        0.412844  
4  0.198058  0.578788        0.333552  


In [10]:
# Normalize data (explanatory variables X and dependent variable y)
scaler_X, scaler_y = StandardScaler(), StandardScaler()
X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1)).flatten()

# Create spatial weights matrix using k-Nearest Neighbors (k=15)
knn = lps.weights.KNN.from_array(coords, k=15)
knn.transform = 'R'  # Row-standardization

# Fit the Spatial Autoregressive (SAR) model using GM_Lag
SAR_model = GM_Lag(
    y_scaled, X_scaled, w=knn,
    name_y='LungCancerRate',
    name_x=['Smoking', 'Poverty', 'Insurance', 'Income', 'PM 2.5']
)

# Extract coefficients and make predictions
betas = SAR_model.betas.flatten()
intercept, betas_for_X = betas[0], betas[1:6]
y_pred_scaled = np.dot(X_scaled, betas_for_X) + intercept
y_pred_all = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()

# Add predictions to the dataframe
data['Predicted_LungCancerRate'] = y_pred_all

# Print the model summary
print(SAR_model.summary)

REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: SPATIAL TWO STAGE LEAST SQUARES
--------------------------------------------------
Data set            :     unknown
Weights matrix      :     unknown
Dependent Variable  :LungCancerRate                Number of Observations:        3143
Mean dependent var  :      0.0000                Number of Variables   :           7
S.D. dependent var  :      1.0002                Degrees of Freedom    :        3136
Pseudo R-squared    :      0.5575
Spatial Pseudo R-squared:  0.4065

------------------------------------------------------------------------------------
            Variable     Coefficient       Std.Error     z-Statistic     Probability
------------------------------------------------------------------------------------
            CONSTANT        -0.00108         0.01189        -0.09060         0.92781
             Smoking         0.25504         0.01828        13.94871         0.00000
             Poverty        -0.00351    

In [11]:
# Calculating accuracy metrics
mae = mean_absolute_error(y, y_pred_all)
r_squared = r2_score(y, y_pred_all)

# Print accuracy metrics
print(f'Mean Absolute Error (MAE): {mae:.4f}')
print(f'R-squared (R²): {r_squared:.4f}')

Mean Absolute Error (MAE): 0.0680
R-squared (R²): 0.2780


In [12]:
# Plotting the predictions on an interactive map
m = folium.Map(location=[31.0, -92.0], zoom_start=4)
marker_cluster = MarkerCluster().add_to(m)

for idx, row in data.iterrows():
    folium.Marker(
        location=(row['Latitude'], row['Longitude']),
        popup=(
            f"<strong>Parish:</strong> {parish_names[idx]}<br>"
            f"<strong>Predicted Lung Cancer Rate:</strong> {row['Predicted_LungCancerRate']:.2f}"
        ),
        icon=folium.Icon(color='red')
    ).add_to(marker_cluster)

# Displaying the map
m