# MULTIPLE LINEAR REGRESSION MODEL

# Libraries and data

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv("cancer_reg_refined.csv", encoding='latin-1')
cancer_df = data.copy()
cancer_df.head(2)
# Without the encoding paremeter, this error presents itself:
# UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf1 in position 41137: invalid continuation byte

Unnamed: 0,avgAnnCount,incidenceRate,studyPerCap,AvgHouseholdSize,PctNoHS18_24,PctHS18_24,PctUnemployed16_Over,PctBlack,PctAsian,PctOtherRace,BirthRate,TARGET_deathRate,binnedInc,Geography
0,1397.0,489.8,499.748,2.54,11.5,39.5,8.0,2.595,4.822,1.843,6.119,164.9,BIN_5,WEST
1,173.0,411.6,23.111,2.34,6.1,22.4,7.8,0.969,2.246,3.741,4.333,161.3,BIN_4,WEST


# Multilinear Regression 

In [3]:
# Getting Dummy variables, dropping one to avoid the dummy variable trap
cancer_df = pd.get_dummies(data = cancer_df, drop_first = True)
cancer_df.head(2)

Unnamed: 0,avgAnnCount,incidenceRate,studyPerCap,AvgHouseholdSize,PctNoHS18_24,PctHS18_24,PctUnemployed16_Over,PctBlack,PctAsian,PctOtherRace,BirthRate,TARGET_deathRate,binnedInc_BIN_2,binnedInc_BIN_3,binnedInc_BIN_4,binnedInc_BIN_5,Geography_EAST,Geography_WEST
0,1397.0,489.8,499.748,2.54,11.5,39.5,8.0,2.595,4.822,1.843,6.119,164.9,0,0,0,1,0,1
1,173.0,411.6,23.111,2.34,6.1,22.4,7.8,0.969,2.246,3.741,4.333,161.3,0,0,1,0,0,1


In [4]:
# Isolating x and y
y = cancer_df['TARGET_deathRate']
x = cancer_df.drop(columns = ['TARGET_deathRate'])
x.head(1)

Unnamed: 0,avgAnnCount,incidenceRate,studyPerCap,AvgHouseholdSize,PctNoHS18_24,PctHS18_24,PctUnemployed16_Over,PctBlack,PctAsian,PctOtherRace,BirthRate,binnedInc_BIN_2,binnedInc_BIN_3,binnedInc_BIN_4,binnedInc_BIN_5,Geography_EAST,Geography_WEST
0,1397.0,489.8,499.748,2.54,11.5,39.5,8.0,2.595,4.822,1.843,6.119,0,0,0,1,0,1


In [5]:
# Adding constant
x = sm.add_constant(x)
x.head(1)

Unnamed: 0,const,avgAnnCount,incidenceRate,studyPerCap,AvgHouseholdSize,PctNoHS18_24,PctHS18_24,PctUnemployed16_Over,PctBlack,PctAsian,PctOtherRace,BirthRate,binnedInc_BIN_2,binnedInc_BIN_3,binnedInc_BIN_4,binnedInc_BIN_5,Geography_EAST,Geography_WEST
0,1.0,1397.0,489.8,499.748,2.54,11.5,39.5,8.0,2.595,4.822,1.843,6.119,0,0,0,1,0,1


In [6]:
# Training and test set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 666 )
x_train.head(2)

Unnamed: 0,const,avgAnnCount,incidenceRate,studyPerCap,AvgHouseholdSize,PctNoHS18_24,PctHS18_24,PctUnemployed16_Over,PctBlack,PctAsian,PctOtherRace,BirthRate,binnedInc_BIN_2,binnedInc_BIN_3,binnedInc_BIN_4,binnedInc_BIN_5,Geography_EAST,Geography_WEST
781,1.0,782.0,457.0,73.537,2.5,18.0,33.7,8.5,10.273,1.669,0.653,5.939,0,0,0,1,1,0
1210,1.0,58.0,405.4,0.0,2.44,15.6,13.1,9.7,0.662,0.441,1.537,6.036,1,0,0,0,0,1


In [7]:
model = sm.OLS(y_train, x_train).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:       TARGET_deathRate   R-squared:                       0.481
Model:                            OLS   Adj. R-squared:                  0.476
Method:                 Least Squares   F-statistic:                     115.7
Date:                Fri, 10 Feb 2023   Prob (F-statistic):          7.04e-287
Time:                        20:16:03   Log-Likelihood:                -9462.3
No. Observations:                2144   AIC:                         1.896e+04
Df Residuals:                    2126   BIC:                         1.906e+04
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   67.5602 

In [8]:
# Adj. R-squared: 0.476
    # The model is a little bit underfitting

    # Some variables are not statistically significant (P>|t| >> 0)  

    # One of the Notes is :
        # The condition number is large, 9.94e+03. This might indicate that there are strong 
        # multicollinearity or other numerical problems.
    
        # This is a scaling issue, not a multicollinearity problem.

# Assessing Model Accuracy

In [9]:
predictions = model.predict(x_test)
predictions

2353    190.170072
1584    167.676125
2082    179.528444
212     165.820486
225     195.801794
           ...    
2277    188.576067
2548    176.869089
2631    168.313157
2529    203.908136
551     162.783660
Length: 536, dtype: float64

In [10]:
# Accuracy Assessment
from sklearn.metrics import mean_absolute_error, mean_squared_error
print("MAE:", round(mean_absolute_error(y_test, predictions),2))
print("RMSE:", round(np.sqrt(mean_squared_error(y_test, predictions)),2))
MAPE = mean_absolute_error(y_test, predictions)/np.average(y_test)
print('MAPE:', round(MAPE * 100,3),'%')

MAE: 15.36
RMSE: 20.53
MAPE: 8.591 %
