In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
import statsmodels.formula.api as smf

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Load the Boston housing data straight from the workshop's GitHub repository.
# The URL is split across two adjacent literals purely for readability.
rawBostonData = pd.read_csv(
    'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/'
    'master/Chapter02/Dataset/Boston.csv'
)

In [7]:
#Check for missing values (null values) in the data frame and drop them to have a clean dataset

In [8]:
# Discard every row that contains at least one missing value.
rawBostonData = rawBostonData.dropna(axis=0, how='any')

In [9]:
#Check for duplicate records in the data frame and drop them to have a clean dataset
# This cell previously held only the comment, so the de-duplication it describes
# never ran. Keep the first occurrence of each fully identical row.
rawBostonData = rawBostonData.drop_duplicates()

In [10]:
#Rename the data frame columns so they are meaningful

In [12]:
# Swap the terse raw headers for descriptive column names.
# NOTE: the keys ' ZN ' and 'INDUS ' deliberately keep their surrounding spaces;
# the head() output shows both columns renamed, so the raw header carries them.
renamedBostonData = rawBostonData.rename(
    {
        'CRIM': 'crimeRatePerCapita',
        ' ZN ': 'landOver25K_sqft',
        'INDUS ': 'non-retailLandProptn',
        'CHAS': 'riverDummy',
        'NOX': 'nitrixOxide_pp10m',
        'RM': 'AvgNo.RoomsPerDwelling',
        'AGE': 'ProptnOwnerOccupied',
        'DIS': 'weightedDist',
        'RAD': 'radialHighwaysAccess',
        'TAX': 'propTaxRate_per10K',
        'PTRATIO': 'pupilTeacherRatio',
        'LSTAT': 'pctLowerStatus',
        'MEDV': 'medianValue_Ks',
    },
    axis='columns',
)

In [13]:
# Preview the first five rows to confirm the new column names took effect.
renamedBostonData.head(n=5)

Unnamed: 0,crimeRatePerCapita,landOver25K_sqft,non-retailLandProptn,riverDummy,nitrixOxide_pp10m,AvgNo.RoomsPerDwelling,ProptnOwnerOccupied,weightedDist,radialHighwaysAccess,propTaxRate_per10K,pupilTeacherRatio,pctLowerStatus,medianValue_Ks
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,5.33,36.2


In [14]:
#Divide the data frame into train and test sets, as shown in the following code snippet

In [15]:
# Split settings: a fixed seed makes the shuffle reproducible; 30% of rows are held out.
seed = 10
test_data_size = 0.3

# The per-capita crime rate is the response; every remaining column goes into X.
y = renamedBostonData.loc[:, ['crimeRatePerCapita']]
X = renamedBostonData.drop(columns='crimeRatePerCapita')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_data_size,
    random_state=seed,
)

# Re-attach the response to each partition so a single frame holds every column
# that a model formula may reference by name.
train_data = pd.concat([X_train, y_train], axis='columns')
test_data = pd.concat([X_test, y_test], axis='columns')

In [16]:
#We choose a test data size of 30%, which is 0.3. The train_test_split function is used to achieve this. We set the seed of the random number generator so that we can obtain a reproducible split each time we run this code. An arbitrary value of 10 is used here. It is good model-building practice to divide a dataset being used to develop a model into at least two parts. One part is used to develop the model and is called the training set (X_train and y_train combined). The other part, the test set (X_test and y_test combined), is held back from model development.

In [20]:
#Define a linear regression model and assign it to a variable. Remember to use the log function to transform the dependent variable in the formula string, and also include more than one independent variable. Wrapping the independent variables in parentheses and raising the group to **2 includes each variable on its own plus every pairwise (order-2) interaction between them:

In [18]:
# Log-linear OLS specification: log crime rate regressed on four predictors.
# The two adjacent literals are concatenated into one formula string.
multiLogLinMod = smf.ols(
    formula=(
        'np.log(crimeRatePerCapita) ~ '
        '(pctLowerStatus + radialHighwaysAccess + medianValue_Ks + nitrixOxide_pp10m)**2'
    ),
    data=train_data,
)

In [19]:
#Call the fit method of the model instance and assign the results of the method to a new variable

In [24]:
# Fit the model defined above and keep the results object for inspection.
multiLogLinModResult = multiLogLinMod.fit()

In [25]:
# Render the regression summary as plain text and write it to the cell output.
print(multiLogLinModResult.summary().as_text())

                                OLS Regression Results                                
Dep. Variable:     np.log(crimeRatePerCapita)   R-squared:                       0.884
Model:                                    OLS   Adj. R-squared:                  0.881
Method:                         Least Squares   F-statistic:                     261.5
Date:                        Thu, 23 Apr 2020   Prob (F-statistic):          7.79e-154
Time:                                19:41:40   Log-Likelihood:                -394.39
No. Observations:                         354   AIC:                             810.8
Df Residuals:                             343   BIC:                             853.3
Df Model:                                  10                                         
Covariance Type:                    nonrobust                                         
                                             coef    std err          t      P>|t|      [0.025      0.975]
-----------------------

In [26]:
#Our model explained 88 percent (R-squared = 0.884) of the variability in the log-transformed dependent variable on the training data, which is really good

In [27]:
#A good F-value is expected to be larger than one. The model above has an F-statistic value of 261.5, which is much larger than one, and a p-value (Prob (F-statistic)) of approximately zero (7.79e-154). The null hypothesis of this F-test is that all of the model's slope coefficients are zero, i.e. that the model explains no more than an intercept-only model would. The risk of making a mistake and rejecting the null hypothesis when we should not (known as a Type I error in hypothesis testing) is less than the 5% limit we chose to live with at the beginning of the hypothesis test. Because the p-value is less than 0.05, we reject the null hypothesis about our model above. Therefore, we state that the model is statistically significant at the 5% significance level (the chosen 95% confidence level).