# Fitting a Log-Linear Model Using the statsmodels formula API

<b> Define a linear regression model and assign it to a variable. Remember to use the log function to transform the dependent variable in the formula string </b>

In [1]:
# import packages
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import statsmodels.graphics.api as smg
import patsy
from statsmodels.graphics.correlation import plot_corr
from sklearn.model_selection import train_test_split
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*'
# and the bare 'seaborn' name was later removed, so prefer the new name when
# the installed matplotlib offers it and fall back to the legacy name otherwise.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [2]:
# Read the Boston housing CSV straight from the workshop's GitHub repository
# (requires network access) and preview the first five rows.
BOSTON_CSV_URL = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter02/Dataset/Boston.csv'
rawBostonData = pd.read_csv(BOSTON_CSV_URL)
rawBostonData.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,5.33,36.2


In [3]:
# Drop rows that are exact duplicates across every column.
# NOTE(review): the preview of rawBostonData shows an 'Unnamed: 0' column that
# looks like a per-row counter; if it is unique for every row, no two rows can
# be identical and this call removes nothing -- confirm whether that column
# should be excluded from the comparison.
rawBostonData = rawBostonData.drop_duplicates()

In [4]:
# Rename the DataFrame columns so that they are meaningful.
#
# DataFrame.rename silently skips mapping keys that match no column, so every
# key has to equal a header exactly. The keys for ZN and INDUS previously
# carried stray spaces (' ZN ', 'INDUS ') and therefore never matched, leaving
# those two columns with their raw names. Headers are stripped of surrounding
# whitespace first so the clean keys match whether or not the file pads them.
#
# Gotcha: 'non-retailLandProptn' and 'AvgNo.RoomsPerDwelling' are not valid
# Python identifiers; to use them in a formula string they must be wrapped
# with Q("...").
renamedBostonData = (
    rawBostonData
    .rename(columns=str.strip)
    .rename(columns={'CRIM': 'crimeRatePerCapita',
                     'ZN': 'landOver25K_sqft',
                     'INDUS': 'non-retailLandProptn',
                     'CHAS': 'riverDummy',
                     'NOX': 'nitrixOxide_pp10m',
                     'RM': 'AvgNo.RoomsPerDwelling',
                     'AGE': 'ProptnOwnerOccupied',
                     'DIS': 'weightedDist',
                     'RAD': 'radialHighwaysAccess',
                     'TAX': 'propTaxRate_per10K',
                     'PTRATIO': 'pupilTeacherRatio',
                     'LSTAT': 'pctLowerStatus',
                     'MEDV': 'medianValue_Ks'})
)

In [5]:
# Separate the target column (per-capita crime rate) from the other columns.
# y is selected with a list so it stays a one-column DataFrame, not a Series.
y = renamedBostonData[['crimeRatePerCapita']]
X = renamedBostonData.drop(columns=['crimeRatePerCapita'])

In [6]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=10,
)

# Put each target column back next to its features so that a single frame
# can be handed to the formula API, which looks variables up by column name.
train_data = pd.concat((X_train, y_train), axis='columns')
test_data = pd.concat((X_test, y_test), axis='columns')

In [7]:
# Define a log-linear regression model: the natural log of the per-capita
# crime rate is regressed on the median home value. The np.log call inside the
# formula string is applied to the column when the design matrices are built.
# (smf is imported once in the imports cell at the top of the notebook, so it
# is not re-imported here.)
logLinearModel = smf.ols(formula='np.log(crimeRatePerCapita) ~ medianValue_Ks', data=train_data)

<b> Call the fit method of the log-linear model instance and assign the results of the method to a variable </b>

In [8]:
# Fit the log-linear model defined above and keep the returned results object,
# which the next cell summarises.
logLinearModelResult = logLinearModel.fit()

<b> Print a summary of the results and analyze the output </b>

In [9]:
# Print the text summary of the fitted single-predictor model
# (fit statistics followed by the coefficient table).
print(logLinearModelResult.summary())

                                OLS Regression Results                                
Dep. Variable:     np.log(crimeRatePerCapita)   R-squared:                       0.238
Model:                                    OLS   Adj. R-squared:                  0.236
Method:                         Least Squares   F-statistic:                     109.9
Date:                        Mon, 22 Feb 2021   Prob (F-statistic):           1.48e-22
Time:                                09:46:35   Log-Likelihood:                -727.67
No. Observations:                         354   AIC:                             1459.
Df Residuals:                             352   BIC:                             1467.
Df Model:                                   1                                         
Covariance Type:                    nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------

# Fitting a Multiple Log-Linear Regression Model

<b> Define a linear regression model and assign it to a variable. Remember to use the log function to transform the dependent variable in the formula string, and also include more than one independent variable in your analysis </b>

In [10]:
# Define a multiple log-linear model on the training data. In the formula
# language, (a + b + c + d)**2 expands to the four main effects plus every
# pairwise interaction between them (ten terms in total).
multiLogLinearModel = smf.ols(
    formula='np.log(crimeRatePerCapita) ~ (pctLowerStatus + radialHighwaysAccess + medianValue_Ks + nitrixOxide_pp10m)**2',
    data=train_data,
)

<b> Call the fit method of the model instance and assign the results of the method to a new variable </b>

In [11]:
# Fit the multiple log-linear model and keep the returned results object
# for the summary printed in the next cell.
multiLogLinearModelResult = multiLogLinearModel.fit()

<b> Print a summary of the results and analyze your model </b>

In [12]:
# Print the text summary of the fitted multiple-regression model so it can be
# compared with the single-predictor summary above.
print(multiLogLinearModelResult.summary())

                                OLS Regression Results                                
Dep. Variable:     np.log(crimeRatePerCapita)   R-squared:                       0.884
Model:                                    OLS   Adj. R-squared:                  0.881
Method:                         Least Squares   F-statistic:                     261.5
Date:                        Mon, 22 Feb 2021   Prob (F-statistic):          7.79e-154
Time:                                09:46:36   Log-Likelihood:                -394.39
No. Observations:                         354   AIC:                             810.8
Df Residuals:                             343   BIC:                             853.3
Df Model:                                  10                                         
Covariance Type:                    nonrobust                                         
                                             coef    std err          t      P>|t|      [0.025      0.975]
-----------------------