In [34]:
%matplotlib inline

import pandas as pd
import numpy as np
from IPython.display import display

# Raise the column display limit so wide frames are rendered without truncation.
pd.set_option('display.max_columns', 100)

# Read the cars dataset from the CSV file in the working directory.
csv_path = 'cars_csv.csv'
cars_data = pd.read_csv(csv_path)

# Preview five randomly chosen rows (no random_state is passed to sample()).
cars_preview = cars_data.sample(n=5)
display(cars_preview)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
77,2,161,mitsubishi,gas,std,two,hatchback,fwd,front,93.7,157.3,64.4,50.8,1944,ohc,four,92,2bbl,2.97,3.23,9.4,68,5500,31,38,6189
23,1,118,dodge,gas,turbo,two,hatchback,fwd,front,93.7,157.3,63.8,50.8,2128,ohc,four,98,mpfi,3.03,3.39,7.6,102,5500,24,30,7957
106,1,231,nissan,gas,std,two,hatchback,rwd,front,99.2,178.5,67.9,49.7,3139,ohcv,six,181,mpfi,3.43,3.27,9.0,160,5200,19,25,18399
32,1,101,honda,gas,std,two,hatchback,fwd,front,93.7,150.0,64.0,52.6,1837,ohc,four,79,1bbl,2.91,3.07,10.1,60,5500,38,42,5399
50,1,104,mazda,gas,std,two,hatchback,fwd,front,93.1,159.1,64.2,54.1,1890,ohc,four,91,2bbl,3.03,3.15,9.0,68,5000,30,31,5195


Let's start by running a regression on the effect of 'number of doors' on 'horsepower' without any controls:

In [35]:
# Get the relevant columns: the regressor ('num-of-doors') and the outcome
# ('horsepower'). `.loc` replaces `.ix`, which was removed from pandas.
# Clean: '?' is treated as the missing-value marker, so it is turned into NaN
# and every row with a missing value is dropped. Chaining (instead of
# inplace=True) builds a fresh frame and never mutates a selection of cars_data.
reg_data = (
    cars_data.loc[:, ['num-of-doors', 'horsepower']]
    .replace('?', np.nan)
    .dropna()
)

# A column that contained '?' may have been read as strings; make the outcome
# explicitly numeric rather than relying on the estimator to coerce it.
reg_data['horsepower'] = pd.to_numeric(reg_data['horsepower'])

# Dummify: drop_first=True drops the first level so it acts as the baseline.
reg_data = pd.get_dummies(reg_data, columns=['num-of-doors'], drop_first=True)

# Sanity check: look at a random handful of rows.
display(reg_data.sample(10))

Unnamed: 0,horsepower,num-of-doors_two
188,100,0.0
51,68,1.0
160,70,0.0
45,70,0.0
186,85,0.0
4,115,0.0
150,62,1.0
15,182,0.0
137,160,0.0
133,110,0.0


In [36]:


from sklearn.linear_model import LinearRegression

# Split the prepared frame once into regressors and outcome.
# `drop(columns=...)` replaces the positional-axis form `drop('horsepower', 1)`,
# and plain column selection replaces the removed `.ix` indexer.
reg_features = reg_data.drop(columns='horsepower')
reg_target = reg_data['horsepower']

# Train a model.
lr = LinearRegression()
lr.fit(reg_features, reg_target)

# Get the coefficient of the only regressor (the 'num-of-doors' dummy) and
# the score of the fit on the same data it was trained on.
beta = lr.coef_[0]
print("Beta = {}".format(beta))
print("Score = {}".format(lr.score(reg_features, reg_target)))





Beta = 10.002413515687854
Score = 0.015615717168255938


In [37]:
# Now let's control for 'aspiration':

# Get the relevant columns: the regressor of interest, the control and the
# outcome. `.loc` replaces `.ix`, which was removed from pandas.
# Clean: '?' is treated as the missing-value marker, so it is turned into NaN
# and every row with a missing value is dropped. Chaining (instead of
# inplace=True) builds a fresh frame and never mutates a selection of cars_data.
reg_data = (
    cars_data.loc[:, ['num-of-doors', 'aspiration', 'horsepower']]
    .replace('?', np.nan)
    .dropna()
)

# A column that contained '?' may have been read as strings; make the outcome
# explicitly numeric rather than relying on the estimator to coerce it.
reg_data['horsepower'] = pd.to_numeric(reg_data['horsepower'])

# Dummify both categorical columns; drop_first=True drops the first level of
# each so it acts as the baseline.
reg_data = pd.get_dummies(reg_data, columns=['num-of-doors', 'aspiration'], drop_first=True)

# Sanity check: look at a random handful of rows.
display(reg_data.sample(10))

Unnamed: 0,horsepower,num-of-doors_two,aspiration_turbo
40,86,0.0,0.0
157,70,0.0,0.0
154,62,0.0,0.0
14,121,0.0,0.0
189,90,1.0,0.0
34,76,1.0,0.0
182,52,1.0,0.0
140,73,1.0,0.0
186,85,0.0,0.0
144,82,0.0,0.0


In [38]:
# Split the prepared frame once into regressors and outcome.
# `drop(columns=...)` replaces the positional-axis form `drop('horsepower', 1)`,
# and plain column selection replaces the removed `.ix` indexer.
reg_features = reg_data.drop(columns='horsepower')
reg_target = reg_data['horsepower']

# Train a model.
lr = LinearRegression()
lr.fit(reg_features, reg_target)

# Get the coefficients, keyed by regressor name (coef_ follows column order).
var_coefs = dict(zip(reg_features.columns, lr.coef_))
display(var_coefs)
print('Intercept: {}'.format(lr.intercept_))
print("Score = {}".format(lr.score(reg_features, reg_target)))

{'aspiration_turbo': 25.730490649425043, 'num-of-doors_two': 10.91840242255596}

Intercept: 95.07901951958097
Score = 0.07721156267073681


In [39]:
# NOTE: the author flagged this calculation as incorrect.
##############################

# Omitted-variable-bias estimate: the 'num-of-doors_two' coefficient from the
# regression without controls (beta) minus the same coefficient from the
# regression with controls (var_coefs).
doors_coef_with_controls = var_coefs['num-of-doors_two']
OBV = beta - doors_coef_with_controls
print(OBV)

-0.915988906868


Let's consider the very simple linear regression case from the book:

In [40]:
# Toy data: five observations of a 0/1 outcome A and a 0/1 regressor P.
A = [1, 1, 1, 0, 0]
P = pd.DataFrame([1, 1, 0, 1, 0], columns=['P'])

# Fit a linear regression of A on P; fit() returns the fitted estimator.
lr = LinearRegression().fit(P, A)

# Show the coefficient array followed by the intercept.
print(lr.coef_, lr.intercept_)

[ 0.16666667] 0.5


Let's have a play around with some different data.