# Class notes week 7

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.formula.api as smf

## Linear regression

In [5]:
titanic = sns.load_dataset("titanic")

In [19]:
est = smf.ols("survived ~C(pclass) + sex", data=titanic).fit()
est.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.9212,0.031,30.118,0.000,0.861,0.981
C(pclass)[T.2],-0.1454,0.039,-3.739,0.000,-0.222,-0.069
C(pclass)[T.3],-0.3140,0.032,-9.849,0.000,-0.377,-0.251
sex[T.male],-0.5163,0.027,-18.814,0.000,-0.570,-0.462


In [15]:
# ceteris paritus: holding all other factors constant

### Interactions

In [17]:
est = smf.ols("survived ~C(pclass)*sex", data=titanic).fit() #interactions with *
est.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.9681,0.039,24.700,0.000,0.891,1.045
C(pclass)[T.2],-0.0470,0.059,-0.802,0.423,-0.162,0.068
C(pclass)[T.3],-0.4681,0.050,-9.290,0.000,-0.567,-0.369
sex[T.male],-0.5992,0.052,-11.490,0.000,-0.702,-0.497
C(pclass)[T.2]:sex[T.male],-0.1644,0.077,-2.130,0.033,-0.316,-0.013
C(pclass)[T.3]:sex[T.male],0.2347,0.064,3.648,0.000,0.108,0.361


In [18]:
# explosion of coefficients because of high number of possible interactions 

In [20]:
#surviva prob for:
#m P1 -> 0.968 - 0.5992 = 0.368
#m P3 -> 0.968 - 0.4681 - 0.0.5992 + 0.2347 = 0.13 # relative penalty for being a male in third class is not as high as in other classes, but overall surv prob is lower


In [None]:
#homoscedasticity: equal variance (constant) -> we can work with std errors
#heteroscedasticity -> variance depends on X 

# especially relevant in econometrics
#heavy assumptions/ ruoles which need to be furfilled in order so regression is true

# Overfitting

In [21]:
# Model flexibility = number of degrees of freedom
# Adjusted R2 = Attempt to reduce heavy overfitting: 1- (RSS/n-p-1)/(TSS/n-1) --> "better" measure but not perfect

**Train-test split**
- train(large) - test (small): Hard to get "high variance" 
- train(small) - test (large): Difficult to get a good model 

Rule of thumb: 70-30

**Cross-validation**
- K-fold cross validation (fixed test window) 
- Most common: 10 fold cv
- You do not "loose" train data due to folds
- you use cv only to look parameters 


In [23]:
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

boston = pd.read_csv('../data/boston.csv')

# independent variables
X = boston.drop('medv', axis=1)

# dependent variable
y = boston['medv']

### Holdout 

In [26]:
from sklearn.model_selection import train_test_split

# split the data into train and test
train_data, test_data = train_test_split(boston, test_size=0.8)

# Ready Xs and ys
X_train = train_data.drop('medv', axis=1)
y_train = train_data['medv']

X_test = test_data.drop('medv', axis=1)
y_test = test_data['medv']

In [28]:
# Instanciate model
model1 = LinearRegression().fit(X_train, y_train)

In [30]:
r2_holdout = model1.score(X_test, y_test)
r2_holdout

0.6550477197799878

### Cross-validation

In [53]:
from sklearn.utils import shuffle
X, y = shuffle(X, y)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
152,1.12658,0.0,19.58,1,0.871,5.012,88.0,1.6102,5,403,14.7,343.28,12.12,28.4
165,2.92400,0.0,19.58,0,0.605,6.101,93.0,2.2834,5,403,14.7,240.16,9.81,25.0
151,1.49632,0.0,19.58,0,0.871,5.404,100.0,1.5916,5,403,14.7,341.60,13.28,21.6
55,0.01311,90.0,1.22,0,0.403,7.249,21.9,8.6966,5,226,17.9,395.93,4.81,45.4
84,0.05059,0.0,4.49,0,0.449,6.389,48.0,4.7794,3,247,18.5,396.90,9.62,18.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,21.0
353,0.01709,90.0,2.02,0,0.410,6.728,36.1,12.1265,5,187,17.0,384.46,4.50,29.0
335,0.03961,0.0,5.19,0,0.515,6.037,34.5,5.9853,5,224,20.2,396.90,8.01,37.0
97,0.12083,0.0,2.89,0,0.445,8.069,76.0,3.4952,2,276,18.0,396.90,4.21,25.0


In [37]:
from sklearn.model_selection import cross_validate
# Instanciate model
model = LinearRegression()

np.random.shuffle()
# 5-Fold Cross validate model
cv_results = cross_validate(model, X, y, cv=5)


AttributeError: 'DataFrame' object has no attribute 'shuffle'

In [35]:
print(cv_results["test_score"])
print(cv_results["test_score"].mean())

[ 0.63919994  0.71386698  0.58702344  0.07923081 -0.25294154]
0.35327592439588434


# Cross-Validation