# Linear regression

## Get data

In [1]:
from sklearn.datasets import load_boston

In [2]:
import  numpy as np
import matplotlib.pyplot as plt
%matplotlib inline



In [3]:
dataSet = load_boston()

In [4]:
print dataSet.keys()

['data', 'feature_names', 'DESCR', 'target']


In [5]:
print dataSet.DESCR

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [6]:
print dataSet.data.shape

(506, 13)


In [7]:
print dataSet.feature_names

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']


In [8]:
print dataSet.target.shape

(506,)


In [9]:
print np.max(dataSet.target), np.min(dataSet.target), np.mean(dataSet.target)

50.0 5.0 22.5328063241


In [10]:
dataSet.target = dataSet.target.reshape(-1,1)

## Split data and normalize it 

In [11]:
from sklearn.cross_validation import train_test_split

In [12]:
from sklearn.preprocessing import StandardScaler

In [13]:
X_train, X_test, y_train, y_test = train_test_split(dataSet.data, dataSet.target, test_size=0.25, random_state=33)

In [14]:
print y_train.shape, y_test.shape

(379, 1) (127, 1)


In [15]:
X_standardScaler = StandardScaler().fit(X_train)

In [16]:
y_standardScaler = StandardScaler().fit(y_train)

In [17]:
X_train = X_standardScaler.transform(X_train)
X_test = X_standardScaler.transform(X_test)

In [18]:
y_train = y_standardScaler.transform(y_train)
y_test = y_standardScaler.transform(y_test)

In [19]:
y_train = y_train.flatten()
y_test = y_test.flatten()

In [20]:
print y_train.shape

(379,)


## Evaluate model

In [21]:
from sklearn.cross_validation import KFold, cross_val_score

In [22]:
def train_evaluate(model, X_train, y_train,K):
    '''Use cross-validation to evaluate model'''
    
    model = model.fit(X_train, y_train)
    print "R2 on training:"
    print model.score(X_train, y_train)
    # k-fold cv
    cv = KFold(X_train.shape[0], K, shuffle=True, random_state=33)
    scores = cross_val_score(model, X_train, y_train, cv=cv)
    print "Average R2 using {}-fold cross-validation: ".format(K), np.mean(scores)

### Linear model

In [23]:
from sklearn.linear_model import SGDRegressor

**normal linear regression**

In [24]:
sgdRegressor = SGDRegressor(loss='squared_loss', penalty=None, random_state=42)

In [25]:
train_evaluate(sgdRegressor, X_train, y_train, 5)

R2 on training:
0.743617732983
Average R2 using 5-fold cross-validation:  0.710809853468


**ridge regression**

In [26]:
ridgeRegressor = SGDRegressor(loss="squared_loss", penalty='l2', random_state=42)

In [27]:
train_evaluate(ridgeRegressor, X_train, y_train, 5)

R2 on training:
0.743616743208
Average R2 using 5-fold cross-validation:  0.71081206667


In [28]:
print sum(ridgeRegressor.coef_ == 0)

0


**lasso regression**

In [57]:
lassoRegressor = SGDRegressor(loss="squared_loss", penalty="l1", random_state=42, alpha=0.005)

In [58]:
train_evaluate(lassoRegressor, X_train, y_train, 5)

R2 on training:
0.741824162315
Average R2 using 5-fold cross-validation:  0.708769237443


In [31]:
print sum(lassoRegressor.coef_ == 0)

0


## SVM for regression

In [32]:
from sklearn import svm

### Kernel-linear

In [33]:
svmLinearKernel = svm.SVR(kernel='linear')

In [34]:
train_evaluate(svmLinearKernel, X_train, y_train,5)

R2 on training:
0.71886923342
Average R2 using 5-fold cross-validation:  0.707838419194


### Kernel-poly

In [35]:
svmPolyKernel = svm.SVR(kernel='poly')

In [36]:
train_evaluate(svmPolyKernel, X_train, y_train, 5)

R2 on training:
0.904109273301
Average R2 using 5-fold cross-validation:  0.779288545488


### Kernel-rbf

In [37]:
svmRBFKernel = svm.SVR(kernel='rbf')

In [38]:
train_evaluate(svmRBFKernel, X_train, y_train, 5)

R2 on training:
0.900132065979
Average R2 using 5-fold cross-validation:  0.833662221567


## Random forest

**Extra tree**

In [39]:
from sklearn.ensemble import ExtraTreesRegressor

In [40]:
extraTreesRegressor = ExtraTreesRegressor(n_estimators=10,random_state=42)

In [41]:
train_evaluate(extraTreesRegressor, X_train, y_train, 5)

R2 on training:
1.0
Average R2 using 5-fold cross-validation:  0.861758978344


In [42]:
print np.sort(zip(extraTreesRegressor.feature_importances_, dataSet.feature_names), axis=0)

[['0.00504385320276' 'AGE']
 ['0.0151425137151' 'B']
 ['0.0170525784005' 'CHAS']
 ['0.0189418210858' 'CRIM']
 ['0.0236025617776' 'DIS']
 ['0.0257330490046' 'INDUS']
 ['0.0318741622351' 'LSTAT']
 ['0.0344056449393' 'NOX']
 ['0.0397131333452' 'PTRATIO']
 ['0.0466185213973' 'RAD']
 ['0.0995118014928' 'RM']
 ['0.284215227964' 'TAX']
 ['0.35814513144' 'ZN']]


## Evaluation

In [43]:
from sklearn import metrics

In [44]:
def measure_performance(model, X_test, y_test):
    y_pred = model.predict(X_test)
    print "R2:{0:.3f}".format(metrics.r2_score(y_test, y_pred))

In [46]:
measure_performance(sgdRegressor, X_test, y_test)

R2:0.650


In [47]:
measure_performance(ridgeRegressor, X_test, y_test)

R2:0.650


In [48]:
measure_performance(lassoRegressor, X_test,y_test)

R2:0.650


In [49]:
measure_performance(svmLinearKernel, X_test, y_test)
measure_performance(svmPolyKernel, X_test, y_test)
measure_performance(svmRBFKernel, X_test, y_test)

R2:0.652
R2:0.404
R2:0.756


In [50]:
measure_performance(extraTreesRegressor, X_test, y_test)

R2:0.802
