## 线性回归
## 1. 我们先使用sklearn工具包进行线性回归

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model, metrics # note: sklearn is not enough for linear regression, 
                                                    # will introduce more packages later 

### Where to get data:
1. UCI datasets -- usually 10-100 MB; small and well suited to practice at an early stage
2. Kaggle datasets -- large industrial datasets that usually require cloud computing
3. 国内: 天池, 腾讯, 百度, 京东, DataCastle (third party)

### 导入数据


In [19]:
# Load the data.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 —
# confirm the installed scikit-learn version still provides it.
boston = datasets.load_boston() # Boston real-estate (housing) price dataset

In [4]:
# X: the explanatory factors (feature matrix); y: the 1D array of real-estate prices.
X, y = boston.data, boston.target

### 使用的自变量有哪些？

In [5]:
# Which independent variables (features) are used?
boston.feature_names

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'],
      dtype='<U7')

In [6]:
# Show the whole dataset object (its entries include 'data', 'feature_names' and 'target').
boston

 'data': array([[  6.32000000e-03,   1.80000000e+01,   2.31000000e+00, ...,
           1.53000000e+01,   3.96900000e+02,   4.98000000e+00],
        [  2.73100000e-02,   0.00000000e+00,   7.07000000e+00, ...,
           1.78000000e+01,   3.96900000e+02,   9.14000000e+00],
        [  2.72900000e-02,   0.00000000e+00,   7.07000000e+00, ...,
           1.78000000e+01,   3.92830000e+02,   4.03000000e+00],
        ..., 
        [  6.07600000e-02,   0.00000000e+00,   1.19300000e+01, ...,
           2.10000000e+01,   3.96900000e+02,   5.64000000e+00],
        [  1.09590000e-01,   0.00000000e+00,   1.19300000e+01, ...,
           2.10000000e+01,   3.93450000e+02,   6.48000000e+00],
        [  4.74100000e-02,   0.00000000e+00,   1.19300000e+01, ...,
           2.10000000e+01,   3.96900000e+02,   7.88000000e+00]]),
 'feature_names': array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
        'TAX', 'PTRATIO', 'B', 'LSTAT'],
       dtype='<U7'),
 'target': array([ 24. ,  21.6, 

In [7]:
# DESCR is the dataset's long text description. Print it so that line breaks are
# rendered as real newlines; a bare expression would show the string's repr,
# i.e. one long line full of escaped "\n" sequences.
print(boston.DESCR)



### 分离训练集与测试集

In [8]:
# Hold out 30% of the samples as a test set; random_state pins the shuffle so the
# split is reproducible across runs.
# train_test_split is imported from sklearn.model_selection: the old
# sklearn.cross_validation module was deprecated in scikit-learn 0.18 and
# removed in 0.20, so importing from it fails on current releases.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=1)



### 建立线性回归对象

In [9]:
# Create the linear regression estimator with scikit-learn's default settings.
lm = linear_model.LinearRegression()

### 训练，拟合

In [10]:
# Fit the model on the training split only; the test split stays held out.
lm.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

### 查看参数

In [11]:
# Fitted coefficients (the output lists 13 values, matching the 13 feature names).
lm.coef_

array([ -9.71284334e-02,   6.07284394e-02,   5.96370092e-02,
         2.44352809e+00,  -2.14995617e+01,   2.78993012e+00,
         3.66229013e-03,  -1.51568850e+00,   3.06819943e-01,
        -1.12697744e-02,  -1.00652372e+00,   6.56942407e-03,
        -5.69755469e-01])

## 2. 使用统计包 statsmodels (more specialized for statistical analysis)

In [12]:
import statsmodels.api as sm

  from pandas.core import datetools


In [15]:
# Feature names again — they are used as the DataFrame column labels in the next cell.
boston.feature_names

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'],
      dtype='<U7')

In [20]:
# Wrap the raw feature matrix in a DataFrame whose columns carry the feature names
# (same values as X = boston.data; the target stays in boston.target).
df = pd.DataFrame(data=boston.data, columns=boston.feature_names)
df.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [24]:
# Rebind y: the target values as a one-column DataFrame labelled 'MEDV'
# (this replaces the plain array assigned to y earlier).
y = pd.DataFrame(data=boston.target, columns=['MEDV'])
y.head(5)

Unnamed: 0,MEDV
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


In [26]:
# Rebind X to just two predictors, RM and CRIM.
# IMPORTANT: statsmodels does not add an intercept for you — the constant column
# must be added explicitly, giving the model y = beta0 + beta1*RM + beta2*CRIM.
X = sm.add_constant(df[['RM', 'CRIM']])

# Ordinary least squares fit of y on the constant plus the two predictors.
model = sm.OLS(endog=y, exog=X).fit()

### 查看结果

In [27]:
# Display the OLS regression summary (fit statistics, coefficient table, diagnostics).
model.summary()

0,1,2,3
Dep. Variable:,MEDV,R-squared:,0.541
Model:,OLS,Adj. R-squared:,0.539
Method:,Least Squares,F-statistic:,295.9
Date:,"Fri, 01 Jun 2018",Prob (F-statistic):,1.15e-85
Time:,13:24:34,Log-Likelihood:,-1643.5
No. Observations:,506,AIC:,3293.0
Df Residuals:,503,BIC:,3306.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-29.3017,2.592,-11.303,0.000,-34.395,-24.208
RM,8.3975,0.406,20.706,0.000,7.601,9.194
CRIM,-0.2618,0.033,-7.899,0.000,-0.327,-0.197

0,1,2,3
Omnibus:,170.471,Durbin-Watson:,0.805
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1034.461
Skew:,1.331,Prob(JB):,2.3399999999999998e-225
Kurtosis:,9.479,Cond. No.,92.2
