# 一元线性回归分析的基本理论

## 简单的一元OLS回归

动量效应（Momentum effect）一般又称“惯性效应”。动量效应是由Jegadeesh和Titman（1993）提出的，是指股票的收益率有延续原来的运动方向的趋势，即过去一段时间收益率较高的股票在未来获得的收益率仍会高于过去收益率较低的股票。

在下面的模型里面，我们考虑中国股票市场收益率在时间层面上的动量效应。

In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt #画图模块
from datetime import datetime #时间模块
import scipy.stats as stats # 统计模块
import scipy
from IPython.core.interactiveshell import InteractiveShell
import statsmodels.formula.api as smf  # OLS regression

#输出矢量图 渲染矢量图 魔法函数（Magic Functions）内嵌绘图
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

#显示每一个运行结果
InteractiveShell.ast_node_interactivity = 'all'

#输出或者打印的时候，不限制列或者行
pd.set_option('display.max_columns',None)

In [20]:
data = pd.read_csv('C:/Users/hp/Desktop/Python/Python-4/000001.csv')
data['Day'] = pd.to_datetime(data['Day'], format = '%Y-%m-%d')
data.set_index('Day', inplace = True)
data.sort_values(by = ['Day'], axis=0, ascending=True)
data

Unnamed: 0_level_0,Preclose,Open,Highest,Lowest,Close,Volume,Money
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1990-12-19,,96.05,99.98,95.79,99.98,126000,4.940000e+05
1990-12-20,99.98,104.30,104.39,99.98,104.39,19700,8.400000e+04
1990-12-21,104.39,109.07,109.13,103.73,109.13,2800,1.600000e+04
1990-12-24,109.13,113.57,114.55,109.13,114.55,3200,3.100000e+04
1990-12-25,114.55,120.09,120.25,114.55,120.25,1500,6.000000e+03
...,...,...,...,...,...,...,...
2022-07-25,3269.97,3269.71,3273.18,3243.03,3250.39,27124574400,3.480000e+11
2022-07-26,3250.39,3254.19,3282.41,3246.04,3277.44,25946867600,3.340000e+11
2022-07-27,3277.44,3271.78,3282.57,3265.73,3275.76,24913148500,3.400000e+11
2022-07-28,3275.76,3287.50,3305.71,3277.11,3282.58,28805505600,3.960000e+11


Unnamed: 0_level_0,Preclose,Open,Highest,Lowest,Close,Volume,Money
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1990-12-19,,96.05,99.98,95.79,99.98,126000,4.940000e+05
1990-12-20,99.98,104.30,104.39,99.98,104.39,19700,8.400000e+04
1990-12-21,104.39,109.07,109.13,103.73,109.13,2800,1.600000e+04
1990-12-24,109.13,113.57,114.55,109.13,114.55,3200,3.100000e+04
1990-12-25,114.55,120.09,120.25,114.55,120.25,1500,6.000000e+03
...,...,...,...,...,...,...,...
2022-07-25,3269.97,3269.71,3273.18,3243.03,3250.39,27124574400,3.480000e+11
2022-07-26,3250.39,3254.19,3282.41,3246.04,3277.44,25946867600,3.340000e+11
2022-07-27,3277.44,3271.78,3282.57,3265.73,3275.76,24913148500,3.400000e+11
2022-07-28,3275.76,3287.50,3305.71,3277.11,3282.58,28805505600,3.960000e+11


In [21]:
daily_data = data['1995-01':'2022-07'].copy()
daily_data['Close'] = pd.to_numeric(daily_data['Close'])
daily_data['Preclose'] = pd.to_numeric(daily_data['Preclose'])
daily_data['Raw_return'] = daily_data['Close'] / daily_data['Preclose'] - 1
daily_data['Log_return'] = np.log(daily_data['Close']) - np.log(daily_data['Preclose'])
daily_data

Unnamed: 0_level_0,Preclose,Open,Highest,Lowest,Close,Volume,Money,Raw_return,Log_return
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1995-01-03,647.87,637.72,647.71,630.53,639.88,23451800,1.806930e+08,-0.012333,-0.012409
1995-01-04,639.88,641.90,655.51,638.86,653.81,42222000,3.069230e+08,0.021770,0.021536
1995-01-05,653.81,655.38,657.52,645.81,646.89,43012300,3.015330e+08,-0.010584,-0.010641
1995-01-06,646.89,642.75,643.89,636.33,640.76,48748200,3.537580e+08,-0.009476,-0.009521
1995-01-09,640.76,637.52,637.55,625.04,626.00,50985100,3.985190e+08,-0.023035,-0.023305
...,...,...,...,...,...,...,...,...,...
2022-07-25,3269.97,3269.71,3273.18,3243.03,3250.39,27124574400,3.480000e+11,-0.005988,-0.006006
2022-07-26,3250.39,3254.19,3282.41,3246.04,3277.44,25946867600,3.340000e+11,0.008322,0.008288
2022-07-27,3277.44,3271.78,3282.57,3265.73,3275.76,24913148500,3.400000e+11,-0.000513,-0.000513
2022-07-28,3275.76,3287.50,3305.71,3277.11,3282.58,28805505600,3.960000e+11,0.002082,0.002080


In [22]:
Month_data = daily_data.resample('M')['Log_return'].sum().to_frame()  
Month_data['Raw_return'] = np.exp(Month_data['Log_return'])-1
Month_data

Unnamed: 0_level_0,Log_return,Raw_return
Day,Unnamed: 1_level_1,Unnamed: 2_level_1
1995-01-31,-0.141139,-0.131631
1995-02-28,-0.023979,-0.023694
1995-03-31,0.163651,0.177803
1995-04-30,-0.109315,-0.103552
1995-05-31,0.188901,0.207922
...,...,...
2022-03-31,-0.062604,-0.060685
2022-04-30,-0.065154,-0.063077
2022-05-31,0.044724,0.045739
2022-06-30,0.064468,0.066592


In [23]:
Quarter_data = daily_data.resample('Q')['Log_return'].sum().to_frame()  
Quarter_data['Raw_return'] = np.exp(Quarter_data['Log_return'])-1
Quarter_data

Unnamed: 0_level_0,Log_return,Raw_return
Day,Unnamed: 1_level_1,Unnamed: 2_level_1
1995-03-31,-0.001467,-0.001466
1995-06-30,-0.025583,-0.025258
1995-09-30,0.135980,0.145660
1995-12-31,-0.263130,-0.231358
1996-03-31,0.001979,0.001981
...,...,...
2021-09-30,-0.006434,-0.006413
2021-12-31,0.019870,0.020069
2022-03-31,-0.112592,-0.106484
2022-06-30,0.044038,0.045022


In [24]:
Year_data = daily_data.resample('Y')['Log_return'].sum().to_frame()  
Year_data['Raw_return'] = np.exp(Year_data['Log_return'])-1
Year_data

Unnamed: 0_level_0,Log_return,Raw_return
Day,Unnamed: 1_level_1,Unnamed: 2_level_1
1995-12-31,-0.1542,-0.142899
1996-12-31,0.501639,0.651425
1997-12-31,0.264019,0.302153
1998-12-31,-0.040505,-0.039695
1999-12-31,0.175423,0.19175
2000-12-31,0.416917,0.517277
2001-12-31,-0.230898,-0.20618
2002-12-31,-0.192575,-0.175167
2003-12-31,0.097735,0.10267
2004-12-31,-0.167233,-0.153997


In [25]:
Month_data['lag_Raw_return'] = Month_data['Raw_return'].shift(1)
Month_data

Unnamed: 0_level_0,Log_return,Raw_return,lag_Raw_return
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1995-01-31,-0.141139,-0.131631,
1995-02-28,-0.023979,-0.023694,-0.131631
1995-03-31,0.163651,0.177803,-0.023694
1995-04-30,-0.109315,-0.103552,0.177803
1995-05-31,0.188901,0.207922,-0.103552
...,...,...,...
2022-03-31,-0.062604,-0.060685,0.030008
2022-04-30,-0.065154,-0.063077,-0.060685
2022-05-31,0.044724,0.045739,-0.063077
2022-06-30,0.064468,0.066592,0.045739


In [27]:
Quarter_data['lag_Raw_return'] = Quarter_data['Raw_return'].shift(1)
Quarter_data

Unnamed: 0_level_0,Log_return,Raw_return,lag_Raw_return
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1995-03-31,-0.001467,-0.001466,
1995-06-30,-0.025583,-0.025258,-0.001466
1995-09-30,0.135980,0.145660,-0.025258
1995-12-31,-0.263130,-0.231358,0.145660
1996-03-31,0.001979,0.001981,-0.231358
...,...,...,...
2021-09-30,-0.006434,-0.006413,0.043374
2021-12-31,0.019870,0.020069,-0.006413
2022-03-31,-0.112592,-0.106484,0.020069
2022-06-30,0.044038,0.045022,-0.106484


In [28]:
daily_data['lag_Raw_return'] = daily_data['Raw_return'].shift(1)
daily_data

Unnamed: 0_level_0,Preclose,Open,Highest,Lowest,Close,Volume,Money,Raw_return,Log_return,lag_Raw_return
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1995-01-03,647.87,637.72,647.71,630.53,639.88,23451800,1.806930e+08,-0.012333,-0.012409,
1995-01-04,639.88,641.90,655.51,638.86,653.81,42222000,3.069230e+08,0.021770,0.021536,-0.012333
1995-01-05,653.81,655.38,657.52,645.81,646.89,43012300,3.015330e+08,-0.010584,-0.010641,0.021770
1995-01-06,646.89,642.75,643.89,636.33,640.76,48748200,3.537580e+08,-0.009476,-0.009521,-0.010584
1995-01-09,640.76,637.52,637.55,625.04,626.00,50985100,3.985190e+08,-0.023035,-0.023305,-0.009476
...,...,...,...,...,...,...,...,...,...,...
2022-07-25,3269.97,3269.71,3273.18,3243.03,3250.39,27124574400,3.480000e+11,-0.005988,-0.006006,-0.000620
2022-07-26,3250.39,3254.19,3282.41,3246.04,3277.44,25946867600,3.340000e+11,0.008322,0.008288,-0.005988
2022-07-27,3277.44,3271.78,3282.57,3265.73,3275.76,24913148500,3.400000e+11,-0.000513,-0.000513,0.008322
2022-07-28,3275.76,3287.50,3305.71,3277.11,3282.58,28805505600,3.960000e+11,0.002082,0.002080,-0.000513


Model:

$ r_t = α+β∗r_(t−1)+ϵ_t $

where rt is the raw return of stock market on month t.

H0: β=0

H1: β≠0

In [26]:
# 使用t-1月的收益率预测t月的收益率
# Newey West 检验 自相关
# 一般在日数据里面，lag12，月lag6，季度lag2

model1_mom = smf.ols('Raw_return ~ lag_Raw_return',
                data=Month_data['2000-01':'2022-07']).fit(
                    cov_type='HAC',cov_kwds={'maxlags':6})
print(model1_mom.summary())

                            OLS Regression Results                            
Dep. Variable:             Raw_return   R-squared:                       0.016
Model:                            OLS   Adj. R-squared:                  0.012
Method:                 Least Squares   F-statistic:                     3.114
Date:                Tue, 25 Oct 2022   Prob (F-statistic):             0.0787
Time:                        09:11:08   Log-Likelihood:                 329.40
No. Observations:                 271   AIC:                            -654.8
Df Residuals:                     269   BIC:                            -647.6
Df Model:                           1                                         
Covariance Type:                  HAC                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.0051      0.005      1.

In [29]:
model2_mom = smf.ols('Raw_return ~ lag_Raw_return',
                 data=Quarter_data['2000-01':'2022-07']).fit(
                     cov_type='HAC', cov_kwds={'maxlags': 2})
print(model2_mom.summary())

                            OLS Regression Results                            
Dep. Variable:             Raw_return   R-squared:                       0.026
Model:                            OLS   Adj. R-squared:                  0.015
Method:                 Least Squares   F-statistic:                     2.207
Date:                Tue, 25 Oct 2022   Prob (F-statistic):              0.141
Time:                        09:24:05   Log-Likelihood:                 45.496
No. Observations:                  90   AIC:                            -86.99
Df Residuals:                      88   BIC:                            -81.99
Df Model:                           1                                         
Covariance Type:                  HAC                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.0176      0.016      1.

In [30]:
model3_mom = smf.ols('Raw_return ~ lag_Raw_return',
                 data=daily_data['2000-01':'2022-07']).fit(
                     cov_type='HAC', cov_kwds={'maxlags': 2})
print(model3_mom.summary())

                            OLS Regression Results                            
Dep. Variable:             Raw_return   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.204
Date:                Tue, 25 Oct 2022   Prob (F-statistic):              0.273
Time:                        09:24:20   Log-Likelihood:                 15158.
No. Observations:                5470   AIC:                        -3.031e+04
Df Residuals:                    5468   BIC:                        -3.030e+04
Df Model:                           1                                         
Covariance Type:                  HAC                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.0003      0.000      1.

## 整合结果

In [31]:
from statsmodels.iolib.summary2 import summary_col

info_dict = {'No. observations': lambda x: f"{int(x.nobs):d}"}

results_table = summary_col(results=[model1_mom, model2_mom, model3_mom],
                            float_format='%0.3f',     #数据显示的格式，默认四位小数
                            stars=True,     # 是否有*，True为有
                            model_names=['Daily MOM', 'Month MOM', 'Quarter MOM'],
                            info_dict=info_dict,
                            regressor_order=['Intercept', 'lag_Raw_Return'])

results_table.add_title(
    'Table - OLS Regressions: Forecast Stock Market Return')

print(results_table)

Table - OLS Regressions: Forecast Stock Market Return
                 Daily MOM Month MOM Quarter MOM
------------------------------------------------
Intercept        0.005     0.018     0.000      
                 (0.005)   (0.016)   (0.000)    
lag_Raw_return   0.127*    0.162     0.021      
                 (0.072)   (0.109)   (0.019)    
R-squared        0.016     0.026     0.000      
R-squared Adj.   0.012     0.015     0.000      
No. observations 271       90        5470       
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01
