forked from statsmodels/statsmodels
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ex_feasible_gls_het_0.py
174 lines (142 loc) · 6.3 KB
/
ex_feasible_gls_het_0.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# -*- coding: utf-8 -*-
"""Examples for linear model with heteroscedasticity estimated by feasible GLS
These are examples to check the results during developement.
The assumptions:
We have a linear model y = X*beta where the variance of an observation depends
on some explanatory variable Z (`exog_var`).
linear_model.WLS estimated the model for a given weight matrix
here we want to estimate also the weight matrix by two step or iterative WLS
Created on Wed Dec 21 12:28:17 2011
Author: Josef Perktold
"""
from __future__ import print_function
import numpy as np
from numpy.testing import assert_almost_equal
from statsmodels.regression.linear_model import OLS, WLS, GLS
from statsmodels.regression.feasible_gls import GLSHet, GLSHet2
from statsmodels.tools.tools import add_constant
examples = ['ex1']
if 'ex1' in examples:
nsample = 300 #different pattern last graph with 100 or 200 or 500
sig = 0.5
np.random.seed(9876789) #9876543)
X = np.random.randn(nsample, 3)
X = np.column_stack((np.ones((nsample,1)), X))
beta = [1, 0.5, -0.5, 1.]
y_true2 = np.dot(X, beta)
x1 = np.linspace(0, 1, nsample)
gamma = np.array([1, 3.])
#with slope 3 instead of two, I get negative weights, Not correct
# - was misspecified, but the negative weights are still possible with identity link
#gamma /= gamma.sum() #normalize assuming x1.max is 1
z_true = add_constant(x1)
winv = np.dot(z_true, gamma)
het_params = sig**2 * np.array([1, 3.]) # for squared
sig2_het = sig**2 * winv
weights_dgp = 1/winv
weights_dgp /= weights_dgp.max() #should be already normalized - NOT check normalization
#y2[:nsample*6/10] = y_true2[:nsample*6/10] + sig*1. * np.random.normal(size=nsample*6/10)
z0 = np.zeros(nsample)
z0[(nsample * 5)//10:] = 1 #dummy for 2 halfs of sample
z0 = add_constant(z0)
z1 = add_constant(x1)
noise = np.sqrt(sig2_het) * np.random.normal(size=nsample)
y2 = y_true2 + noise
X2 = X[:,[0,2]] #misspecified, missing regressor in main equation
X2 = X #correctly specigied
res_ols = OLS(y2, X2).fit()
print('OLS beta estimates')
print(res_ols.params)
print('OLS stddev of beta')
print(res_ols.bse)
print('\nWLS')
mod0 = GLSHet2(y2, X2, exog_var=winv)
res0 = mod0.fit()
print('new version')
mod1 = GLSHet(y2, X2, exog_var=winv)
res1 = mod1.iterative_fit(2)
print('WLS beta estimates')
print(res1.params)
print(res0.params)
print('WLS stddev of beta')
print(res1.bse)
#compare with previous version GLSHet2, refactoring check
#assert_almost_equal(res1.params, np.array([ 0.37642521, 1.51447662]))
#this fails ??? more iterations? different starting weights?
print(res1.model.weights/res1.model.weights.max())
#why is the error so small in the estimated weights ?
assert_almost_equal(res1.model.weights/res1.model.weights.max(), weights_dgp, 14)
print('residual regression params')
print(res1.results_residual_regression.params)
print('scale of model ?')
print(res1.scale)
print('unweighted residual variance, note unweighted mean is not zero')
print(res1.resid.var())
#Note weighted mean is zero:
#(res1.model.weights * res1.resid).mean()
doplots = True #False
if doplots:
import matplotlib.pyplot as plt
plt.figure()
plt.plot(x1, y2, 'o')
plt.plot(x1, y_true2, 'b-', label='true')
plt.plot(x1, res1.fittedvalues, 'r-', label='fwls')
plt.plot(x1, res_ols.fittedvalues, '--', label='ols')
plt.legend()
#the next only works if w has finite support, discrete/categorical
#z = (w[:,None] == [1,4]).astype(float) #dummy variable
#z = (w0[:,None] == np.unique(w0)).astype(float) #dummy variable
#changed z0 contains dummy and constant
mod2 = GLSHet(y2, X2, exog_var=z0)
res2 = mod2.iterative_fit(3)
print(res2.params)
import statsmodels.api as sm
#z = sm.add_constant(w, prepend=True)
z = sm.add_constant(x1/x1.max())
mod3 = GLSHet(y2, X2, exog_var=z1)#, link=sm.families.links.log())
res3 = mod3.iterative_fit(20)
error_var_3 = res3.mse_resid/res3.model.weights
print(res3.params)
print("np.array(res3.model.history['ols_params'])")
print(np.array(res3.model.history['ols_params']))
print("np.array(res3.model.history['self_params'])")
print(np.array(res3.model.history['self_params']))
#Models 2 and 3 are equivalent with different parameterization of Z
print(np.unique(res2.model.weights)) #for discrete z only, only a few uniques
print(np.unique(res3.model.weights))
print(res3.summary())
print('\n\nResults of estimation of weights')
print('--------------------------------')
print(res3.results_residual_regression.summary())
if doplots:
plt.figure()
plt.plot(x1, y2, 'o')
plt.plot(x1, y_true2, 'b-', label='true')
plt.plot(x1, res1.fittedvalues, '-', label='fwls1')
plt.plot(x1, res2.fittedvalues, '-', label='fwls2')
plt.plot(x1, res3.fittedvalues, '-', label='fwls3')
plt.plot(x1, res_ols.fittedvalues, '--', label='ols')
plt.legend()
plt.figure()
plt.ylim(0, 5)
res_e2 = OLS(noise**2, z).fit()
plt.plot(noise**2, 'bo', alpha=0.5, label='dgp error**2')
plt.plot(res_e2.fittedvalues, lw=2, label='ols for noise**2')
#plt.plot(res3.model.weights, label='GLSHet weights')
plt.plot(error_var_3, lw=2, label='GLSHet error var')
plt.plot(res3.resid**2, 'ro', alpha=0.5, label='resid squared')
#plt.plot(weights_dgp, label='DGP weights')
plt.plot(sig**2 * winv, lw=2, label='DGP error var')
plt.legend()
plt.show()
'''Note these are close but maybe biased because of skewed distribution
>>> res3.mse_resid/res3.model.weights[-10:]
array([ 1.03115871, 1.03268209, 1.03420547, 1.03572885, 1.03725223,
1.03877561, 1.04029899, 1.04182237, 1.04334575, 1.04486913])
>>> res_e2.fittedvalues[-10:]
array([ 1.0401953 , 1.04171386, 1.04323242, 1.04475098, 1.04626954,
1.0477881 , 1.04930666, 1.05082521, 1.05234377, 1.05386233])
>>> sig**2 * w[-10:]
array([ 0.98647295, 0.98797595, 0.98947896, 0.99098196, 0.99248497,
0.99398798, 0.99549098, 0.99699399, 0.99849699, 1. ])
'''