In [62]:
import pandas as pd
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import numpy as np
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm
from sklearn.metrics import r2_score

In [63]:
House_Price = pd.read_csv('Data/HousePrice.csv')
Scores = pd.read_csv('Data/Scores.csv')

In [64]:
Selected_Columns = ['Local authority code', 'LSOA code', 'LSOA name', 'Year ending Mar 2019',
                     'Year ending Jun 2019', 'Year ending Sep 2019', 'Year ending Dec 2019']

House_Price_2019 = House_Price[Selected_Columns]

House_Price_2019 = House_Price_2019.rename(columns={
    'Local authority code': 'Local_Authority_Code',
    'LSOA code': 'LSOA_Code',
    'LSOA name': 'LSOA_Name',
    'Year ending Mar 2019': 'Mar_2019',
    'Year ending Jun 2019': 'Jun_2019',
    'Year ending Sep 2019': 'Sep_2019',
    'Year ending Dec 2019': 'Dec_2019'
})

House_Price_2019['Mar_2019'] = pd.to_numeric(House_Price_2019['Mar_2019'].str.replace(',', ''), errors='coerce')
House_Price_2019['Jun_2019'] = pd.to_numeric(House_Price_2019['Jun_2019'].str.replace(',', ''), errors='coerce')
House_Price_2019['Sep_2019'] = pd.to_numeric(House_Price_2019['Sep_2019'].str.replace(',', ''), errors='coerce')
House_Price_2019['Dec_2019'] = pd.to_numeric(House_Price_2019['Dec_2019'].str.replace(',', ''), errors='coerce')

House_Price_2019['Year_2019'] = House_Price_2019[['Mar_2019', 'Jun_2019', 'Sep_2019', 'Dec_2019']].mean(axis=1, skipna=False).fillna(0)

House_Price_2019 = House_Price_2019[House_Price_2019['Year_2019']!=0]

In [65]:
Scores_Filtered = Scores[['LSOA code (2011)', 'Income Score (rate)', 'Education, Skills and Training Score', 'Crime Score', 'Barriers to Housing and Services Score', 'Living Environment Score']]

Scores_Filtered = Scores_Filtered.rename(columns={
    'LSOA code (2011)': 'LSOA_Code',
    'Income Score (rate)': 'Income_Score',
    'Education, Skills and Training Score': 'Education_Score',
    'Barriers to Housing and Services Score': 'Barriers_Score',
    'Crime Score': 'Crime_Score',
    'Living Environment Score': 'Living_Score'
})

In [66]:
Merged_df = pd.merge(House_Price_2019, Scores_Filtered, on='LSOA_Code')

Merged_df_Filtered = Merged_df[['Year_2019', 'Income_Score', 'Education_Score', 'Crime_Score', 'Barriers_Score', 'Living_Score']]

In [67]:
Merged_df_Filered_log = Merged_df_Filtered.copy()

Merged_df_Filered_log['Year_2019'] = np.log(Merged_df_Filered_log['Year_2019'])

Merged_df_Filered_log['Education_Score'] = Merged_df_Filered_log['Education_Score'] - Merged_df_Filered_log['Education_Score'].min() + 1
Merged_df_Filered_log['Education_Score'] = np.log(Merged_df_Filered_log['Education_Score'])

In [70]:
Model3_Data = Merged_df_Filered_log.copy()

Model3_Data['Crose_With_Education_and_Income'] = Model3_Data['Income_Score'] * Model3_Data['Education_Score']
Model3_Data['Crose_With_Living_and_Income'] = Model3_Data['Income_Score'] * Model3_Data['Living_Score']
Model3_Data['Crose_With_Crime_and_Income'] = Model3_Data['Income_Score'] * Model3_Data['Crime_Score']
Model3_Data['Crose_With_Barriers_and_Income'] = Model3_Data['Income_Score'] * Model3_Data['Barriers_Score']
#Model3_Data['Income_Square'] = Model3_Data['Income_Score'] * Model3_Data['Income_Score']
Model3_Data['Crose_With_Barriers_and_Living'] = Model3_Data['Living_Score'] * Model3_Data['Barriers_Score']
Model3_Data['Crose_With_Barriers_and_Crime'] = Model3_Data['Crime_Score'] * Model3_Data['Barriers_Score']
Model3_Data['Crose_With_Barriers_and_Education'] = Model3_Data['Education_Score'] * Model3_Data['Barriers_Score']
Model3_Data['Crose_With_Living_and_Education'] = Model3_Data['Education_Score'] * Model3_Data['Living_Score']
Model3_Data['Crose_With_Living_and_Crime'] = Model3_Data['Living_Score'] * Model3_Data['Crime_Score']

Model3_Data['Crose_With_Education_and_Crime'] = Model3_Data['Education_Score'] * Model3_Data['Crime_Score']


Model3_Data['Education_Square'] = Model3_Data['Education_Score'] * Model3_Data['Education_Score']
Model3_Data['Crime_Square'] = Model3_Data['Crime_Score'] * Model3_Data['Crime_Score']
Model3_Data['Barriers_Square'] = Model3_Data['Barriers_Score'] * Model3_Data['Barriers_Score']
Model3_Data['Living_Square'] = Model3_Data['Living_Score'] * Model3_Data['Living_Score']

In [71]:
#Independent_Variables = Model3_Data[['Crose_With_Education_and_Income', 'Crose_With_Living_and_Income', 'Crose_With_Crime_and_Income', 'Crose_With_Barriers_and_Income',
#                                      'Crose_With_Barriers_and_Living', 'Crose_With_Barriers_and_Crime', 'Crose_With_Barriers_and_Education',
#                                      'Income_Score', 'Education_Score', 'Crime_Score', 'Barriers_Score', 'Living_Score']]

# Independent_Variables = Model3_Data[['Crose_With_Living_and_Income', 'Crose_With_Crime_and_Income',
#                                     'Crose_With_Barriers_and_Education',
#                                       'Income_Score', 'Education_Score', 'Barriers_Score', 'Living_Score']]

Independent_Variables = Model3_Data[['Crose_With_Education_and_Crime',
                                  'Crose_With_Barriers_and_Education', 'Crose_With_Barriers_and_Living',
                                  'Crose_With_Education_and_Income', 'Crose_With_Crime_and_Income',
                                  'Income_Score', 'Education_Score', 'Crime_Score', 'Barriers_Score', 'Living_Score']]

#Independent_Variables = Model3_Data[['Crime_Square', 'Education_Square', 'Crose_With_Barriers_and_Living', 'Crose_With_Education_and_Income', 'Income_Score', 'Barriers_Score', 'Living_Score']]

#Independent_Variables = Model3_Data[['Crime_Square', 'Education_Square', 'Crose_With_Barriers_and_Living', 'Income_Score', 'Barriers_Score', 'Living_Score']]

Dependent_Variable = Model3_Data[['Year_2019']]

Independent_Variables_Train, Independent_Variables_Test, Dependent_Variable_Train, Dependent_Variable_Test = train_test_split(Independent_Variables, Dependent_Variable, test_size=0.2, random_state=42)

In [72]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(Independent_Variables_Train)
X_test_scaled = scaler.transform(Independent_Variables_Test)
y_train_scales = scaler.fit_transform(Dependent_Variable_Train)
y_test_scales = scaler.fit_transform(Dependent_Variable_Test)

In [73]:
elastic_net = ElasticNet()

# 定义超参数空间
param_grid = {
    'alpha': [0.1, 0.25, 0.5, 0.75, 1.0],
    'l1_ratio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}

# 创建GridSearchCV对象
grid_search = GridSearchCV(estimator=elastic_net, param_grid=param_grid, cv=3)

# 在训练集上拟合模型并搜索最佳超参数
grid_search.fit(X_train_scaled, y_train_scales)

# 打印最佳超参数
print("最佳超参数:", grid_search.best_params_)

最佳超参数: {'alpha': 0.1, 'l1_ratio': 0.1}


In [74]:
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.1, random_state=42)
elastic_net.fit(X_train_scaled, y_train_scales)
y_pred = elastic_net.predict(X_test_scaled)
mse = mean_squared_error(Dependent_Variable_Test.values, y_pred)
print("均方误差 (MSE):", mse)

#print(y_pred.shape)

#r_squared = elastic_net.score(y_pred, Dependent_Variable_Test.values)
#r_squared = r2_score(Dependent_Variable_Test.values, y_pred)

#print("决定系数 (R方):", r_squared)
elastic_net.coef_

均方误差 (MSE): 172.29956952039902


array([-0.        , -0.17934041, -0.        , -0.        ,  0.03406411,
       -0.08287617, -0.37616226,  0.        , -0.0366583 ,  0.18726305])

In [41]:
# 弹性网络模型的系数
coef_elastic_net = elastic_net.coef_

# 添加截距项
X_with_intercept = sm.add_constant(Independent_Variables)

# 使用statsmodels进行显著性测试
model = sm.OLS(Dependent_Variable, X_with_intercept).fit()

# 打印每个系数及其对应的p-value
for i, coef in enumerate(coef_elastic_net):
    p_val = model.pvalues[i + 1]  # 使用 model.pvalues 获取每个系数的p-value
    print(f'Coefficient {i + 1}: {coef}, p-value: {p_val}')

Coefficient 1: 0.0411899657364987, p-value: 1.240960043321746e-06
Coefficient 2: -0.4293264861974172, p-value: 8.427316298391162e-169
Coefficient 3: -0.03421204897934483, p-value: 1.1637871619154208e-20
Coefficient 4: -0.0988068954218, p-value: 2.9550992659557285e-10
Coefficient 5: -0.12379959446890754, p-value: 4.552801958892138e-06
Coefficient 6: 0.21332246397815294, p-value: 8.841611631695323e-41


In [22]:
columns_to_normalize = ['Income_Score', 'Education_Score', 'Crime_Score', 'Barriers_Score', 'Living_Score', 'Year_2019']

# 创建 MinMaxScaler 实例
scaler = MinMaxScaler()

# 在 Merged_df_Filtered 上拟合并转换数据
Merged_df_Filtered_Normalised = Merged_df_Filtered.copy()
Merged_df_Filtered_Normalised[columns_to_normalize] = scaler.fit_transform(Merged_df_Filtered[columns_to_normalize])


Independent_Variables = Merged_df_Filtered_Normalised[['Income_Score', 'Education_Score', 'Crime_Score', 'Barriers_Score', 'Living_Score']]
Dependent_Variable = Merged_df_Filtered_Normalised['Year_2019']

# 添加截距项
X_with_intercept = np.c_[np.ones(Independent_Variables.shape[0]), Independent_Variables]

# 计算模型参数
params = np.linalg.lstsq(X_with_intercept, Dependent_Variable, rcond=None)[0]
n = len(Dependent_Variable)
p = X_with_intercept.shape[1] - 1

# 残差平方和
residuals = Dependent_Variable - np.dot(X_with_intercept, params)
sse = np.sum(residuals ** 2)

# 估计的标准误差
se = np.sqrt(sse / (n - p - 1))

# 参数的标准误差
params_se = np.sqrt(np.diag(np.linalg.inv(np.dot(X_with_intercept.T, X_with_intercept)) * se**2))

# t统计量
t_values = params / params_se

# p-value
p_values = 2 * (1 - stats.t.cdf(np.abs(t_values), n - p - 1))

# 打印系数和对应的p-value
for i, (coef, p_val) in enumerate(zip(params, p_values[1:])):
    print(f'Coefficient {i + 1}: {coef}, p-value: {p_val}')

Coefficient 1: 0.12058949101201745, p-value: 0.0
Coefficient 2: -0.05043953651472963, p-value: 0.0
Coefficient 3: -0.10560077747302697, p-value: 0.2824783878663615
Coefficient 4: 0.009740478282329004, p-value: 2.6978419498391304e-13
Coefficient 5: -0.0502507210014269, p-value: 0.0
