One-Hot Encoding + lasso回归

lasso回归假设数据的所有特征都是数值型的，并且在回归中可以表现为线性关系。因此，对于分类变量或非线性关系，你可能需要采取其他的建模策略。

虽然你已经将property_type和borough这两个分类变量标签编码了，但是它们并不一定适合直接用于回归分析，因为标签编码引入了一个可能不存在的顺序关系。例如，如果你的borough变量包含三个值：'A', 'B', 'C'，标签编码可能会将它们转换为1，2，3。这意味着在进行回归分析时，模型会认为'B'比'A'大，'C'比'B'大，而实际上这可能并不是我们想要的。

一种更常见的处理分类变量的方法是使用独热编码（One-Hot Encoding）。独热编码会为每个类别创建一个新的虚拟变量，取值为1或0。例如，对于上述的borough变量，独热编码会创建三个新的变量：borough_A, borough_B, borough_C。如果一个样本的borough为'A'，那么borough_A为1，其他两个变量为0。

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score

# 读取CSV文件
data = pd.read_csv(r"D:\s-casa毕业论文-工程\R5R_TEST1_london\Test2_onmypoi_data\mergeResult\housing_hasId_accessibility.csv")

# 选择特征和因变量列
features = ['bedrooms', 'bathrooms', 'size_square_meters', 
            'AccommodationAccess15m', 'AirportsAccess15m',
            'Culture and tourismAccess15m', 'Eating and drinkingAccess15m',
            'EducationAccess15m', 'EntertainmentAccess15m', 'HealthAccess15m',
            'Infrastructure and facilitiesAccess15m',
            'Manufacturing and productionAccess15m', 'OfficeAccess15m',
            'Public transportAccess15m', 'Road and railAccess15m',
            'ServiceAccess15m', 'ShoppingAccess15m', 'Water transportAccess15m']
target = 'Price_per_square_meter'

# 提取自变量和因变量
X = data[features]
y = data[target]

# 将property_type和borough列进行one hot编码
columns_to_encode = ['property_type', 'borough']
encoder = OneHotEncoder(drop='first', sparse=False)
X_encoded = pd.DataFrame(encoder.fit_transform(data[columns_to_encode]))

# 将编码的列加入特征变量
X = pd.concat([X, X_encoded], axis=1)

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)




In [4]:
X_encoded

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,46,47,48,49,50,51,52,53,54,55
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3969,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3970,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3971,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3972,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score

# Read CSV file
data = pd.read_csv(r"D:\s-casa毕业论文-工程\R5R_TEST1_london\Test2_onmypoi_data\mergeResult\housing_hasId_accessibility.csv")

# Select features and target column
features = ['bedrooms', 'bathrooms', 'size_square_meters', 
            'AccommodationAccess15m', 'AirportsAccess15m',
            'Culture and tourismAccess15m', 'Eating and drinkingAccess15m',
            'EducationAccess15m', 'EntertainmentAccess15m', 'HealthAccess15m',
            'Infrastructure and facilitiesAccess15m',
            'Manufacturing and productionAccess15m', 'OfficeAccess15m',
            'Public transportAccess15m', 'Road and railAccess15m',
            'ServiceAccess15m', 'ShoppingAccess15m', 'Water transportAccess15m']
target = 'Price_per_square_meter'

# Extract features and target
X = data[features]
y = data[target]

# Convert property_type and borough columns to string type
X['property_type'] = X['property_type'].astype(str)
X['borough'] = X['borough'].astype(str)

# One hot encode property_type and borough columns
columns_to_encode = ['property_type', 'borough']
encoder = OneHotEncoder(drop='first', sparse=False)
X_encoded = pd.DataFrame(encoder.fit_transform(X[columns_to_encode]))

# Reset column names for the encoded DataFrame
X_encoded.columns = encoder.get_feature_names(columns_to_encode)

# Concatenate the encoded columns with the feature variables
X = pd.concat([X, X_encoded], axis=1)

# Drop the original non-encoded columns
X.drop(columns=columns_to_encode, inplace=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and fit the Lasso regression model
alpha = 0.1  # Lasso regression regularization parameter
lasso_model = Lasso(alpha=alpha)
lasso_model.fit(X_train, y_train)

# Predict house prices on the test set
y_pred = lasso_model.predict(X_test)

# Evaluate the model performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)


KeyError: 'property_type'