In [45]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Read the training table and separate the regression target from the features.
target = 'SalePrice'
data = pd.read_csv('property_sale/train.csv')
labels = data.loc[:, target]
data = data.drop(target, axis=1)
data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,8,2007,WD,Normal
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,,0,2,2010,WD,Normal
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,0,,GdPrv,Shed,2500,5,2010,WD,Normal
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2010,WD,Normal


In [46]:
# --- Preprocessing: imputation, one-hot encoding, train/validation split, scaling ---

# Numeric columns deliberately left out of the model inputs.
dropped_columns = ['MSSubClass', 'OverallCond', 'BsmtUnfSF', '2ndFlrSF', 'LowQualFinSF', 'BsmtFullBath',
                   'BsmtHalfBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'WoodDeckSF', 'EnclosedPorch',
                   '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'YrSold', 'MoSold', 'BsmtFinSF1', 'BsmtFinSF2']
# Index label of the single training row that is removed before fitting.
outlier_index = 1298

numerical_features = data.select_dtypes(include=['int64', 'float64']).columns
categorical_features = data.select_dtypes(include=['object']).columns
# NOTE(review): the identifier columns `Unnamed: 0` and `Id` are not in the drop
# list, so they reach the model as numeric features - confirm this is intended.

# Fix the train/validation row positions first, so that the imputation medians
# below are learned from training rows only (no leakage from validation rows).
# The partition depends only on the number of rows, test_size and random_state,
# so it is the same one that splitting `data` directly would give.
train_pos, valid_pos = train_test_split(np.arange(len(data)), test_size=0.2, random_state=42)

# `filling` is reused later to impute the competition test set.
filling = data.iloc[train_pos][numerical_features].median()
data[numerical_features] = data[numerical_features].fillna(filling)
data[categorical_features] = data[categorical_features].fillna('Missing')

# One-hot encode the categorical features (first level of each column is the baseline).
data = pd.get_dummies(data, columns=categorical_features, drop_first=True)

data = data.drop(columns=dropped_columns)

X_train, X_test = data.iloc[train_pos], data.iloc[valid_pos]
y_train, y_test = labels.iloc[train_pos], labels.iloc[valid_pos]

# Non-mutating drops: the split results are slices of `data` / `labels`.
# Raises KeyError if the outlier row did not land in the training part.
X_train = X_train.drop(index=[outlier_index])
y_train = y_train.drop(index=[outlier_index])

# Scaling statistics come from the training part only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [47]:
# --- Ridge baseline trained on log1p(SalePrice) ---
# Single source of truth for the regularisation strength: used to build the
# model and echoed in the report, so the two can no longer drift apart.
alpha = 100

# Create and fit the Ridge model
model = Ridge(alpha=alpha)
# The model learns to predict the logarithm of the target
model.fit(X_train, np.log1p(y_train))
print(model.coef_)

# Predictions on the validation part (log scale)
y_pred = model.predict(X_test)
# Inverse of the log transform, back to the original target scale
y_pred_exp = np.expm1(y_pred)

# Evaluate with RMSE computed on the log-transformed values
log_rmse = np.sqrt(mean_squared_error(np.log1p(y_test), y_pred))
print("alpha:", alpha)
print("Baseline Model Performance:")
print(f"Root Mean Squared Error (Log RMSE): {log_rmse:.4f}")

[-4.20253912e-04  4.98432705e-03  1.70670576e-02  5.78320820e-02
  2.37516972e-02  3.48000629e-02  7.63576243e-03  3.40970729e-02
  2.57861499e-02  6.83220953e-02  6.33265587e-03  1.91436493e-02
  1.91904472e-02 -1.95387831e-03  1.39917575e-02  2.66609097e-02
  3.76738382e-03  9.11279122e-03  4.40646302e-03  1.33438115e-02
  2.32126248e-03  4.13970588e-04 -3.53674914e-03  3.93886856e-03
  5.61528674e-03  1.04989044e-03 -2.63938113e-03  6.36752664e-03
 -1.46518896e-03  6.55366919e-03 -4.18882914e-03  7.80732378e-03
 -7.63283471e-03 -3.65269526e-03 -5.55725448e-03  5.35381733e-04
 -5.19244236e-03 -2.91562831e-03 -8.61707622e-03  4.69121921e-03
  7.73926971e-03 -2.99040618e-03  2.29281166e-02 -1.07287324e-02
 -4.54049023e-03 -8.07791803e-03 -1.66986688e-02 -5.36444253e-03
 -4.47013975e-03  3.15840259e-03 -4.08342981e-03  1.08176827e-02
  1.70584730e-02 -1.06538138e-02  5.40581927e-03 -2.19341510e-03
 -1.85831604e-03  4.45262513e-03  1.89523918e-02  2.09218460e-03
  1.07867050e-02  2.53469

In [48]:
# Read the table that will be scored for the submission file.
test_data = pd.read_csv(filepath_or_buffer='property_sale/test.csv')
test_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [49]:
# Remove the numeric columns that are not used as model inputs.
test_data = test_data.drop(
    columns=[
        'MSSubClass', 'OverallCond', 'BsmtUnfSF', '2ndFlrSF', 'LowQualFinSF',
        'BsmtFullBath', 'BsmtHalfBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
        'WoodDeckSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
        'MiscVal', 'YrSold', 'MoSold', 'BsmtFinSF1', 'BsmtFinSF2',
    ]
)
test_data

Unnamed: 0,Id,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,GarageArea,GarageQual,GarageCond,PavedDrive,OpenPorchSF,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,1461,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,730.0,TA,TA,Y,0,,MnPrv,,WD,Normal
1,1462,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,312.0,TA,TA,Y,36,,,Gar2,WD,Normal
2,1463,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,482.0,TA,TA,Y,34,,MnPrv,,WD,Normal
3,1464,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,470.0,TA,TA,Y,36,,,,WD,Normal
4,1465,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,506.0,TA,TA,Y,82,,,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,Inside,...,0.0,,,Y,0,,,,WD,Normal
1455,2916,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,Inside,...,286.0,TA,TA,Y,24,,,,WD,Abnorml
1456,2917,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,Inside,...,576.0,TA,TA,Y,0,,,,WD,Abnorml
1457,2918,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,Inside,...,0.0,,,Y,32,,MnPrv,Shed,WD,Normal


In [50]:
# --- Score the competition test set and write the submission file ---

# Keep the identifiers before the frame is aligned to the model columns, so that
# building the submission does not depend on `Id` surviving that alignment.
test_ids = test_data['Id']

numerical_features_test = test_data.select_dtypes(include=['int64', 'float64']).columns
categorical_features_test = test_data.select_dtypes(include=['object']).columns

# Impute with `filling` (the medians computed during training preprocessing)
# instead of statistics recomputed from the test rows.
test_data[numerical_features_test] = test_data[numerical_features_test].fillna(filling)
test_data[categorical_features_test] = test_data[categorical_features_test].fillna('Missing')

# Encode every level here (no drop_first). Dropping the first level independently
# on this frame can remove a level that the training encoding kept whenever the
# set of levels differs between the two files; the alignment below would then
# fill that column with 0 and silently turn those rows into the baseline category.
test_data = pd.get_dummies(test_data, columns=categorical_features_test, drop_first=False)

# Align with the training columns: keeps exactly the columns the model was fit
# on (the training baseline levels and unseen levels are discarded, absent
# levels become all-zero columns).
test_data = test_data.reindex(columns=data.columns, fill_value=0)

# Use a dedicated name so the validation matrix `X_test` (paired with `y_test`)
# is not overwritten by the submission features.
X_submission = scaler.transform(test_data)

y_pred_test = model.predict(X_submission)
# The model predicts log1p(price); convert back to prices.
y_pred_test_exp = np.expm1(y_pred_test)

predictions = pd.DataFrame({"Id": test_ids, "SalePrice": y_pred_test_exp})
predictions.to_csv("test_predictions.csv", index=False)