In [109]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge, Lasso
from sklearn.impute import SimpleImputer

In [110]:
df_train = pd.read_csv('../../data/processed/train.csv')

In [111]:
X = df_train.drop('remainder__SalePrice', axis=1)
y = df_train['remainder__SalePrice']

In [112]:
X

Unnamed: 0,cat__Neighborhood_Blmngtn,cat__Neighborhood_Blueste,cat__Neighborhood_BrDale,cat__Neighborhood_BrkSide,cat__Neighborhood_ClearCr,cat__Neighborhood_CollgCr,cat__Neighborhood_Crawfor,cat__Neighborhood_Edwards,cat__Neighborhood_Gilbert,cat__Neighborhood_IDOTRR,...,remainder__KitchenQual,remainder__GarageQual,remainder__BsmtQual,remainder__YearRemodAdd,remainder__GarageYrBlt,remainder__Fireplaces,remainder__BsmtFullBath,remainder__FullBath,remainder__BedroomAbvGr,remainder__KitchenAbvGr
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,4,3,4,2003,2003.0,0,1,2,3,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3,3,4,1976,1976.0,1,0,2,3,1
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,4,3,4,2002,2001.0,1,1,2,3,1
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,4,3,3,1970,1998.0,1,1,1,3,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4,3,4,2000,2000.0,1,1,2,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1344,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,3,3,4,2000,1999.0,1,0,2,3,1
1345,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3,3,4,1988,1978.0,2,1,2,3,1
1346,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,4,3,3,2006,1941.0,2,0,2,4,1
1347,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4,3,3,1996,1950.0,0,1,1,2,1


In [113]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [114]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [115]:
ridge = Ridge(alpha=1.0)
lasso = Lasso(alpha=1, max_iter=5000)

ridge.fit(X_train_scaled, y_train)
lasso.fit(X_train_scaled, y_train)

In [116]:
X_train_scaled

array([[-0.09671886,  0.        , -0.11043153, ..., -1.04964589,
         0.20346708, -0.17231495],
       [-0.09671886,  0.        , -0.11043153, ...,  0.77120285,
         0.20346708, -0.17231495],
       [-0.09671886,  0.        , -0.11043153, ...,  0.77120285,
         1.51025865, -0.17231495],
       ...,
       [-0.09671886,  0.        , -0.11043153, ..., -1.04964589,
         0.20346708, -0.17231495],
       [-0.09671886,  0.        , -0.11043153, ...,  0.77120285,
         1.51025865, -0.17231495],
       [-0.09671886,  0.        , -0.11043153, ...,  0.77120285,
         0.20346708, -0.17231495]])

In [117]:
y_pred_ridge = ridge.predict(X_test_scaled)
y_pred_lasso = lasso.predict(X_test_scaled)

In [118]:
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)

print(f"Ridge MSE: {mse_ridge:.4f}")
print(f"Lasso MSE: {mse_lasso:.4f}")

Ridge MSE: 682340793.0058
Lasso MSE: 681505579.4893


In [119]:
df_test = pd.read_csv('../../data/processed/test.csv')
X_test_scaled

array([[-0.09671886,  0.        , -0.11043153, ...,  0.77120285,
         1.51025865, -0.17231495],
       [-0.09671886,  0.        , -0.11043153, ...,  0.77120285,
         0.20346708, -0.17231495],
       [-0.09671886,  0.        , -0.11043153, ...,  0.77120285,
         0.20346708, -0.17231495],
       ...,
       [-0.09671886,  0.        , -0.11043153, ...,  0.77120285,
         0.20346708, -0.17231495],
       [-0.09671886,  0.        , -0.11043153, ...,  0.77120285,
         0.20346708, -0.17231495],
       [-0.09671886,  0.        , -0.11043153, ..., -1.04964589,
         0.20346708, -0.17231495]])

In [120]:
imputer = SimpleImputer(strategy='most_frequent')
df_test_imput = pd.DataFrame(imputer.fit_transform(df_test), columns=df_test.columns)
df_test_imput.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 53 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   cat__Neighborhood_Blmngtn  1459 non-null   float64
 1   cat__Neighborhood_Blueste  1459 non-null   float64
 2   cat__Neighborhood_BrDale   1459 non-null   float64
 3   cat__Neighborhood_BrkSide  1459 non-null   float64
 4   cat__Neighborhood_ClearCr  1459 non-null   float64
 5   cat__Neighborhood_CollgCr  1459 non-null   float64
 6   cat__Neighborhood_Crawfor  1459 non-null   float64
 7   cat__Neighborhood_Edwards  1459 non-null   float64
 8   cat__Neighborhood_Gilbert  1459 non-null   float64
 9   cat__Neighborhood_IDOTRR   1459 non-null   float64
 10  cat__Neighborhood_MeadowV  1459 non-null   float64
 11  cat__Neighborhood_Mitchel  1459 non-null   float64
 12  cat__Neighborhood_NAmes    1459 non-null   float64
 13  cat__Neighborhood_NPkVill  1459 non-null   float

In [121]:
y_pred_ridge = ridge.predict(df_test_imput)
y_pred_lasso = lasso.predict(df_test_imput)



ValueError: X has 53 features, but Ridge is expecting 52 features as input.

In [63]:
df_ridge = pd.DataFrame({'Id': df_test['remainder__Id'], 'SalePrice': y_pred_ridge})
df_ridge.to_csv('submission_ridge.csv', index=False)

In [64]:
df_lasso = pd.DataFrame({'Id': df_test['remainder__Id'], 'SalePrice': y_pred_lasso})
df_lasso.to_csv('submission_lasso.csv', index=False)