In [138]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge, Lasso
from sklearn.impute import SimpleImputer

In [139]:
# Load the preprocessed training table.
train_csv_path = '../../data/processed/train.csv'
df_train = pd.read_csv(train_csv_path)

In [140]:
# Separate the target column from the feature columns.
target_col = 'remainder__SalePrice'
y = df_train[target_col]
X = df_train.drop(columns=[target_col])

In [141]:
# Summarize the feature frame: column names, non-null counts and dtypes.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1349 entries, 0 to 1348
Data columns (total 52 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   cat__Neighborhood_Blmngtn  1349 non-null   float64
 1   cat__Neighborhood_Blueste  1349 non-null   float64
 2   cat__Neighborhood_BrDale   1349 non-null   float64
 3   cat__Neighborhood_BrkSide  1349 non-null   float64
 4   cat__Neighborhood_ClearCr  1349 non-null   float64
 5   cat__Neighborhood_CollgCr  1349 non-null   float64
 6   cat__Neighborhood_Crawfor  1349 non-null   float64
 7   cat__Neighborhood_Edwards  1349 non-null   float64
 8   cat__Neighborhood_Gilbert  1349 non-null   float64
 9   cat__Neighborhood_IDOTRR   1349 non-null   float64
 10  cat__Neighborhood_MeadowV  1349 non-null   float64
 11  cat__Neighborhood_Mitchel  1349 non-null   float64
 12  cat__Neighborhood_NAmes    1349 non-null   float64
 13  cat__Neighborhood_NPkVill  1349 non-null   float

In [142]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [143]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits so the validation rows do not influence it.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [144]:
# L2-regularized linear model fitted on the scaled training features.
ridge = Ridge(alpha=1.0)
ridge.fit(X_train_scaled, y_train)

# L1-regularized linear model; max_iter is raised to 5000 for the solver.
lasso = Lasso(alpha=1, max_iter=5000)
lasso.fit(X_train_scaled, y_train)

In [145]:
# Display the scaled training matrix produced by the StandardScaler for a visual check.
X_train_scaled

array([[-0.09671886,  0.        , -0.11043153, ..., -1.04964589,
         0.20346708, -0.17231495],
       [-0.09671886,  0.        , -0.11043153, ...,  0.77120285,
         0.20346708, -0.17231495],
       [-0.09671886,  0.        , -0.11043153, ...,  0.77120285,
         1.51025865, -0.17231495],
       ...,
       [-0.09671886,  0.        , -0.11043153, ..., -1.04964589,
         0.20346708, -0.17231495],
       [-0.09671886,  0.        , -0.11043153, ...,  0.77120285,
         1.51025865, -0.17231495],
       [-0.09671886,  0.        , -0.11043153, ...,  0.77120285,
         0.20346708, -0.17231495]])

In [146]:
# Predict the held-out split with each fitted model (ridge first, then lasso).
y_pred_ridge, y_pred_lasso = (
    model.predict(X_test_scaled) for model in (ridge, lasso)
)

In [147]:
# Mean squared error of each model on the held-out split.
mse_ridge = mean_squared_error(y_true=y_test, y_pred=y_pred_ridge)
print(f"Ridge MSE: {mse_ridge:.4f}")

mse_lasso = mean_squared_error(y_true=y_test, y_pred=y_pred_lasso)
print(f"Lasso MSE: {mse_lasso:.4f}")

Ridge MSE: 682340793.0058
Lasso MSE: 681505579.4893


In [148]:
# Load the preprocessed submission (test) table.
test_csv_path = '../../data/processed/test.csv'
df_test = pd.read_csv(test_csv_path)

# Note: the value displayed here is the scaled validation split, not df_test.
X_test_scaled

array([[-0.09671886,  0.        , -0.11043153, ...,  0.77120285,
         1.51025865, -0.17231495],
       [-0.09671886,  0.        , -0.11043153, ...,  0.77120285,
         0.20346708, -0.17231495],
       [-0.09671886,  0.        , -0.11043153, ...,  0.77120285,
         0.20346708, -0.17231495],
       ...,
       [-0.09671886,  0.        , -0.11043153, ...,  0.77120285,
         0.20346708, -0.17231495],
       [-0.09671886,  0.        , -0.11043153, ...,  0.77120285,
         0.20346708, -0.17231495],
       [-0.09671886,  0.        , -0.11043153, ..., -1.04964589,
         0.20346708, -0.17231495]])

In [149]:
# Fill missing values in the submission table with each column's most frequent value.
# NOTE(review): the imputer is fitted on the test frame itself rather than on the
# training data - confirm this is intended.
imputer = SimpleImputer(strategy='most_frequent')
test_filled = imputer.fit_transform(df_test)

# fit_transform output is wrapped back into a frame carrying the original column labels.
df_test_imput = pd.DataFrame(test_filled, columns=df_test.columns)
df_test_imput.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 53 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   cat__Neighborhood_Blmngtn  1459 non-null   float64
 1   cat__Neighborhood_Blueste  1459 non-null   float64
 2   cat__Neighborhood_BrDale   1459 non-null   float64
 3   cat__Neighborhood_BrkSide  1459 non-null   float64
 4   cat__Neighborhood_ClearCr  1459 non-null   float64
 5   cat__Neighborhood_CollgCr  1459 non-null   float64
 6   cat__Neighborhood_Crawfor  1459 non-null   float64
 7   cat__Neighborhood_Edwards  1459 non-null   float64
 8   cat__Neighborhood_Gilbert  1459 non-null   float64
 9   cat__Neighborhood_IDOTRR   1459 non-null   float64
 10  cat__Neighborhood_MeadowV  1459 non-null   float64
 11  cat__Neighborhood_Mitchel  1459 non-null   float64
 12  cat__Neighborhood_NAmes    1459 non-null   float64
 13  cat__Neighborhood_NPkVill  1459 non-null   float

In [150]:
# Remove the 'cat__MSZoning_nan' indicator column from the imputed test frame.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it no longer raises KeyError once the
# column has already been removed.
df_test_imput = df_test_imput.drop(columns=['cat__MSZoning_nan'], errors='ignore')

In [151]:
# Ridge and Lasso were fitted on StandardScaler output, so the submission
# features must go through the same fitted scaler before predicting; feeding
# the raw, unscaled frame to the models yields predictions on the wrong scale.
# Selecting X.columns puts the features in the training column order and
# fails loudly (KeyError) if a training feature is missing from the test frame.
X_submit_scaled = scaler.transform(df_test_imput[X.columns])

# These names are reused by the submission cells below; from here on they hold
# predictions for the submission set, not for the hold-out split.
y_pred_ridge = ridge.predict(X_submit_scaled)
y_pred_lasso = lasso.predict(X_submit_scaled)



In [152]:
# Build the ridge submission: the Id column from the test table plus the predicted price.
df_ridge = (
    df_test[['remainder__Id']]
    .rename(columns={'remainder__Id': 'Id'})
    .assign(SalePrice=y_pred_ridge)
)
df_ridge.to_csv('submission_ridge.csv', index=False)

In [153]:
# Build the lasso submission: the Id column from the test table plus the predicted price.
df_lasso = (
    df_test[['remainder__Id']]
    .rename(columns={'remainder__Id': 'Id'})
    .assign(SalePrice=y_pred_lasso)
)
df_lasso.to_csv('submission_lasso.csv', index=False)