In [51]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.impute import SimpleImputer

In [52]:
df_train = pd.read_csv('../../data/processed/train.csv')

In [53]:
X = df_train.drop('SalePrice', axis=1)
y = df_train['SalePrice']

In [54]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [56]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

ValueError: could not convert string to float: 'RL'

In [39]:
ridge = RidgeCV()
lasso = LassoCV()

ridge.fit(X_train_scaled, y_train)
lasso.fit(X_train_scaled, y_train)

In [40]:
X_train_scaled

array([[-0.09671886,  0.        , -0.11043153, ..., -0.23133127,
        -0.34116331, -0.34043991],
       [-0.09671886,  0.        , -0.11043153, ..., -0.90415705,
         0.91306228, -0.60830931],
       [-0.09671886,  0.        , -0.11043153, ..., -0.62023411,
         0.91306228, -0.37258424],
       ...,
       [-0.09671886,  0.        , -0.11043153, ..., -0.5844455 ,
        -0.9682761 , -1.58335392],
       [-0.09671886,  0.        , -0.11043153, ..., -0.34108299,
         0.91306228,  0.74175246],
       [-0.09671886,  0.        , -0.11043153, ..., -0.15975405,
         0.28594948, -1.0101134 ]])

In [41]:
y_pred_ridge = ridge.predict(X_test_scaled)
y_pred_lasso = lasso.predict(X_test_scaled)

In [42]:
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)

print(f"Ridge MSE: {mse_ridge:.4f}")
print(f"Lasso MSE: {mse_lasso:.4f}")

Ridge MSE: 629744472.7741
Lasso MSE: 640812442.8614


In [43]:
df_test = pd.read_csv('../../data/processed/test.csv')
X_test_scaled

array([[-0.09671886,  0.        , -0.11043153, ..., -0.62500592,
         2.16728786, -0.10471484],
       [-0.09671886,  0.        , -0.11043153, ..., -1.72490906,
         0.28594948, -0.59759453],
       [-0.09671886,  0.        , -0.11043153, ...,  2.23092478,
         0.91306228,  4.74907865],
       ...,
       [-0.09671886,  0.        , -0.11043153, ...,  0.63236708,
         0.28594948, -0.24936431],
       [-0.09671886,  0.        , -0.11043153, ..., -0.58921732,
         0.28594948,  0.32923358],
       [-0.09671886,  0.        , -0.11043153, ..., -1.03776784,
        -0.34116331, -0.34043991]])

In [44]:
imputer = SimpleImputer(strategy='most_frequent')
df_test_imput = pd.DataFrame(imputer.fit_transform(df_test), columns=df_test.columns)
df_test_imput.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 58 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   cat__Neighborhood_Blmngtn  1459 non-null   float64
 1   cat__Neighborhood_Blueste  1459 non-null   float64
 2   cat__Neighborhood_BrDale   1459 non-null   float64
 3   cat__Neighborhood_BrkSide  1459 non-null   float64
 4   cat__Neighborhood_ClearCr  1459 non-null   float64
 5   cat__Neighborhood_CollgCr  1459 non-null   float64
 6   cat__Neighborhood_Crawfor  1459 non-null   float64
 7   cat__Neighborhood_Edwards  1459 non-null   float64
 8   cat__Neighborhood_Gilbert  1459 non-null   float64
 9   cat__Neighborhood_IDOTRR   1459 non-null   float64
 10  cat__Neighborhood_MeadowV  1459 non-null   float64
 11  cat__Neighborhood_Mitchel  1459 non-null   float64
 12  cat__Neighborhood_NAmes    1459 non-null   float64
 13  cat__Neighborhood_NPkVill  1459 non-null   float

In [46]:
df_test_imput.drop(columns=['cat__MSZoning_nan'], inplace=True)

In [47]:
y_pred_ridge = ridge.predict(df_test_imput)
y_pred_lasso = lasso.predict(df_test_imput)



In [49]:
df_ridge = pd.DataFrame({'Id': df_test['Id'], 'SalePrice': y_pred_ridge})
df_ridge.to_csv('submission_ridge.csv', index=False)

In [50]:
df_lasso = pd.DataFrame({'Id': df_test['Id'], 'SalePrice': y_pred_lasso})
df_lasso.to_csv('submission_lasso.csv', index=False)