In [1]:
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import joblib

# Read the training set into a DataFrame. The path is relative, so it is
# resolved against the kernel's current working directory.
train_csv_path = 'train.csv'
data = pd.read_csv(train_csv_path)


In [12]:
# Preview the first five rows (five is also head()'s default).
# NOTE(review): the recorded output already contains a TotalSF column, which
# the feature-engineering cell further down assigns. Together with the
# out-of-sequence execution count this suggests the cell was run after that
# one -- confirm with Restart Kernel -> Run All.
data.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,TotalSF
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,,,,0,2,2008,WD,Normal,208500,2566
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,,,,0,5,2007,WD,Normal,181500,2524
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,,,,0,9,2008,WD,Normal,223500,2706
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,,,,0,2,2006,WD,Abnorml,140000,2473
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,,,,0,12,2008,WD,Normal,250000,3343


In [2]:
# Feature engineering: TotalSF is the element-wise sum of the basement,
# first-floor and second-floor area columns. The column is written back onto
# `data` itself, so `data` is modified in place.
data['TotalSF'] = (
    data['TotalBsmtSF']
    .add(data['1stFlrSF'])
    .add(data['2ndFlrSF'])
)

# Predictor matrix (four numeric columns) and the regression target.
feature_columns = ['GrLivArea', 'BedroomAbvGr', 'FullBath', 'TotalSF']
target_column = 'SalePrice'

X = data[feature_columns]
y = data[target_column]

In [10]:
# Count missing values per feature column (isna is the same check as isnull).
X.isna().sum()

GrLivArea       0
BedroomAbvGr    0
FullBath        0
TotalSF         0
dtype: int64

In [11]:
# Count missing values in the target (isna is the same check as isnull).
y.isna().sum()

0

In [3]:

# Hold out 20% of the rows for evaluation; the fixed seed makes the shuffle,
# and therefore the split, repeatable across runs.
TEST_FRACTION = 0.2
SPLIT_SEED = 42

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [4]:
# Fit a Lasso (L1-penalised linear regression) on the training split.
# `alpha` is the strength of the L1 penalty.
# NOTE(review): the features are passed in unscaled and alpha is small
# relative to the magnitude of SalePrice, so the penalty may have little
# practical effect (and is unlikely to zero out any coefficient) -- confirm
# that this is intended, or consider standardising the inputs / tuning alpha.
lasso_alpha = 0.1

model = Lasso(alpha=lasso_alpha)
model.fit(X=X_train, y=y_train)

In [5]:
# Round-trip the fitted estimator through disk: write it out with joblib and
# immediately read it back, rebinding `model` to the reloaded object.
# joblib files are pickle-based, so only load files from a trusted source.
MODEL_PATH = 'lasso_regression_model.joblib'

joblib.dump(model, MODEL_PATH)
model = joblib.load(MODEL_PATH)

In [6]:
# Run the (reloaded) model on the held-out feature rows to get price estimates.
prices = model.predict(X=X_test)

In [7]:
# Print every predicted price for the test split.
# NOTE(review): this dumps the whole array; a slice such as prices[:10] would
# keep the notebook output short.
print("Predicted prices:", prices, sep=" ")

Predicted prices: [126982.51857905 300775.27842685 140129.57010568 198865.08023654
 246809.61520904 107006.51308631 185137.1002684  174140.22875401
 107006.51308631 140560.33979263 176160.29826274 100971.54056703
 123252.73146797 215409.79431551 196762.41068846 138506.59032998
 194773.7099327  148836.67649832 139041.81344794 207495.41633835
 216990.17246948 180433.05245979 169602.3640171  128872.5742066
 202441.81548112 190477.38204319 190265.48954741 119238.55808323
 189888.91123435 210949.66630536 112689.41367361 229548.05320824
 346728.36105063 131013.46667846 229187.46199766 136555.22270081
 202668.93501866 192942.78973091 247758.96792439  75203.77271125
 103994.56457204 232108.74895518 111667.34771543 284197.14675032
 116751.9673361  205226.04046796 125661.23549881 112336.37661288
 287958.81687578 154244.95118445 110854.40677701 239717.908896
 104271.53132096 347621.89938764 136176.17301542 242984.96666699
 193971.22524527 161631.05273627 129379.11530834  70740.57832806
  92287.87

In [8]:
# Print the true sale prices of the test split as a plain NumPy array, for
# side-by-side comparison with the predictions.
print("Actual prices:", y_test.to_numpy())

Actual prices: [154500 325000 115000 159000 315500  75500 311500 146000  84500 135500
 145000 130000  81000 214000 181000 134500 183500 135000 118400 226000
 155000 210000 173500 129000 192000 153900 181134 141000 181000 208900
 127000 284000 200500 135750 255000 140000 138000 219500 310000  97000
 114500 205000 119500 253293 128500 117500 115000 127000 451950 144000
 119000 196000 115000 287000 144500 260000 213000 175000 107000 107500
  68500 154000 317000 264132 283463 243000 109000 305000  93500 176000
 118858 134000 109008  93500 611657 173000 348000 341000 141000 124900
 118000  67000 113000  91300 149500 133000 266000 190000 155900 155835
 153500 152000 124500 301000 136500 169990 205000 183900 204900 260000
 163500 224900 244000 132000 194000 156500 156000 275000 145000 135000
  60000 124000 127000 137500 213500 119000 107900 123000 112000 284000
 133000 149000 169000 207000 175000 137000 236000  79500 144000 162900
 185900 369900 197900 104000  35311 337500 367294 130250 23000

In [9]:
# Coefficient of determination on the held-out data: true values first,
# predictions second.
test_r2 = r2_score(y_true=y_test, y_pred=prices)
print("R-squared score:", test_r2)

R-squared score: 0.7233280277960107
