In [None]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [None]:
# Read the training CSV into `df`; the cells below read from this frame.
df = pd.read_csv(filepath_or_buffer="../data/train.csv")

# Show the first five rows as a quick sanity check of the loaded columns.
df.head(n=5)


In [None]:
# Column dtypes and non-null counts (df.info() prints directly to stdout).
df.info()

# A notebook cell only renders its *last* bare expression, so the summary
# statistics must be shown explicitly or they are computed and thrown away.
# `display` is provided by the IPython/Jupyter kernel without an import.
display(df.describe())

# Missing-value count per column (rendered as the cell's output).
df.isnull().sum()


In [None]:
# Histogram of the target with a KDE overlay, drawn on an explicit Axes
# so the figure and its title are tied to one object.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['SalePrice'], kde=True, ax=ax)
ax.set_title("Sale Price Distribution")
plt.show()
    

In [None]:
# --- Preprocessing: encode, split, then impute/scale using TRAIN statistics only ---
#
# Imputation statistics must be learned from the training split alone.
# Filling NaNs with whole-dataset means before splitting leaks information
# from the held-out rows into the training features and makes the test
# metrics optimistic. Mean-filling the whole frame would also fabricate
# target values for any row whose SalePrice is missing, so such rows are
# dropped instead (a no-op when the target column is complete).
df = df.dropna(subset=["SalePrice"])

# One-hot encode the non-numeric columns. This only depends on the set of
# category labels, not on any fitted statistic. NOTE: after this cell `df`
# and `X` are encoded but still contain the original numeric NaNs; only the
# train/test matrices below are imputed.
df = pd.get_dummies(df, drop_first=True)

X = df.drop("SalePrice", axis=1)
y = df["SalePrice"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Column means learned from the training rows only, applied to both splits.
train_means = X_train.mean(numeric_only=True)
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

# Standardise features: fit on train, reuse the same transform on test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
# Build the three linear estimators up front, then fit each one on the
# same scaled training split (plain OLS, L2-penalised, L1-penalised).
lr = LinearRegression()
ridge = Ridge(alpha=1.0)
lasso = Lasso(alpha=0.001)

for estimator in (lr, ridge, lasso):
    estimator.fit(X_train, y_train)


In [None]:
# Label/estimator pairs to score on the held-out split.
models = [("Linear", lr), ("Ridge", ridge), ("Lasso", lasso)]

for label, estimator in models:
    y_hat = estimator.predict(X_test)

    # Compute all three metrics first, then report them together.
    mae = mean_absolute_error(y_test, y_hat)
    rmse = np.sqrt(mean_squared_error(y_test, y_hat))
    r2 = r2_score(y_test, y_hat)

    print(label)
    print("MAE:", mae)
    print("RMSE:", rmse)
    print("R2:", r2)
    print("---------")


In [None]:
# Persist the fitted Ridge model together with the scaler that was fitted
# on the training split, so inference can apply the same feature scaling.
# (`joblib` and `Path` are imported in the notebook's top import cell.)

# joblib.dump does not create missing parent directories; make sure the
# output folder exists so a fresh checkout does not fail here.
Path("../models").mkdir(parents=True, exist_ok=True)

joblib.dump(ridge, "../models/house_price_model.pkl")
joblib.dump(scaler, "../models/scaler.pkl")

"Model saved!"
