In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load the data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Concatenate train and test data for preprocessing.
# - 'SalePrice' is the target, so it is removed from the training features.
# - 'Id' is only a row identifier (it is used as the key of the submission
#   file), so it is removed from the feature matrix; otherwise the models
#   would be fitted on an arbitrary row number. It is dropped after the
#   concat so this works whether or not train.csv carries the column.
#   train_data / test_data themselves are left untouched.
# - ignore_index=True gives the stacked frame unique row labels instead of
#   repeating each input frame's own labels.
full_data = pd.concat(
    [train_data.drop(columns=['SalePrice']), test_data],
    axis=0,
    ignore_index=True,
).drop(columns=['Id'])

# Data exploration and cleaning
# Count the NaN cells in every column, then report only the columns that
# actually contain at least one.
missing_values = full_data.isna().sum()
has_missing = missing_values > 0
print("Missing Values:\n", missing_values[has_missing])

# Impute missing values
# The fill values are computed from the training rows only (the first
# train_data.shape[0] rows of full_data), so that no statistic of the test
# set leaks into the preprocessing applied to the training features.
train_rows = full_data.iloc[:train_data.shape[0]]

# Fill numerical missing values with the training median
numerical_cols = full_data.select_dtypes(include=np.number).columns
numeric_fill = train_rows[numerical_cols].median()
# A column with no observed training value has a NaN median; fall back to the
# median over all rows so such a column is still filled.
numeric_fill = numeric_fill.fillna(full_data[numerical_cols].median())
full_data[numerical_cols] = full_data[numerical_cols].fillna(numeric_fill)

# Fill categorical missing values with the training mode
# NOTE(review): for columns that are almost entirely missing, NaN presumably
# means "feature absent" rather than "unknown"; mode-filling would then be
# misleading -- confirm against the data dictionary.
categorical_cols = full_data.select_dtypes(include='object').columns
if len(categorical_cols) > 0:
    # mode() can return several rows on ties; row 0 holds the first mode of
    # each column.
    categorical_fill = train_rows[categorical_cols].mode().iloc[0]
    # Same fallback as above for columns with no observed training value.
    categorical_fill = categorical_fill.fillna(full_data[categorical_cols].mode().iloc[0])
    full_data[categorical_cols] = full_data[categorical_cols].fillna(categorical_fill)

# Feature engineering
# Expand every categorical column into indicator (dummy) columns
full_data = pd.get_dummies(data=full_data)

# Undo the earlier concatenation by position: the training rows were stacked
# first, so the first n_train rows are the training features and the
# remainder are the test features.
n_train = len(train_data)
y_train = train_data['SalePrice']
X_train = full_data.iloc[:n_train]
X_test = full_data.iloc[n_train:]

# Standardize the features
# The scaler learns its per-column statistics from the training features
# only; the same fitted transform is then applied to both feature sets.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model building and evaluation
# Candidate regressors, keyed by the display name used when reporting results.
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    # Lasso is fitted by coordinate descent. With the default iteration
    # budget the solver emitted a warning on every cross-validation fold
    # (presumably ConvergenceWarning), so the budget is raised to give it
    # room to converge.
    "Lasso Regression": Lasso(max_iter=10000),
    # Fixed seed so the forest is reproducible between runs.
    "Random Forest": RandomForestRegressor(random_state=42)
}

# Running winner of the comparison; starting from infinity means the first
# model with a finite score takes the lead.
best_model_name = None
best_rmse = float('inf')

# Score every candidate with 5-fold cross-validation
for name, model in models.items():
    # The scorer returns negated MSE (so that larger is better); flip the
    # sign before taking the root to obtain one RMSE per fold.
    neg_mse_per_fold = cross_val_score(
        model,
        X_train_scaled,
        y_train,
        cv=5,
        scoring='neg_mean_squared_error',
    )
    rmse_scores = np.sqrt(-neg_mse_per_fold)
    mean_rmse, std_rmse = np.mean(rmse_scores), np.std(rmse_scores)
    print(f"{name} RMSE: {mean_rmse} +/- {std_rmse}")

    # Strict comparison: on a tie the earlier model keeps the lead
    if mean_rmse < best_rmse:
        best_model_name, best_rmse = name, mean_rmse

print(f"\nBest Model: {best_model_name} with RMSE: {best_rmse}")

# Refit the cross-validation winner on the complete training set.
# fit() returns the estimator itself, so the lookup and the fit are chained.
best_model = models[best_model_name].fit(X_train_scaled, y_train)

# Predict sale prices for the (scaled) test features
test_predictions = best_model.predict(X_test_scaled)

# Write the predictions to a CSV file: one row per test record, keyed by its
# Id, without the DataFrame index.
submission = test_data[['Id']].assign(SalePrice=test_predictions)
submission.to_csv('submission.csv', index=False)



Missing Values:
 MSZoning           4
LotFrontage      486
Alley           2721
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType      1766
MasVnrArea        23
BsmtQual          81
BsmtCond          82
BsmtExposure      82
BsmtFinType1      79
BsmtFinSF1         1
BsmtFinType2      80
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
Electrical         1
BsmtFullBath       2
BsmtHalfBath       2
KitchenQual        1
Functional         2
FireplaceQu     1420
GarageType       157
GarageYrBlt      159
GarageFinish     159
GarageCars         1
GarageArea         1
GarageQual       159
GarageCond       159
PoolQC          2909
Fence           2348
MiscFeature     2814
SaleType           1
dtype: int64
Linear Regression RMSE: 1.2035345322132053e+17 +/- 8.67461613382299e+16
Ridge Regression RMSE: 34241.50418725384 +/- 8160.428207952073


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Lasso Regression RMSE: 34493.830672389966 +/- 8656.11798777553
Random Forest RMSE: 29588.266401649198 +/- 3657.8518231102307

Best Model: Random Forest with RMSE: 29588.266401649198
