In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from scripts.pipeline import Pipeline

In [2]:
# Read the raw train and test CSVs.
train_data = pd.read_csv('../data/train.csv')
test_data = pd.read_csv('../data/test.csv')

# 'SalePrice' is the response; every other train column is a predictor.
y = train_data['SalePrice']
n_train = len(train_data)  # number of leading rows that belong to the train set

# Stack the train predictors on top of the test rows so that both sets are
# passed through Pipeline in a single call.
data = pd.concat([train_data.drop(columns=['SalePrice']), test_data], axis=0)
data = Pipeline(data)

# Cut the processed frame back into its train / test parts by position.
# NOTE(review): this assumes Pipeline preserves row count and row order — confirm.
X = data[:n_train]
test = data[n_train:]
# Shift the test index up by one.
# NOTE(review): the reason for the +1 offset is not visible here — confirm.
test.index = test.index + 1

# Hold out 20% of the train rows for validation; fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
# try a few models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

# Seed shared by every stochastic estimator below so that a fresh
# Restart & Run All prints the same scores. Matches the seed used for the
# train/test split.
RANDOM_STATE = 42

# Candidate regressors, all with default hyper-parameters.
# LinearRegression and SVR take no seed here; every other estimator is
# seeded explicitly.
models = [
    LinearRegression(),
    RandomForestRegressor(random_state=RANDOM_STATE),
    SVR(),
    GradientBoostingRegressor(random_state=RANDOM_STATE),
    AdaBoostRegressor(random_state=RANDOM_STATE),
    BaggingRegressor(random_state=RANDOM_STATE),
    ExtraTreesRegressor(random_state=RANDOM_STATE),
    HistGradientBoostingRegressor(random_state=RANDOM_STATE),
]

# Fit each candidate on the training split and report MSE and R^2 on the
# held-out split.
for model in models:
    model_name = model.__class__.__name__
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f'{model_name} MSE: {mean_squared_error(y_test, y_pred)}')
    print(f'{model_name} R^2: {model.score(X_test, y_test)}')


LinearRegression MSE: 1178244504.008496
LinearRegression R^2: 0.8463892574862752
