# Trying Different Models

## 🛠️ Imports

In [2]:
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVR
from xgboost import XGBRegressor

## 📂 Load Dataset

In [3]:
# Load the housing CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='../data/housing_cleaned.csv')

## 🎯 Define Features and Target, Perform Train-Test Split

In [5]:
# The 'price' column is the regression target; all remaining columns are features.
y = df['price']
X = df.drop(columns='price')

# Hold out 20% of the rows for testing, stratified on the 'city' column,
# with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=X['city'],
)

## ⚙️ Encode Features Using ColumnTransformer

In [6]:
# Column groups; each group is routed to its own transformer below.
categorical = ['city']
binary = ['was_renovated']
# Every float64/int64 feature except the binary flag is treated as numerical.
numerical = list(
    X.select_dtypes(include=['float64', 'int64']).columns.drop(binary)
)

# Scale numerical columns, one-hot encode the categorical column (dropping the
# first level of each feature), and pass the binary flag through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical),
        ('cat', OneHotEncoder(drop='first'), categorical),
        ('bin', 'passthrough', binary),
    ]
)

## 🤖 Train Models

In [7]:
# Candidate regressors, keyed by the label shown in the results table.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
    'Support Vector Machine': SVR(),
    'k-Nearest Neighbors': KNeighborsRegressor(),
}

# Collects one record (model label plus train/test RMSE) per fitted model.
results = []

for model_name, model in models.items():
    # Chain the shared preprocessor with the current estimator and fit on the
    # training split; the preprocessor is refit as part of each pipeline fit.
    pipeline = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('model', model),
        ]
    )
    pipeline.fit(X_train, y_train)

    # Predict on both splits so train and test error can be compared.
    y_pred_train = pipeline.predict(X_train)
    y_pred_test = pipeline.predict(X_test)

    train_rmse = root_mean_squared_error(y_train, y_pred_train)
    test_rmse = root_mean_squared_error(y_test, y_pred_test)

    row = {
        'Model': model_name,
        'Train RMSE': train_rmse,
        'Test RMSE': test_rmse,
    }
    results.append(row)

# Build the comparison table once from the accumulated records.
results_df = pd.DataFrame(data=results)

results_df

Unnamed: 0,Model,Train RMSE,Test RMSE
0,Linear Regression,208844.726508,197167.777538
1,Random Forest,68120.380641,175523.39202
2,XGBoost,100873.660769,170416.144392
3,Support Vector Machine,380435.578294,365782.006885
4,k-Nearest Neighbors,162638.299886,197457.453076
