In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the cleaned house-sales data; the path is relative to the notebook's working directory.
sales_csv_path = 'data/cleaned_house_sales.csv'
df = pd.read_csv(sales_csv_path)

In [3]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Preprocess the raw frame: one-hot encode the categorical columns, mean-impute and
# standardise the numeric columns, then reassemble everything as `df_preprocessed`.
#
# NOTE(review): the imputer and scaler are fitted on the full dataset, i.e. before any
# train/test split, so their statistics include rows that are later used for testing.
# The behaviour is kept as-is here; consider moving these steps into a Pipeline that is
# fitted on the training split only.

# Numeric columns that are imputed and scaled (named once instead of repeated per step)
numeric_cols = ['area', 'months_listed', 'bedrooms']

# Encode categorical variables using one-hot encoding (first level of each is dropped)
df_encoded = pd.get_dummies(df, columns=['city', 'house_type'], drop_first=True)

# Handle missing values. SimpleImputer computes one mean per column, so a single fit
# over all numeric columns is equivalent to refitting it column by column.
imputer = SimpleImputer(strategy='mean')
df_encoded[numeric_cols] = imputer.fit_transform(df_encoded[numeric_cols])

# Scale numerical features. The scaler returns a bare array, so the frame is rebuilt
# with df_encoded's own index; otherwise the index-aligned concat below would pair up
# the wrong rows (and introduce NaNs) whenever `df` does not have the default 0..n-1 index.
scaler = StandardScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(df_encoded[numeric_cols]),
    columns=numeric_cols,
    index=df_encoded.index,
)

# Combine the encoded columns with the scaled numeric columns (scaled columns go last)
df_preprocessed = pd.concat([df_encoded.drop(columns=numeric_cols), df_scaled], axis=1)

# Display the preprocessed dataframe
print(df_preprocessed.head())

   house_id  sale_price   sale_date  city_Riverford  city_Silvertown  \
0   1217792       55943  2021-09-12           False             True   
1   1900913      384677  2021-01-17           False             True   
2   1174927      281707  2021-11-10            True            False   
3   1773666      373251  2020-04-13           False             True   
4   1258487      328885  2020-09-24           False             True   

   city_Teasdale  city_Unknown  house_type_Semi-detached  house_type_Terraced  \
0          False         False                      True                False   
1          False         False                     False                False   
2          False         False                     False                False   
3          False         False                     False                False   
4          False         False                     False                False   

       area  months_listed  bedrooms  
0 -1.673053      -0.259409 -1.428248  
1 

### Linear Regression

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Build date features, split the data, and fit/evaluate a linear-regression baseline.

# Replace 'sale_date' with numeric year/month/day columns. The guard makes the cell safe
# to re-run: once 'sale_date' has been replaced the step is skipped, instead of raising
# KeyError on the second execution.
if 'sale_date' in df_preprocessed.columns:
    sale_dates = pd.to_datetime(df_preprocessed['sale_date'])
    df_preprocessed['sale_year'] = sale_dates.dt.year
    df_preprocessed['sale_month'] = sale_dates.dt.month
    df_preprocessed['sale_day'] = sale_dates.dt.day
    df_preprocessed = df_preprocessed.drop(columns=['sale_date'])

# Split the data into train and test sets (80/20, fixed seed for a repeatable split).
# NOTE(review): 'house_id' is still part of the feature matrix; it looks like a row
# identifier rather than a predictor -- confirm and drop it here if so.
X_train, X_test, y_train, y_test = train_test_split(df_preprocessed.drop(columns=['sale_price']),
                                                    df_preprocessed['sale_price'], test_size=0.2, random_state=42)

# Define the pipeline; the scaler is refitted inside each CV fold on that fold's training part
pipeline = Pipeline([
    ('scaler', StandardScaler()),   # Standardize numerical features
    ('model', LinearRegression())   # Linear regression model
])

# Perform 5-fold cross-validation on the training set only
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
mean_mse = -cv_scores.mean()  # The scorer returns negated MSE, so flip the sign back

print('Mean squared error (Cross-Validation):', mean_mse)

# Fit the pipeline to the full training data
pipeline.fit(X_train, y_train)

# Make predictions on the held-out test set
predictions = pipeline.predict(X_test)

# Evaluate the model on the test set
mse_test = mean_squared_error(y_test, predictions)
print('Mean squared error (Test Set):', mse_test)

Mean squared error (Cross-Validation): 521901059.8037545
Mean squared error (Test Set): 496856809.7794371


### Random Forest Regressor

In [22]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Tune a random forest with an exhaustive grid search (5-fold CV, scored by negative MSE).

# Define the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),   # Standardize numerical features
    # Fixed seed: without it every run grows different trees, so the selected
    # parameters and the reported CV score are not reproducible.
    ('model', RandomForestRegressor(random_state=42))   # Random forest regression model
])

# Define the hyperparameter grid (3 * 4 * 3 * 3 * 2 = 216 combinations)
param_grid = {
    'model__n_estimators': [100, 200, 300],           # Number of trees in the forest
    'model__max_depth': [10, 20, 30, None],           # Maximum depth of the trees
    'model__min_samples_split': [2, 5, 10],           # Minimum number of samples required to split a node
    'model__min_samples_leaf': [1, 2, 4],             # Minimum number of samples required at each leaf node
    'model__bootstrap': [True, False]                 # Whether bootstrap samples are used when building trees
}

# Perform grid search cross-validation. 216 combinations x 5 folds = 1080 fits,
# so the candidates are evaluated in parallel on all available cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model and its performance
best_model = grid_search.best_estimator_
best_mse = -grid_search.best_score_   # The scorer returns negated MSE, so flip the sign back

print('Best Parameters:', grid_search.best_params_)
print('Mean squared error (Cross-Validation):', best_mse)

Best Parameters: {'model__bootstrap': True, 'model__max_depth': 20, 'model__min_samples_leaf': 2, 'model__min_samples_split': 10, 'model__n_estimators': 300}
Mean squared error (Cross-Validation): 245753725.7881964


### Gradient Boosting Regressor

In [23]:
from sklearn.ensemble import GradientBoostingRegressor

# Tune a gradient-boosting regressor with an exhaustive grid search (5-fold CV, negative MSE).

# Define the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),   # Standardize numerical features
    # Fixed seed so repeated runs select the same parameters and report the same score.
    ('model', GradientBoostingRegressor(random_state=42))   # Gradient boosting regression model
])

# Define the hyperparameter grid (3 * 4 * 3 * 3 * 3 = 324 combinations)
param_grid = {
    'model__n_estimators': [100, 200, 300],           # Number of boosting stages (trees added sequentially)
    'model__max_depth': [10, 20, 30, None],           # Maximum depth of each individual tree
    'model__min_samples_split': [2, 5, 10],           # Minimum number of samples required to split a node
    'model__min_samples_leaf': [1, 2, 4],             # Minimum number of samples required at each leaf node
    'model__learning_rate': [0.1, 0.01, 0.001]       # Learning rate
}

# Perform grid search cross-validation. 324 combinations x 5 folds = 1620 fits,
# so the candidates are evaluated in parallel on all available cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model and its performance
best_model = grid_search.best_estimator_
best_mse = -grid_search.best_score_   # The scorer returns negated MSE, so flip the sign back

print('Best Parameters:', grid_search.best_params_)
print('Mean squared error (Cross-Validation):', best_mse)

Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': 10, 'model__min_samples_leaf': 4, 'model__min_samples_split': 10, 'model__n_estimators': 100}
Mean squared error (Cross-Validation): 285956687.158507


### XGBoost Regressor

In [24]:
from xgboost import XGBRegressor

# Tune an XGBoost regressor with an exhaustive 5-fold grid search scored on negative MSE.

# Two-step pipeline: standardisation followed by the booster
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', XGBRegressor()),
])

# Candidate hyperparameters (3 * 4 * 3 = 36 combinations)
# NOTE(review): for XGBRegressor, max_depth=None presumably falls back to the library's
# default depth rather than meaning "unlimited" as it does for scikit-learn trees --
# confirm against the XGBoost parameter documentation.
param_grid = {
    'model__n_estimators': [100, 200, 300],        # number of boosting rounds
    'model__max_depth': [10, 20, 30, None],        # depth limit per tree
    'model__learning_rate': [0.1, 0.01, 0.001],    # shrinkage applied to each round
}

# Run the search on the training split only
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train, y_train)

# The scorer reports negated MSE, so the sign is flipped for reporting
best_mse = -grid_search.best_score_
best_model = grid_search.best_estimator_

print('Best Parameters:', grid_search.best_params_)
print('Mean squared error (Cross-Validation):', best_mse)

Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': None, 'model__n_estimators': 100}
Mean squared error (Cross-Validation): 279458378.94457513
