# **UK HOUSE PRICES PREDICTION**

**Import Libraries**

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor, Lasso, Ridge
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.metrics import r2_score, mean_squared_error
from typing import Union
import joblib
import time
#!pip install category_encoders
import category_encoders as ce

**Defining Functions**

Creating all functions which will be used in our machine learning workflow to
train our models, perform exploratory data analysis, save our model, and perform
visualizations on the UK housing dataset.

In [3]:
def build_regressor_model(regressor,
                          x_train: pd.DataFrame,
                          y_train: pd.DataFrame,
                          x_test: pd.DataFrame,
                          y_test: pd.DataFrame,
                          kfold: int = 10):
    """
    Fit a regressor, evaluate it on the training and test data, cross validate
    it on the training data and plot actual against predicted test values.

    Parameters
    ----------
    regressor :
        Estimator object exposing ``fit`` and ``predict``. It is fitted in
        place on the training data.
    x_train, y_train :
        Features and target used to fit the model.
    x_test, y_test :
        Features and target used to evaluate the fitted model.
    kfold : int, optional
        Value passed as ``cv`` to ``cross_validate``. The default is 10.

    Returns
    -------
    dict
        Nested dictionary with the keys "Model", "Predictions",
        "Training Evaluation", "Test Evaluation" and "Cross Validation".
        The cross validation mean and standard deviation are expressed as
        percentages rounded to two decimals.

    """
    # Model Training
    model = regressor.fit(x_train, y_train)

    # Model Prediction
    y_pred = model.predict(x_train) # Training Predictions: Check OverFitting
    y_pred1 = model.predict(x_test) # Test Predictions: Check Model Predictive Capacity

    # Model Evaluation
    # Training Evaluation: Check OverFitting
    training_rsquared = r2_score(y_train, y_pred)
    training_rmse = np.sqrt(mean_squared_error(y_train, y_pred))

    # Test Evaluations: Check Model Predictive Capacity
    test_rsquared = r2_score(y_test, y_pred1)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred1))

    # Validation of Predictions
    # Cross validation is run once. The summary statistics are derived from the
    # same fold scores that are returned under "Validation Models", so the two
    # can never disagree (running cross_val_score separately refitted every
    # fold a second time and could differ for stochastic estimators).
    cross_validation = cross_validate(model,
                                      x_train,
                                      y_train,
                                      cv = kfold,
                                      return_estimator = True,
                                      return_train_score = True)
    cross_val = cross_validation["test_score"]
    score_mean = round((cross_val.mean() * 100), 2)
    score_std_dev = round((cross_val.std() * 100), 2)

    # Visualization
    # Visualising the actual testing data and predicted values
    lowest_actual, highest_actual = np.min(y_test), np.max(y_test)
    plt.figure(figsize=(15, 10))
    plt.grid(True)
    plt.scatter(y_test, y_pred1, color='blue', alpha=0.5, label = f"Test RMSE: {test_rmse:.2f}")
    # Diagonal reference line: points on it are predicted exactly
    plt.plot([lowest_actual, highest_actual],
             [lowest_actual, highest_actual],
             "--c",
             label = f"Test R-Squared: {test_rsquared:.4f}")
    plt.title(f'Analyzing the Actual values against the Predicted Values - {regressor.__class__.__name__}')
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.legend()
    plt.show()

    return {
        "Model": model,
        "Predictions": {
            "Actual Training Y": y_train,
            "Actual Test Y": y_test,
            "Predicted Training Y": y_pred,
            "Predicted Test Y": y_pred1
            },
        "Training Evaluation": {
            "Training R2": training_rsquared,
            "Training RMSE": training_rmse
            },
        "Test Evaluation": {
            "Test R2": test_rsquared,
            "Test RMSE": test_rmse
            },
        "Cross Validation": {
            "Cross Validation Mean": score_mean,
            "Cross Validation Standard Deviation": score_std_dev,
            "Validation Models": cross_validation
            }
        }

In [None]:
def build_multiple_regressors(regressors: Union[list, tuple],
                              x_train: pd.DataFrame,
                              y_train: pd.DataFrame,
                              x_test: pd.DataFrame,
                              y_test: pd.DataFrame,
                              kfold: int = 10):
    """
    Build, evaluate and cross validate several regressors on the same data.

    Each regressor is passed to ``build_regressor_model``. Its headline metrics
    are collected into one dataframe and its per-fold cross validation metrics
    into another. Both dataframes are handed to ``save_dataframe``.

    Parameters
    ----------
    regressors : list or tuple
        Estimator objects to train. Results are keyed by class name, so two
        regressors of the same class overwrite each other in the returned
        dictionary.
    x_train, y_train, x_test, y_test :
        Training and test data forwarded to ``build_regressor_model``.
    kfold : int, optional
        Number of cross validation folds. The default is 10.

    Returns
    -------
    tuple
        ``(algorithm_metrics, cross_validation_metrics, models)`` where the
        first two are dataframes and the last maps the class name of each
        regressor to the dictionary returned by ``build_regressor_model``.

    """
    cross_val_columns = ["Algorithm",
                         "Fit time",
                         "Score time",
                         "Test score",
                         "Train score"]
    multiple_regressor_models = {} # General store for all metrics from each algorithm
    store_algorithm_metrics = [] # One row of headline metrics per algorithm
    cross_val_frames = [] # One dataframe of per-fold metrics per algorithm

    # Loop through each regressor in regressors and do the following
    for algorithms in regressors:
        algorithm_name = algorithms.__class__.__name__

        # Call the function build_regressor_model to get regressor metrics
        print(f"Building regressor model and metrics for {algorithm_name} model.")
        results = build_regressor_model(regressor = algorithms,
                                        x_train = x_train,
                                        y_train = y_train,
                                        x_test = x_test,
                                        y_test = y_test,
                                        kfold = kfold)
        multiple_regressor_models[algorithm_name] = results

        training = results["Training Evaluation"]
        test = results["Test Evaluation"]
        cross_validation = results["Cross Validation"]
        validation_models = cross_validation["Validation Models"]

        # Storing all individual algorithm metrics from each iteration
        store_algorithm_metrics.append([algorithm_name,
                                        training["Training R2"],
                                        training["Training RMSE"],
                                        test["Test R2"],
                                        test["Test RMSE"],
                                        cross_validation["Cross Validation Mean"],
                                        cross_validation["Cross Validation Standard Deviation"]])

        # Storing all individual cross validation metrics from each iteration.
        # The scalar algorithm name is broadcast across the fold rows.
        cross_val_frames.append(pd.DataFrame({"Algorithm": algorithm_name,
                                              "Fit time": validation_models["fit_time"],
                                              "Score time": validation_models["score_time"],
                                              "Test score": validation_models["test_score"],
                                              "Train score": validation_models["train_score"]}))
        print("Model building completed.\n")

    # Concatenate once, after the loop. Growing a frame by concatenating onto an
    # empty one on every iteration is quadratic and relies on pandas' deprecated
    # handling of empty entries when it resolves the result dtypes.
    if cross_val_frames:
        dataframe = pd.concat(cross_val_frames)
    else:
        dataframe = pd.DataFrame(columns = cross_val_columns)

    # Creating dataframe for algorithm metric
    df = pd.DataFrame(store_algorithm_metrics, columns = ["Algorithm",
                                                          "Training R2",
                                                          "Training RMSE",
                                                          "Test R2",
                                                          "Test RMSE",
                                                          "CV Mean",
                                                          "CV Standard Deviation"])
    # Save datasets in folder for analysis
    save_dataframe(dataset = dataframe, name = "Cross_Validation_Evaluation")
    save_dataframe(dataset = df, name = "Algorithm_Evaluation")

    return (df, dataframe, multiple_regressor_models)

In [None]:
def eda(dataset: pd.DataFrame, graphs: bool = False) -> dict:
    """
    Perform exploratory data analysis on the dataset.

    Parameters
    ----------
    dataset : pd.DataFrame
        Dataset to perform EDA.
    graphs : bool, optional
        Choose to display exploratory data analysis visuals. The default is False.

    Returns
    -------
    dict
        A dictionary containing different evaluation metrics for exploring the
        columns and understanding how values in the dataset are distributed.

    """
    dataset.info()
    data_head = dataset.head()
    data_tail = dataset.tail()
    data_mode = dataset.mode().iloc[0]
    data_descriptive_stats = dataset.describe()
    try:
        data_more_descriptive_stats = dataset.describe(include = "all",
                                                       datetime_is_numeric=True)
    except TypeError:
        # pandas 2.0 removed ``datetime_is_numeric``; datetime columns are
        # always summarised numerically there, so the plain call is equivalent.
        data_more_descriptive_stats = dataset.describe(include = "all")
    data_correlation_matrix = dataset.corr(numeric_only = True)
    data_distinct_count = dataset.nunique()
    data_count_duplicates = dataset.duplicated().sum()
    data_count_null = dataset.isnull().sum()
    data_total_null = data_count_null.sum()
    # Unique values of every column
    data_unique = {each_column: dataset[each_column].unique()
                   for each_column in dataset.columns}
    # How many rows fall into each category of every object-typed column
    data_category_count = {each_column: dataset[each_column].value_counts()
                           for each_column in dataset.select_dtypes(object).columns}

    if graphs:
        # Visuals
        dataset.hist(figsize = (25, 20), bins = 10)
        plt.figure(figsize = (15, 10))
        sns.heatmap(data_correlation_matrix, annot = True, cmap = 'coolwarm')
        plt.show()
        # pairplot creates its own figure, so no plt.figure() call is made here
        # (one made beforehand would only render as an extra empty figure).
        sns.pairplot(dataset) # Graph of correlation across each numerical feature
        plt.show()

    result = {"data_head": data_head,
              "data_tail": data_tail,
              "data_mode": data_mode,
              "data_descriptive_stats": data_descriptive_stats,
              "data_more_descriptive_stats": data_more_descriptive_stats,
              "data_correlation_matrix": data_correlation_matrix,
              "data_distinct_count": data_distinct_count,
              "data_count_duplicates": data_count_duplicates,
              "data_count_null": data_count_null,
              "data_total_null": data_total_null,
              "data_unique": data_unique,
              "data_category_count": data_category_count,
              }
    return result

In [None]:
def save_dataframe(dataset: pd.DataFrame,
                   name: str,
                   directory: str = "../../Datasets/generated_data/section1"):
    """
    Save the data as a dated CSV file in the generated_data folder.

    The file is written to ``{directory}/{name}_{YYYY-MM-DD}.csv`` without the
    index. A failure to write is reported with a message instead of raising,
    so a missing folder does not abort the calling workflow.

    Parameters
    ----------
    dataset : pd.DataFrame
        Dataset containing the information we want to save. For this project,
        it could be a dataframe of algorithm metrics or cross validation metrics.
    name: str
        A string indicating the name of the dataset and how it should be saved.
    directory : str, optional
        Folder the file is written to. The default is the section1 folder
        inside generated_data.

    Returns
    -------
    None.

    """
    date = time.strftime("%Y-%m-%d")
    try:
        dataset.to_csv(f"{directory}/{name}_{date}.csv", index = False)
        print("\nSuccessfully saved file to the specified folder ---> generated_data folder.")
    except OSError:
        # FileNotFoundError is a subclass of OSError. Recent pandas versions
        # raise a plain OSError when the target directory does not exist, which
        # the narrower ``except FileNotFoundError`` did not catch.
        print("\nFailed to save file to the specified folder ---> generated_data folder.")


In [None]:
def save_model_from_cross_validation(models_info: dict, algorithm: str, index: int):
    """
    Save one of the estimators fitted during cross validation to the models folder.

    Parameters
    ----------
    models_info : dict
        Dictionary keyed by algorithm name whose values follow the structure
        returned by ``build_regressor_model``.
    algorithm : str
        Key of the algorithm in ``models_info``.
    index : int
        Zero-based position of the cross validation estimator to save.

    Returns
    -------
    The estimator that was saved.

    """
    import os  # local import: this file has no module-level ``os`` import

    model_to_save = models_info[algorithm]["Cross Validation"]["Validation Models"]["estimator"][index]

    # Using Joblib to save the model in our folder
    # Create the folder first; joblib.dump fails if it is missing.
    os.makedirs("models", exist_ok = True)
    joblib.dump(model_to_save, f"models/{algorithm}_Model_{index}.pkl")
    print(f"\nThis model is gotten from cross validating with the {algorithm} algorithm at iteration {index + 1}.")
    return model_to_save


**QUESTIONS**

You are required to carry out the following tasks:

a. Identify and address any issues in the dataset. Then conduct exploratory data
analysis on the dataset (10 marks)

b. Analyse the relationships between price and the other features. What are your
conclusions? (10 marks)

c. Analyse and visualise the impact of Brexit on house prices. What are your
conclusions? (10 marks)

d. Build a predictive model to estimate house prices. You need to show how to:

        • Use appropriate methods to select relevant features.
        • Split the dataset into training and testing sets.
        • Train at least 3 different prediction models.
        • Evaluate the models with the performance, and report in terms of R2 and
        RMSE.
        • Visualise the actual testing data and predicted values.
        • State your conclusions on the models.
        • Save your best model.


BEFORE ANY SOLUTION, WE WOULD NEED TO IMPORT OUR DATASET

In [None]:
# Load the raw UK housing transactions from the coursework CSV file
raw_csv_path = "../../Datasets/Coursework Datasets/UK_Housing_Data.csv"
dataset = pd.read_csv(raw_csv_path)

print(dataset)

**SOLUTION A**

Perform exploratory data analysis to gain insight on the dataset and possible issues that need to be handled.

In [None]:
# Run the EDA helper on the raw data, with its plots switched on
initial_eda = eda(dataset = dataset, graphs = True)

**Data cleaning and Transformation**
    # --- Issues:
1) We have 1828855 missing values initially in our data. After doing some initial data preprocessing, we are left with 23741 missing values in our data to handle.
  * One possible fix is using the SimpleImputer from sklearn.impute or KNNImputer from sklearn.impute.
  * Another fix could be to drop missing rows.

2) TID (Transaction Identifier) has curly braces around each value.
  * Remove them using .replace()

3) We have some irrelevant columns for the house prices analysis. Some of the
irrelevant columns we have include:

    a) Unnamed: 0
    b) TID
    c) SAON
    d) PAON
    e) Record Status

The features SAON and PAON are considered irrelevant as the house number doesn't help us in predicting the price of a house, neither does the flat number. The SAON feature also has over a million rows missing in the dataset.

The feature TID (Transaction ID) is irrelevant for this analysis given it is just an identifier for the transactions.

The feature Record Status has one unique value (A), therefore, this won't be useful for gaining any insights given the absolute value is always (A) meaning variance is zero.
  * Drop irrelevant columns using the .drop() pandas command.

4) TDate (Transaction Date) is given of type object.
  * This is false and needs to be replaced with type datetime64.
    
5) Drop the LOCALITY column. More than half of the data is missing in that column.
    
6) Remove duplicate rows created after the above preprocessing steps. 2409 duplicated rows were created that need to be dropped.
  * Using the .drop_duplicates command solves this problem.

7) The TDate column needs to be processed to extract date features such as year, month, and day, for our model creation.
    
8) Drop categorical columns that aren't correlated with Price to avoid creating a complex model and introducing noise into it, which would further reduce our model's predictive power.

9) Handling the categorical features for prediction. This process has a huge influence on the predictive power of our model, hence it is very crucial.
  * One possible solution will be to use LabelEncoder from sci-kit learn or TargetEncoder from category_encoders.

In [None]:
# TID has curly braces --- Remove Them
# Series.replace() with a dict only replaces cells whose WHOLE value equals a
# key, so it never removed braces that surround an identifier. Stripping the
# characters from both ends of each string does.
dataset["TID"] = dataset["TID"].str.strip("{}")
# Drop irrelevant columns
dataset = dataset.drop(["TID",
                        "Unnamed: 0",
                        "SAON",
                        "PAON",
                        "RecordStatus"], axis = 1)

# Drop locality columns due to more than half of the data in the column missing
dataset = dataset.drop(["Locality"], axis = 1)

# Storing clean dataset for visualization
# NOTE: this is an alias, not a copy. The TDate conversion below is therefore
# visible through `data` too, while the later duplicate/NaN removal is not
# (those steps rebind `dataset` to new objects).
data = dataset

# Transaction date to datetime64
dataset["TDate"] = pd.to_datetime(dataset["TDate"])

# Drop duplicates
dataset = dataset.drop_duplicates()

# Fix missing values
dataset = dataset.dropna()

Next, we create some hierarchy in our data given we have time sensitive information using the TDate column

In [None]:
# Order the transactions chronologically, oldest first
dataset = dataset.sort_values(by = "TDate")

print(dataset)

We then perform some feature engineering to extract time related features from the TDate column

In [None]:
# Derive the calendar year, month and day of each transaction from TDate
transaction_date = dataset["TDate"].dt
dataset = dataset.assign(Year = transaction_date.year,
                         Month = transaction_date.month,
                         Day = transaction_date.day)

print(dataset)

**Further Data Preparation and Segregation**

Next, we perform some further data preparation and segregation in this section. These includes:

    * Selecting dependent and independent variables
    * Encoding categorical features
    * Feature selection
    * Splitting the data into training and testing dataset


Selecting dependent and independent variables

In [None]:
# Dependent variable: the sale price. Independent variables: every other column.
y = dataset["Price"]
X = dataset.drop(columns = ["Price"])

print(X)
print("\n\n")
print(y)

Encoding categorical features

In [None]:
# Target-encode the features using the sale price `y` as the target.
# NOTE(review): the encoder is fitted on every row here, i.e. before the
# train/test split performed later in the notebook. The encoded values for
# rows that end up in the test set were therefore computed using those rows'
# own prices (target leakage), which presumably inflates the reported test
# scores. Consider fitting the encoder on the training rows only -- TODO confirm.
encoder = ce.TargetEncoder(smoothing = 50, min_samples_leaf = 20)
X = encoder.fit_transform(X, y)

print(X)

**SOLUTION B**

Finding the correlation between the encoded features and price. Checks for linear relationship between the features and price. For checking correlation, we use Pearson Product Moments.

We will find the correlation between these features before feature selection.

In [None]:
# Pearson correlation between the encoded features and price.
# X still contains the datetime column TDate at this point, so the correlation
# is restricted to numeric columns explicitly: recent pandas versions no longer
# drop non-numeric columns silently and raise instead.
correlation_between_features = pd.concat([X, y], axis = 1).corr(numeric_only = True)

print(correlation_between_features)

**CONCLUSIONS**

From our analysis of the correlation matrix, insights show a positive linear relationship between postcode and price, as well as street and price. All other features have either no correlation with price or a relatively low correlation with price.

Including these uncorrelated columns into our model can introduce noise and lead to a complex model. The features postcode and street can be used to implement linear models with high accuracy given the approximate 0.97 and 0.67 positive relationship with price respectively.

These two features can successfully capture the linear variations in price measured by r-squared.

**Exploratory Data Analysis**

Conducting EDA to get a view of our dataset after data cleaning and transformation to validate steps and approach before model building.

In [None]:
# EDA (summary statistics only; the plots stay switched off by default)
# NOTE(review): the frame analysed here is the correlation matrix computed
# above, not the cleaned dataset itself -- confirm this is the intended input.
data_eda = eda(dataset = correlation_between_features)

**SOLUTION C**

Analysing the impact of Brexit on UK house prices involves 4 approaches. Hence, we break our analysis into 4 sections, then a final conclusion.

    --- SECTION 1: Analyzing yearly average house prices during UK Brexit transition

In [None]:
# Brexit periods (inclusive year ranges) with their plot colours and legend labels
info = ["Before Referendum", "During Negotiations for Brexit", "Brexit Begins"]
colors = ['blue', 'red', 'green']
segment_ranges = [(2010, 2016), (2016, 2020), (2020, 2023)]

# Average sale price per calendar year
year_data = (
    dataset.loc[:, ["Price", "Year"]]
    .groupby("Year")
    .mean(numeric_only = True)
    .reset_index()
)

print(year_data)

In [None]:
# Visualization
plt.figure(figsize = (35, 10))
plt.title("Analyzing Yearly Average Prices in UK")

# Faint dashed line through the whole series as a backdrop
plt.plot(year_data["Year"], year_data["Price"], "--g", marker = 'o', markersize = 10, alpha = 0.5)

# Vertical markers for the two key Brexit dates
for event_year, event_label in ((2016, "Referendum for Brexit"),
                                (2020, "UK leaves European Union(EU)")):
    plt.axvline(x = event_year, color='cyan', linestyle='--', linewidth=2, alpha = 0.9, label = event_label)

# One coloured line per Brexit period; both period bounds are inclusive
for (start, end), segment_color, segment_label in zip(segment_ranges, colors, info):
    segment_data = year_data[year_data["Year"].between(start, end)]
    plt.plot(segment_data["Year"], segment_data["Price"], marker='o', markersize=10, color=segment_color, label = segment_label)

plt.xlabel("Year", labelpad = 20)
plt.ylabel("Average Price", labelpad = 20)
plt.legend()
plt.show()

    --- SECTION 2: Analyzing monthly average house prices during UK Brexit transition

In [None]:
# Average of the numeric columns per calendar month.
# Grouping on the numeric Year and Month columns keeps the rows in
# chronological order. Grouping on the "YYYY-M" text label sorted them
# lexicographically instead (e.g. "2010-10" before "2010-2"). Working on a
# derived frame also avoids adding a "Year-Month" column to `dataset` itself.
month_data = (
    dataset.drop(["TDate", "Day"], axis = 1)
    .groupby(["Year", "Month"])
    .mean(numeric_only = True)
    .reset_index()
)
# Text label used on the x axis, e.g. "2016-1"
month_data.insert(0,
                  "Year-Month",
                  month_data["Year"].astype(str) + "-" + month_data["Month"].astype(str))
month_data = month_data.drop(["Year", "Month"], axis = 1)

segment_ranges1 = [("2010-1", "2016-1"), ("2016-1", "2020-2"), ("2020-2", "2023-4")]
colors = ['blue', 'red', 'green']

print(month_data)

In [None]:
# Visualization
def _month_index(label: str) -> int:
    """Convert a "YYYY-M" label into a month count that sorts chronologically."""
    year, month = label.split("-")
    return int(year) * 12 + int(month)

# Order and filter the months numerically. Comparing the text labels directly
# is lexicographic, which misplaces months 10-12 (e.g. "2020-10" <= "2020-2").
monthly_sorted = (
    month_data
    .assign(_month_index = month_data["Year-Month"].map(_month_index))
    .sort_values("_month_index")
)

plt.figure(figsize = (45, 10))
plt.title("Analyzing Monthly Average Prices in UK")
plt.plot(monthly_sorted["Year-Month"], monthly_sorted["Price"], "--g", marker = 'o', markersize = 5, alpha = 0.5)
plt.axvline(x = "2016-1", color='cyan', linestyle='--', linewidth=2, alpha = 0.9, label = "Referendum for Brexit")
plt.axvline(x = "2020-2", color='cyan', linestyle='--', linewidth=2, alpha = 0.9, label = "UK leaves European Union(EU)")
for i, (start, end) in enumerate(segment_ranges1):
    # Both period bounds are inclusive
    in_segment = monthly_sorted["_month_index"].between(_month_index(start), _month_index(end))
    segment_data1 = monthly_sorted[in_segment]
    plt.plot(segment_data1["Year-Month"], segment_data1["Price"], marker='o', markersize=10, color=colors[i], label = info[i])
plt.xticks(rotation = 90, ha ='right')
plt.xlabel("Months", labelpad = 20)
plt.ylabel("Average Price", labelpad = 20)
plt.legend()
plt.show()

    --- SECTION 3: Analyzing maximum house prices before referendum for brexit, during the transition and after brexit

In [None]:
# Highest yearly average price within each Brexit period.
# The first figure is rounded to 2 decimals, the other two to whole pounds.
year_column = year_data["Year"]
price_column = year_data["Price"]

before_referendum_brexit = round(price_column[year_column <= 2016].max(), 2)
during_negotiations_brexit = round(price_column[(year_column > 2016) & (year_column <= 2020)].max())
after_brexit = round(price_column[year_column > 2020].max())

X_max = ["Before Referendum", "During Negotiations for Brexit", "Brexit Begins"]
y_max = [before_referendum_brexit, during_negotiations_brexit, after_brexit]

for period_label, period_value in zip(X_max, y_max):
    print(f"{period_label} -", period_value)

In [None]:
# Bar chart of the highest yearly average price per Brexit period
plt.figure(figsize = (15, 10))
container = plt.bar(x = X_max, height = y_max, width = 0.5, color = colors, alpha = 0.5)
# Print each value above its bar
plt.bar_label(container, labels = y_max, padding = 10)
plt.title("Analyzing Maximum Yearly Average House Prices in UK during Brexit Transition")
plt.ylabel("Maximum House Price", labelpad = 20)
plt.xlabel("Brexit", labelpad = 20)
plt.show()

    --- SECTION 4: Analyzing house prices before referendum for brexit, during the transition and after brexit

In [None]:
# Lowest yearly average price within each Brexit period.
# The first figure is rounded to 2 decimals, the other two to whole pounds.
year_column = year_data["Year"]
price_column = year_data["Price"]

before_referendum_brexit_min = round(price_column[year_column <= 2016].min(), 2)
during_negotiations_brexit_min = round(price_column[(year_column > 2016) & (year_column <= 2020)].min())
after_brexit_min = round(price_column[year_column > 2020].min())

X_min = ["Before Referendum", "During Negotiations for Brexit", "Brexit Begins"]
y_min = [before_referendum_brexit_min, during_negotiations_brexit_min, after_brexit_min]

for period_label, period_value in zip(X_min, y_min):
    print(f"{period_label} -", period_value)

In [None]:
# Bar chart of the lowest yearly average price per Brexit period
plt.figure(figsize = (15, 10))
container_min = plt.bar(x = X_min, height = y_min, width = 0.5, color = colors, alpha = 0.5)
# Print each value above its bar
plt.bar_label(container_min, labels = y_min, padding = 10)
plt.title("Analyzing Minimum Yearly Average House Prices in UK during Brexit Transition")
plt.ylabel("Minimum House Price", labelpad = 20)
plt.xlabel("Brexit", labelpad = 20)
plt.show()

    --- CONCLUSION

Brexit was a turning point in the history of the United Kingdom. It marked the day the United Kingdom broke away from the European Union. On the **23rd of June, 2016**, a referendum was held to decide the position of the United Kingdom in relation to Brexit. The transition and withdrawal from the European Union was one that went on for years. On the **31st of January, 2020**, the Brexit agreements were finalised and came into force.

We attempt understanding the impact of Brexit on house prices in the United Kingdom. This
relationship is analysed using 4 approaches:

    - Average yearly house prices in the United Kingdom
    - Average monthly house prices in the United Kingdom
    - Average maximum yearly house prices in the United Kingdom
    - Average minimum yearly house prices in the United Kingdom
    
The average yearly house prices in the UK gives an overall idea and glance into how house prices fluctuate in 3 periods, these are:

    - Before Brexit
    - During Negotiations and Transition
    - After Brexit.
    
Across the periods we analysed, we notice a trend of prices generally rising each year. Two points are worth noticing. The first is 2016, the year negotiations for Brexit began: we see a spike in average house prices between then and 2017. This period recorded the highest shift in yearly average prices in the UK, going from **309,720** in **2016** to **346,982** in **2017**. The second is the peak yearly average house price in the UK, which occurred in **2022**, the figure amounting to **393,173**.

The average monthly house prices in the UK give a clearer picture of the fluctuations in house prices before Brexit, during negotiations, and after Brexit. The key takeaway from this analysis is the stability in the prices across different years before Brexit compared to during negotiations and after Brexit. We see a slight increase in the average house prices in the years before Brexit; however, this increase has little oscillation across different months. During negotiations and the transition towards Brexit, we start seeing strong oscillation in the average monthly house prices. The stability which we could see in our graph before Brexit dwindles. The period after Brexit sees the yearly average house prices in the UK reach their peak while witnessing stronger oscillation of the prices. These fluctuations could be due to uncertainty, interest rates, affected trade relations, and other factors that the United Kingdom has had to deal with after Brexit.

In our bar chart, we analyse the maximum and minimum yearly average house prices in the United Kingdom.
Our analysis focuses on the 3 periods previously specified. We see the following results:

    - Average maximum yearly house prices in the United Kingdom:
        * Before Brexit - 309,720
        * During Transition - 365,597
        * After Brexit - 393,173

    - Average minimum yearly house prices in the United Kingdom:
        * Before Brexit - 231,631
        * During Transition - 346,982
        * After Brexit - 352,649
From our analysis we see the periods before Brexit have the minimum average yearly house prices in the United Kingdom, while the period after Brexit has the maximum average yearly house prices in the United Kingdom. The difference in the average maximum and minimum yearly house prices in the United Kingdom between the period before and after Brexit illustrates the consistent rise in house prices and the big impact Brexit played in the house price increase.

**NOTE:** All figures are in **POUNDS** which is the currency associated with the United Kingdom

**SOLUTION D**

Dropping the date column

In [None]:
# Remove the raw transaction date column from the feature matrix X
X = X.drop(columns = "TDate")

Feature selection

In [None]:
# Feature selection - keep the 2 features with the highest f_regression score
selector = SelectKBest(score_func = f_regression, k = 2)
selector.fit(X, y)
X = selector.transform(X)

print(X)

In [None]:
# Pair every input feature name with the score SelectKBest computed for it
feature_importance = dict(zip(selector.feature_names_in_, selector.scores_))

print(feature_importance)

Splitting data into training and testing dataset

In [None]:
# Hold out the last 20% of the rows as the test set.
# shuffle = False keeps the existing row order when splitting.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 0,
    shuffle = False,
)

Scaling the features using StandardScaler. Technique employed is standardization.

In [None]:
# Standardise the features: the scaler is fitted on the training data only
# and then applied to both the training and the test data
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

**Training at least 3 predictive models**

      --- Linear Regression
      --- KNN Regressor
      --- Stochastic Gradient Descent Regressor
      --- Ridge Regression
      --- Lasso Regression

**Model Building**

This is the model training, prediction and evaluation phase. This will be done using the function "build_multiple_regressors", which creates a dataframe and stores all information during the model building phase. Finally, we save our preferred model, then provide some conclusions on our results.

In [None]:
# Candidate models. SGDRegressor shuffles the training data between epochs, so
# it is given a fixed random_state to make its results reproducible from one
# run of the notebook to the next.
regressors = [LinearRegression(),
              KNeighborsRegressor(),
              SGDRegressor(loss='squared_error',
                            penalty='l2',
                            alpha=0.0001,
                            fit_intercept=True,
                            max_iter=1000,
                            tol=1e-3,
                            learning_rate='optimal',
                            early_stopping=False,
                            validation_fraction=0.1,
                            n_iter_no_change=5,
                            random_state=0),
              Ridge(alpha = 3.0),
              Lasso(alpha = 3.0)]

In [None]:
# Train, evaluate and cross validate every candidate model (default of 10 folds)
algorithm_metrics, cross_validation_metrics, model_info = build_multiple_regressors(
    regressors = regressors,
    x_train = X_train,
    y_train = y_train,
    x_test = X_test,
    y_test = y_test,
)

print(algorithm_metrics)
print("\n\n")
print(cross_validation_metrics)

    --- Conclusion

Using the following algorithms, we create a predictive model for house prices in the United Kingdom.
    - Linear Regression
    - KNN Regression
    - Stochastic Gradient Descent Regression
    - Ridge Regression
    - Lasso Regression
We evaluate these algorithms with following metrics
    - Root Mean Squared Error (RMSE)
    - R-Squared (Coefficient of Determination)
    - Cross Validation Mean
    - Cross Validation Standard Deviation
    - Fit Time
    - Score Time
    
From our model building phase, Linear, Lasso, and Ridge regression perform best in training, testing, and have 
the best cross validation mean score of 92.49. In our analysis, we note the huge variation in the predictions of
the Stochastic Gradient Descent Regression as in it produces a test r-squared of -86.003 while having a cross validation
mean of 89.84. Therefore, it is not a reliable model. The KNN Regressor has the lowest R-Squared and Cross Validation mean
of 0.76 and 86.11 respectively. 

The Linear, Lasso, and Ridge Regression models have a Training R-Squared of 0.93 and a test R-Squared of 0.95 indicating 
how well it can generalize. When analysing their cross validation statistics, we see the best algorithm is the Lasso
regression as it records the lowest fit time and score time indicating how fast it is, while recording the highest
cross validation mean for test evaluation of 0.967182. This is slightly higher than the Ridge and Linear regression which
had similar cross validation test score of 0.967181. Regardless, Lasso regression remains our best model because it remains
the model with the best fit time and score time as well.

The model with the worst fit to our data with a test cross validation score of -124.536 is the Stochastic Gradient Descent
algorithm.

In [None]:
# Saving the best model: the Lasso estimator fitted on cross validation fold 10
# (index 9 is zero-based)
best_algorithm = "Lasso"
best_fold_index = 9
save_model = save_model_from_cross_validation(models_info = model_info,
                                              algorithm = best_algorithm,
                                              index = best_fold_index)