## Part 1: Loading and Cleaning Data

In [25]:
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv('forecast_history.csv')

# Clean the 'Westpac: 4 year forecast', 'Joe Bloggs: 2 year forecast', and 'Harry Spent: 5 year forecast' columns
def clean_percentage_column(column):
    return pd.to_numeric(column.str.replace('%', '').replace({'I5': '15', 'NaN': None}), errors='coerce')

df['Westpac: 4 year forecast'] = clean_percentage_column(df['Westpac: 4 year forecast'])
df['Joe Bloggs: 2 year forecast'] = clean_percentage_column(df['Joe Bloggs: 2 year forecast'])
df['Harry Spent: 5 year forecast'] = clean_percentage_column(df['Harry Spent: 5 year forecast'])

# Clean the 'Median house price' column
df['Median house price'] = pd.to_numeric(df['Median house price'], errors='coerce')
df.dropna(subset=['Median house price'], inplace=True)

# Rename the 'Unnamed: 0' column to 'Year'
df.rename(columns={'Unnamed: 0': 'Year'}, inplace=True)

df.head()

Unnamed: 0,Year,Median house price,Westpac: 4 year forecast,Joe Bloggs: 2 year forecast,Harry Spent: 5 year forecast
0,2011,340000.0,56.0,23.0,-20.0
1,2012,370000.0,53.0,,-80.0
2,2013,350000.0,,19.0,-70.0
3,2014,420000.0,13.0,42.0,-80.0
4,2015,425000.0,33.0,23.0,-50.0


## Part 2: Building the Baseline Model

In [26]:
from sklearn.linear_model import LinearRegression

# Prepare data for the baseline model
X = np.array(df['Year']).reshape(-1, 1)
y = df['Median house price']

# Create and fit a baseline forecast model (linear regression)
baseline_model = LinearRegression()
baseline_model.fit(X, y)

# Get the baseline predictions
baseline_predictions = baseline_model.predict(X)

# Display the baseline predictions
baseline_predictions

array([394373.62637363, 407285.71428571, 420197.8021978 , 433109.89010989,
       446021.97802198, 458934.06593407, 471846.15384615, 484758.24175824,
       497670.32967033, 510582.41758242, 523494.50549451, 536406.59340659,
       549318.68131868])

## Part 3: Calculating Forecast Errors

In [27]:
# Calculate forecast errors for each forecaster
df['Westpac Error'] = abs(df['Median house price'] * (1 + df['Westpac: 4 year forecast'] / 100) - df['Median house price'])
df['Joe Bloggs Error'] = abs(df['Median house price'] * (1 + df['Joe Bloggs: 2 year forecast'] / 100) - df['Median house price'])
df['Harry Spent Error'] = abs(df['Median house price'] * (1 + df['Harry Spent: 5 year forecast'] / 100) - df['Median house price'])
df['Baseline Error'] = abs(df['Median house price'] - baseline_predictions)

# Display the forecast errors
df[['Year', 'Westpac Error', 'Joe Bloggs Error', 'Harry Spent Error', 'Baseline Error']]

Unnamed: 0,Year,Westpac Error,Joe Bloggs Error,Harry Spent Error,Baseline Error
0,2011,190400.0,78200.0,68000.0,54373.626374
1,2012,196100.0,,296000.0,37285.714286
2,2013,,66500.0,245000.0,70197.802198
3,2014,54600.0,176400.0,336000.0,13109.89011
4,2015,140250.0,97750.0,212500.0,21021.978022
5,2016,25000.0,75000.0,450000.0,41065.934066
6,2017,234000.0,7800000.0,156000.0,48153.846154
7,2018,187000.0,99000.0,,65241.758242
8,2019,202640.0,113240.0,655600.0,98329.67033
9,2020,122000.0,140300.0,549000.0,99417.582418


## Part 4: Performing Statistical Tests

In [28]:
from scipy import stats

# Perform t-tests to compare forecast errors to the baseline model
forecasters = ['Westpac Error', 'Joe Bloggs Error', 'Harry Spent Error']
p_values = {}

for forecaster in forecasters:
    p_values[forecaster] = stats.ttest_ind(df[forecaster], df['Baseline Error'], nan_policy='omit').pvalue

# Display p-values from statistical tests
p_values

{'Westpac Error': 0.4653053613709093,
 'Joe Bloggs Error': 0.3184321185798018,
 'Harry Spent Error': 0.0017853613704437695}

## Part 5: Simulating Market Conditions

In [29]:
# Simulate market conditions by applying random percentage changes
np.random.seed(42)  # For reproducibility
df['Simulated Market'] = df['Median house price'] * (1 + np.random.uniform(-0.1, 0.1, size=len(df)))

# Display simulated market conditions
df[['Year', 'Simulated Market']]

Unnamed: 0,Year,Simulated Market
0,2011,331468.728082
1,2012,403352.858674
2,2013,366239.575927
3,2014,428287.312673
4,2015,395761.584438
5,2016,465599.452034
6,2017,474040.695665
7,2018,590279.376035
8,2019,608052.9094
9,2020,635384.854491


## Part 6: Final Display of Results

In [30]:
# Display forecast errors compared to the baseline model
print("Forecast errors compared to the baseline model:")
print(df[['Year', 'Westpac Error', 'Joe Bloggs Error', 'Harry Spent Error', 'Baseline Error']])

# Display p-values of statistical tests
print("\nStatistical test results (p-values):")
for forecaster, p_value in p_values.items():
    print(f"{forecaster} vs Baseline: {p_value}")

# Display simulated market conditions
print("\nSimulated market conditions and the impact on median house prices:")
print(df[['Year', 'Simulated Market']])

Forecast errors compared to the baseline model:
    Year  Westpac Error  Joe Bloggs Error  Harry Spent Error  Baseline Error
0   2011       190400.0           78200.0            68000.0    54373.626374
1   2012       196100.0               NaN           296000.0    37285.714286
2   2013            NaN           66500.0           245000.0    70197.802198
3   2014        54600.0          176400.0           336000.0    13109.890110
4   2015       140250.0           97750.0           212500.0    21021.978022
5   2016        25000.0           75000.0           450000.0    41065.934066
6   2017       234000.0         7800000.0           156000.0    48153.846154
7   2018       187000.0           99000.0                NaN    65241.758242
8   2019       202640.0          113240.0           655600.0    98329.670330
9   2020       122000.0          140300.0           549000.0    99417.582418
10  2021       132000.0           85800.0           396000.0   136505.494505
11  2022       108000.0     