In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Load datasets
# Paths are relative to the working directory the script is run from.
liveability = pd.read_csv("../data/predictor/Melbourne Liveability Indicators.csv")
rental_affordability = pd.read_csv("../data/response/rental_affordability.csv")

# Merge datasets on the year
# Join the two frames loaded above on their shared 'year' column. pd.merge
# defaults to an inner join, so only years present in both files are kept.
# NOTE(review): the analysis below reads the 'percentage' and 'rai' columns
# from the merged frame -- confirm the raw CSVs expose 'year', 'percentage'
# and 'rai' directly, or add the required cleaning step before this merge.
merged_df = pd.merge(liveability, rental_affordability, on='year')

# Exploratory Data Analysis (EDA)
# Line Charts
# Both series are drawn against year on a single shared axis.
fig, ax = plt.subplots(figsize=(14, 7))
for column, series_label in (
    ('percentage', 'Liveability Score'),
    ('rai', 'Rental Affordability Index'),
):
    ax.plot(merged_df['year'], merged_df[column], label=series_label, marker='o')
ax.set(
    title='Liveability Score and Rental Affordability Index Over Years',
    xlabel='Year',
    ylabel='Values',
)
ax.legend()
ax.grid(True)
plt.show()

# Scatter Plot
# One point per merged row: liveability score (x) against affordability index (y).
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=merged_df, x='percentage', y='rai', ax=ax)
ax.set(
    title='Scatter Plot of Liveability Score vs Rental Affordability Index',
    xlabel='Liveability Score',
    ylabel='Rental Affordability Index',
)
ax.grid(True)
plt.show()

# Correlation Analysis
# pearsonr yields (coefficient, p-value); only the coefficient is reported here.
pearson_result = pearsonr(merged_df['percentage'], merged_df['rai'])
correlation = pearson_result[0]
print(f'Pearson correlation coefficient: {correlation:.2f}')

# Regression Analysis
# Ordinary least squares of 'rai' on 'percentage'; add_constant appends the
# intercept column to the design matrix.
y = merged_df['rai']
X = sm.add_constant(merged_df['percentage'])
ols_spec = sm.OLS(endog=y, exog=X)
model = ols_spec.fit()
# In-sample predictions, reused by the regression-line and residual plots.
predictions = model.predict(X)

# Regression Line Plot
# Observed points with the fitted OLS predictions overlaid in red.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=merged_df,
    x='percentage',
    y='rai',
    label='Data Points',
    ax=ax,
)
ax.plot(
    merged_df['percentage'],
    predictions,
    color='red',
    label='Regression Line',
)
ax.set(
    title='Regression Line: Liveability Score vs Rental Affordability Index',
    xlabel='Liveability Score',
    ylabel='Rental Affordability Index',
)
ax.legend()
ax.grid(True)
plt.show()

# Residual Plot
# Residuals are observed minus predicted values from the fitted model.
residuals = y - predictions
fig, ax = plt.subplots(figsize=(10, 6))
# NOTE(review): seaborn's residplot fits its own regression of y on x before
# plotting, and here y is already the residual series -- confirm this double
# step is intended rather than passing the raw 'rai' column.
sns.residplot(
    x=merged_df['percentage'],
    y=residuals,
    lowess=True,
    color="g",
    ax=ax,
)
ax.set(title='Residual Plot', xlabel='Liveability Score', ylabel='Residuals')
ax.grid(True)
plt.show()

# Summary of Regression
# Build the fitted model's summary report, then print it.
regression_summary = model.summary()
print(regression_summary)
