In [None]:
# Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor

# Read both curated CSVs in one pass: the full preprocessed table and its
# companion "num" table. These frames feed the visualisations below.
data_full, data_num = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/curated/final_data_full.csv',
        'data/curated/final_data_num.csv',
    )
)

1. Feature importance graph

In [2]:
# Features: every column except the target, restricted to numeric/boolean dtypes.
# RandomForestRegressor only accepts numeric input, so any text column left in the
# frame (e.g. a suburb name, if stored as a string) would make fit() raise.
X = data_full.drop(columns='price').select_dtypes(include=['number', 'bool'])
y = data_full['price'].values

# Training the random forest model to identify which features contribute the most to the prediction
# (random_state is fixed so the importances are reproducible across runs)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X, y)

# Deriving feature importance scores from the fitted Random Forest model
importance_scores = rf_model.feature_importances_

# Pairing each feature name with its importance score and ranking them so that
# the most important features appear first
feature_importance = (
    pd.DataFrame({'Feature': X.columns, 'Importance': importance_scores})
    .sort_values('Importance', ascending=False)
)

# Plotting the feature importance graph
fig, ax = plt.subplots(figsize=(8, 6))
bars = ax.bar(feature_importance['Feature'], feature_importance['Importance'], color='#FF9999')

# Annotating each bar with its importance value, centred inside the bar
# (va='center' matches the other bar charts in this notebook)
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2, height / 2, f'{height:.2f}', ha='center', va='center', color='black')

# Setting titles and labels
ax.set_title('Feature Importance Graph (Predicting Rental Price)')
ax.set_xlabel('Feature')
ax.set_ylabel('Importance')
ax.tick_params(axis='x', labelrotation=45)

# Adjusting layout to prevent overlap
fig.tight_layout()

# Show the plot to display the relative importance of each feature used in the random forest model
plt.show()

2. Correlation heatmap between features

In [None]:
# Calculating the correlation matrix over numeric/boolean columns only.
# Since pandas 2.0, DataFrame.corr() no longer drops non-numeric columns silently
# and raises if any are present, so they are filtered out explicitly here.
corr_matrix = data_full.select_dtypes(include=['number', 'bool']).corr()

# Plotting the heatmap; each cell is annotated with its coefficient to 2 decimals
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt='.2f', ax=ax)
ax.set_title('Correlation Heatmap Between Features')
plt.show()

3. Median rental prices of top 7 liveable and affordable suburbs

In [8]:
# Median rental price per suburb, ordered from cheapest to most expensive
median_prices = (
    data_full
    .groupby('suburb')['price']
    .median()
    .sort_values(ascending=True)
)

# Only the seven suburbs with the lowest median rent are charted
cheapest_suburbs = median_prices.iloc[:7]

# One figure, one axes: a bar per suburb
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(cheapest_suburbs.index, cheapest_suburbs.values, color='#FF6666')

# Title and axis labels
ax.set_title('Median Rental Prices of Top 7 Liveable and Affordable Suburbs')
ax.set_xlabel('Suburb')
ax.set_ylabel('Median Rental Price ($)')

# Tilt the suburb names so they stay readable
plt.setp(ax.get_xticklabels(), rotation=45)

# Write the exact median price in the middle of each bar
for rect in bars:
    bar_height = rect.get_height()
    label_x = rect.get_x() + rect.get_width() / 2.0
    ax.text(label_x, bar_height / 2, f'{bar_height:.2f}', ha='center', va='center', color='black')

# Tighten the layout, then render the chart
fig.tight_layout()
plt.show()

4. Liveability scores of top 7 affordable suburbs

In [None]:
# Calculate a liveability score for each row as the sum of nearby amenities:
# schools, parks, shopping centers, and stations.
#
# `data_liveability` is built here from `data_full` so the cell works on a fresh
# kernel (it was previously used without ever being defined in this notebook).
# `.assign` returns a new frame, leaving `data_full` untouched and making the cell
# safe to re-run.
# NOTE(review): assumes the four *_count columns are present in
# final_data_full.csv — TODO confirm, or load the correct source file here.
data_liveability = data_full.assign(
    liveability_score=lambda df: (
        df['school_count']
        + df['park_count']
        + df['shopping_center_count']
        + df['station_count']
    )
)

# Grouping by suburb to calculate the total liveability score per suburb
liveability_scores = data_liveability.groupby('suburb')['liveability_score'].sum()

# Keeping the 7 suburbs with the highest total liveability score.
# NOTE(review): this ranks purely by liveability across all suburbs; no affordability
# filter is applied even though the chart title says "Affordable". Also, summing per
# row means suburbs with more rows score higher — confirm both are intended.
top_liveability_scores = liveability_scores.sort_values(ascending=False).head(7)

# Visualizing the liveability scores of these top 7 suburbs as a bar chart with
# suburbs on the x-axis and their respective liveability scores on the y-axis
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(top_liveability_scores.index, top_liveability_scores.values, color='#66B2FF')

# Setting the title and labels for the graph
ax.set_title('Liveability Scores of Top 7 Affordable Suburbs')
ax.set_xlabel('Suburb')
ax.set_ylabel('Liveability Score')
ax.tick_params(axis='x', labelrotation=45)

# Adding labels to each bar to show the exact liveability score value
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2.0, height / 2, f'{height:.1f}', ha='center', va='center', color='black')

# Displaying the plot with proper layout
fig.tight_layout()
plt.show()