In [40]:
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import numpy as np
import matplotlib.pyplot as plt
# Raise the pandas row-display limit to 4000 rows.
pd.set_option('display.max_rows', 4000)

In [41]:
# Columns retained from the curated CSV: identifier, property attributes, price,
# and the closest_* distance/duration columns.
selected_columns = [
    'id', 'propertyType', 'furnished', 'price',
    'bedrooms', 'bathrooms', 'parkingSpaces', 'studies',
    'closest_primary_distance', 'closest_primary_duration',
    'closest_secondary_distance', 'closest_secondary_duration',
    'closest_train_distance', 'closest_train_duration',
    'closest_tram_distance', 'closest_tram_duration',
    'closest_bus_distance', 'closest_bus_duration',
    'closest_park_distance', 'closest_park_duration',
]

# Read the full file, then keep only the selected columns in the order listed above.
df = pd.read_csv('../data/curated/realestate_with_closest_distance_duration.csv')
df = df[selected_columns]


In [42]:
# Transform the categorical features 'propertyType' and 'furnished' into
# integer codes.
# Each column gets its own fitted LabelEncoder, stored in `label_encoders`, so
# the code -> category mapping of every column remains available through
# `.classes_` / `.inverse_transform`. Refitting a single shared encoder would
# discard the mapping of every column except the last one.
# NOTE(review): LabelEncoder assigns codes in sorted class order; if
# 'propertyType' is a nominal feature that order is arbitrary, so treat its
# Pearson correlation with caution — confirm.
label_encoders = {}
for column in ['propertyType', 'furnished']:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder
# After the loop `label_encoder` is the encoder fitted on 'furnished'.


In [43]:
# Pairwise Pearson correlation between every pair of columns in df.
corr_mat = df.corr(method='pearson')

# Boolean mask for the strict upper triangle (k=1 excludes the diagonal), so
# each column pair is kept once and self-correlations are masked out.
upper_triangle_mask = np.triu(np.ones(corr_mat.shape, dtype=np.bool_), k=1)
upper_corr_mat = corr_mat.where(upper_triangle_mask)

# Flatten to a Series indexed by column pair, drop the masked (NaN) cells,
# and order the remaining correlations from most negative to most positive.
unique_corr_pairs = upper_corr_mat.unstack().dropna()
sorted_mat = unique_corr_pairs.sort_values()
print(sorted_mat)

bedrooms                    furnished                    -0.205907
parkingSpaces               furnished                    -0.194681
closest_tram_distance       price                        -0.189388
closest_park_duration       price                        -0.182001
closest_tram_duration       price                        -0.178838
closest_park_distance       price                        -0.173850
                            furnished                    -0.158020
closest_tram_duration       furnished                    -0.147528
closest_park_duration       furnished                    -0.145727
closest_tram_distance       furnished                    -0.135649
furnished                   propertyType                 -0.129868
                            id                           -0.124430
closest_train_duration      furnished                    -0.094646
                            price                        -0.094285
closest_train_distance      price                        -0.09

Summary: 
In this preliminary analysis, which considers only the pairwise Pearson correlation of each single feature with price, the features most strongly correlated with price are bedrooms and bathrooms, which is not surprising.
Moreover, we found that the duration and distance features are highly correlated with each other, so they carry largely redundant information; feature selection should therefore be carried out in the modelling step.