In [2]:
#Exploratory Data Analysis (EDA)
#Created 06/20/22 10:00am
#Author: Kahin Akram Hassan

#====================================Docs=====================================
%matplotlib inline
import sys
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
import numpy as np

#Dash 
from dash import Dash, html, dcc, Input, Output
import dash_bootstrap_components as dbc
import geopandas as geopd
import yaml

sys.path.append("../")
from src.data_prep import data_helpers as data
#Dash layouts and callbacks
from src.dash_prep import layouts as layout, callbacks

ModuleNotFoundError: No module named 'src'

In [None]:
# Viewport meta tag sent with the page (device-width, initial scale of 1).
viewport_meta = {
    'name': 'viewport',
    'content': 'width=device-width, initial-scale=1',
}

# Create the Dash application with the Zephyr bootstrap theme.
# Callback exceptions are suppressed -- presumably because the page content
# is injected into "page-content" after the initial layout is built
# (NOTE(review): confirm against src.dash_prep.callbacks).
app = Dash(
    __name__,
    external_stylesheets=[dbc.themes.ZEPHYR],
    meta_tags=[viewport_meta],
    suppress_callback_exceptions=True,
)

# Static navigation bar defined in the layouts module.
navbar = layout.navbar
# Empty placeholder for the dynamic part of the app.
content = html.Div(id="page-content")

# Page skeleton: URL tracker, navbar and the placeholder, in a non-fluid container.
page_children = [
    dcc.Location(id="url", refresh=False),
    navbar,
    content,
]
app.layout = dbc.Container(children=page_children, fluid=False)



In [None]:
# Fetch and load the data.
# Both helpers come from src.data_prep.data_helpers (imported as `data`):
#   1. load_housing_data() is called for its side effect only -- its return
#      value is discarded (presumably it fetches the raw files; TODO confirm).
#   2. read_csv_file() returns the dataset that is bound to `housing`
#      (used as a DataFrame by the cells below).
data.load_housing_data()
housing = data.read_csv_file()

In [None]:
# Partition the full dataset into a train part and a test part.
# Arguments are passed positionally as (housing, 0.2, 'id') -- presumably a
# 20% test ratio keyed on the 'id' column; verify against data_helpers.
train_set, test_set = data.split_train_test_id(housing, 0.2, 'id')

# Report the size of each partition.
n_train, n_test = len(train_set), len(test_set)
print(f"{n_train} Train +  {n_test} test")

In [None]:
# Stratified sampling based on the income category.
# The helper receives the full `housing` dataset, 0.2 and the column name
# "income_cat", and returns a (train, test) pair.
# NOTE(review): 0.2 is presumably the test-set ratio, and "income_cat" must be
# available to the helper (either already in `housing` or created by it) --
# confirm in src.data_prep.data_helpers.
strat_train_set, strat_test_set = data.stratisfied_split_train_test(housing,0.2,"income_cat")

In [None]:
# Rebind `housing` to a copy of the stratified training set: from this cell
# on, `housing` no longer refers to the full dataset, and the exploration
# below operates on the copy rather than on `strat_train_set` itself.
housing = strat_train_set.copy()

In [None]:
# Read the Mapbox access token from the local YAML config file.
# NOTE(review): the parsed value is later handed straight to
# px.set_mapbox_access_token, so the file presumably holds a bare string
# rather than a mapping -- confirm the file layout.
with open("../config/mapbox_token.yml") as token:
    # full_load is PyYAML's shorthand for load(..., Loader=yaml.FullLoader).
    mapbox_token = yaml.full_load(token)
    

In [None]:
    
# Register the Mapbox token with Plotly Express before drawing the map.
px.set_mapbox_access_token(mapbox_token)

# Marker size: the district's population, scaled down by a factor of 100.
marker_size = housing["population"] / 100
# Initial centre of the map view.
map_center = {"lat": 37.5, "lon": -120.0}

# One marker per row of `housing`, coloured by median house value.
fig = px.scatter_mapbox(
    housing,
    lat="latitude",
    lon="longitude",
    color="median_house_value",
    size=marker_size,
    size_max=15,
    opacity=0.5,
    color_continuous_scale=px.colors.cyclical.IceFire,
    center=map_center,
    zoom=4,
    width=800,
)
fig.show()

In [None]:
# Let's look into the correlations.
# Compute the pairwise correlation matrix on the numeric columns only.
# The columns are selected explicitly because pandas >= 2.0 no longer drops
# non-numeric columns silently in DataFrame.corr() and raises on text columns;
# select_dtypes keeps the cell working on both old and new pandas releases.
df_corr = housing.select_dtypes(include=["number", "bool"]).corr()

# Mask the upper triangle (diagonal included) so that only half of the
# symmetric matrix is shown in the heatmap; masked cells become NaN.
mask = np.triu(np.ones_like(df_corr, dtype=bool))
df_corr = df_corr.mask(mask)

# Remove 'id', 'longitude' and 'latitude' from both axes. Dropping the same
# labels from rows and columns keeps the matrix square and triangular.
excluded = ['id', 'longitude', 'latitude']
df_corr = df_corr.drop(index=excluded, columns=excluded)

# Plain 2-D array of the remaining correlations, used as the heatmap values.
z = df_corr.to_numpy()

In [None]:

# Annotated heatmap of the (lower-triangle) correlation matrix.
# Each cell is labelled with its correlation rounded to two decimals.
fig = ff.create_annotated_heatmap(
    x=df_corr.columns.values.tolist(),
    y=df_corr.index.values.tolist(),
    z=z,
    annotation_text=np.around(z, decimals=2),
    colorscale='rdBu',
    showscale=True,
    reversescale=True,
)

# The masked cells of `z` are NaN and would otherwise be labelled with the
# literal text "nan"; blank those annotations so the hidden half stays empty.
for annotation in fig.layout.annotations:
    if annotation.text == 'nan':
        annotation.text = ''

# Put the x-axis labels at the bottom of the heatmap.
fig.update_xaxes(side="bottom")
fig.update_layout(
    title_text='Correlation between variables for train dataset',
    title_x=0.5,
    width=1200,
    height=400,
    xaxis_showgrid=False,
    yaxis_showgrid=False,
    xaxis_zeroline=False,
    yaxis_zeroline=False,
    # Reverse the y-axis so the first row of the matrix is drawn at the top.
    yaxis_autorange='reversed',
    template='plotly_dark'
)


fig.show()

In [None]:
if __name__=='__main__':
    # Start the Dash development server on port 8050 with the reloader off.
    # `app.run_server` was removed in Dash 3 in favour of `app.run`, while
    # older Dash releases may only provide `run_server`; use whichever exists.
    start_server = app.run if hasattr(app, "run") else app.run_server
    start_server(port=8050, use_reloader=False)