# EDA

In [None]:
#import necessary libraries

#import warnings
#warnings.filterwarnings("ignore")

import json
from urllib.request import urlopen

import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns

# from matplotlib.ticker import PercentFormatter
# plt.rcParams.update({ "figure.figsize" : (8, 5),"axes.facecolor" : "white", "axes.edgecolor":  "black"})
# plt.rcParams["figure.facecolor"]= "w"
# pd.plotting.register_matplotlib_converters()

#round all floats to 3 decimals
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [None]:
#loading data into DataFrame
df_housing = pd.read_csv('data/King_County_House_prices_dataset.csv')

# # as column 'condition' is truncated in the display, disable column truncation
# pd.set_option('display.max_columns', None) 
# pd.set_option('display.width', None) 
# pd.set_option('display.max_colwidth', -1)

In [None]:
#examining the data
df_housing.head()

In [None]:
# Check for duplicates - no duplicates!
df_housing.duplicated().value_counts()

In [None]:
df_housing.shape

In [None]:
df_housing.info()
#21597 entries, 21 columns
# there are NaN entries

In [None]:
# how is the data indexed?
# standard numerical indexes starting at 0 with step=1, last index = 21596
df_housing.index

In [None]:
# check data types in data frame
#df_housing.dtypes

In [None]:
df_housing.describe()

Observations:
1. There is an outlier 33 in the bedrooms column.
1. There might be an outlier 8 in the bathrooms column.
1. What is the grading system?
1. what does 'view' mean?
1. How is the condition rated?
2. Missing values in the waterfront, view, yr_renovated columns
1. sqft_basement has non-numerical entries (e.g. ?)
5. The date column is in string format and needs to be changed to datetime
8. df.describe is not of much help as it doesn't show columns with missing data (NaN)
1. column names are already in Snake Case, no renaming necessary
1. What are half floors?
1. What are .25 bathrooms?

In [None]:
#look at outlier bedrooms
df_housing.query("bedrooms==33")
#conclusion: does not match size of house, faulty data, line should be removed

In [None]:
df_housing.query("bedrooms==33").index

In [None]:
# The 33-bedroom listing is faulty data: locate it, drop it,
# and renumber the rows so the index is gap-free again.
faulty_rows = df_housing.index[df_housing["bedrooms"] == 33]
df_housing = df_housing.drop(index=faulty_rows).reset_index(drop=True)

In [None]:
df_housing.shape

In [None]:
df_housing.describe()

In [None]:
df_housing.query("bedrooms==7 and bathrooms==1")
# bedrooms fit sqft_living, data remains

In [None]:
#look at outlier bedrooms
df_housing.query("bedrooms>=7").describe()
# nothing suspicious, data remains. 
# House with 7 bedrooms, 1 bathroom: fits sqft_living, data remains

In [None]:
#look at outlier bathrooms
df_housing.query("bathrooms==8")
#conclusion: matches large number of bedrooms and very high price, data should remain in the data set

In [None]:
#look at outlier sqft_living
df_housing.query("sqft_living==13540")
#conclusion: matches large number of bedrooms, bathrooms and very high price, data should remain in the data set

In [None]:
df_housing.describe()

In [None]:
# change "date" dtype from string to datetime;
# the raw strings are parsed as month/day/year (format '%m/%d/%Y')
df_housing['date'] = pd.to_datetime(df_housing['date'], format='%m/%d/%Y')

In [None]:
# How to display only the year???
# change "yr_built" dtype to datetime with format %Y
# df_housing['yr_built'] = pd.to_datetime(df_housing['yr_built'], format='%Y')

In [None]:
# how to convert yr_renovated? wrong format 0, NaN
# change "yr_renovated" dtype to datetime with format %Y
# not relevant for Larry!
#df_housing['yr_renovated'] = pd.to_datetime(df_housing['yr_renovated'], format='%Y')

In [None]:
# Take a new look
df_housing.head()

In [None]:
# # sqft_basement contains string objects, we want floats.
# # try to convert it to get error message:
# # not relevant for Larry
# df_housing = df_housing.astype({'sqft_basement': float})
# # we get '?' as non convertable entry

In [None]:
#look at rows with '?' entry in sqft_basement column
df_housing.query('sqft_basement == "?"')

In [None]:
# '?' marks an unknown basement size: turn it into a proper missing value (NaN).
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
# astype(float) is kept strict on purpose: any other non-numeric entry should still raise.
df_housing['sqft_basement'] = df_housing['sqft_basement'].replace('?', np.nan).astype(float)
df_housing['sqft_basement'].dtype

In [None]:
df_housing.info()

In [None]:
# looking at missing values
# display number of missing values per column
df_housing.isna().sum()
# Only waterfront is relevant for Larry's wishes. 
# It is reasonable to assume that for the houses with NaN, 
# it is not known whether they are located at the waterfront.
# the missing values cannot be imputed.
# the other columns can be deleted, no imputation necessary.

In [None]:
# Share of missing values: first per affected column, then for the frame as a whole.
n_rows = df_housing.shape[0]
print(f"numbers of rows : {n_rows}")
for col in ["waterfront", "view", "sqft_basement", "yr_renovated"]:
    # bracket access instead of attribute access, so a column name can never be
    # shadowed by a DataFrame attribute of the same name
    pct_missing = df_housing[col].isna().sum() / n_rows * 100
    print(f"missing values in {col} : {round(pct_missing, 2)} %")

# isna().sum() gives the number of NaN values per column, the second .sum() adds these up;
# the denominator is the total number of cells (rows * columns)
pct_missing_total = df_housing.isna().sum().sum() / (n_rows * df_housing.shape[1]) * 100
print(f"missing values in data frame : {round(pct_missing_total, 2)} %")

In [None]:
# plotting percentage of missing values per column
msno.bar(df_housing)

In [None]:
msno.matrix(df_housing)

In [None]:
df_housing.sort_values('date')

## categorical data:

- id
- date
- bedrooms
- bathrooms
- sqft_living
- sqft_lot
- floors
- waterfront
- view
- condition
- grade
- yr_built
- yr_renovated
- zipcode

## continuous data:

- price
- sqft_above
- sqft_basement
- lat
- long
- sqft_living15
- sqft_lot15

## Research Questions and Hypothesis Generation

### Questions with belonging hypotheses and their indicators:

* Does the location of a house affect the price?
    1. The closer a house is to the city center, the higher the price (geolocation)
    1. If a house is located close to water, then the price is higher (waterfront(yes/no)) 
    1. If the house is located in a "good" zip code, the price is higher (zipcode)

* Does the size of a house affect the price?
    1. The more bedrooms a house has, the higher the price (bedrooms)
    1. The higher the square footage of the house, the higher the price (sqft_living)
    1. The more bathrooms the house has, the higher the price
    1. The more floors the house has, the higher the price (floors)
    1. If the house has a basement, the price is higher (sqft_basement)
    
* Does the interest in a house affect the price?
    1. The higher the number of views, the higher the price (view)

* Does the state the house is in affect the price?
    1. The better the overall condition of the house, the higher the price (condition)
    1. The better the grade, the higher the price (grade)
    1. The newer the renovation status, the higher the price (yr_renovated)

### Questions relevant for Client

Client: Larry Sanders, Buyer.
Characteristics: Waterfront , limited budget, nice & isolated but central neighborhood without kids (but got some of his own, just doesn't want his kids to play with other kids .. because of germs)

* How to parametrize "nice & isolated but central"? Assumptions:
    1. research zip codes, which are close to center? Population density?
    1. condition could be an indicator for a "nice" house
    1. Assumption: Isolated: Refers to sqft_lot size, some e.g. lawn surrounding the house? and sqft_lot15

* Assumption: Has several kids, hence needs a house with at least 2 bedrooms. 

* Are there houses that meet all the criteria?
    1. Are there limited budget houses at the waterfront?
    1. Are there isolated but central houses?


* Room for kids: Does the size of a house affect the price?
    1. The more bedrooms a house has, the higher the price (bedrooms)

* Nice house: Does the state the house is in affect the price?
    1. The better the overall condition of the house, the higher the price (condition)
    1. The better the grade, the higher the price (grade)
    1. The newer the renovation status, the higher the price (yr_renovated)

* Waterfront, central: Does the location of a house affect the price?
    1. The closer a house is to the city center, the higher the price (geolocation, zipcode)
    1. If a house is located at the waterfront, then the price is higher (waterfront(yes/no)) 

* Does the size of the lot affect the price?
    1. The larger the lot, the higher the price. (sqft_lot; Assumption: Isolated: Refers to sqft_lot size, some e.g. lawn surrounding the house?)

* Isolated: Look at sqft_lot and sqft_lot15? Starting where isolated?
* No kids: Maybe school info per zip code?
* Limited budget: Are there houses that meet all the criteria? Assumption: Below or up to median or some other percentile?

* Grade: Classification by construction quality which refers to the types of materials used and the quality of workmanship. Buildings of better quality (higher grade) cost more to build per unit of measure and command higher value. (See Glossary in Area Report for Residential Building Grades in use by the King County Department of Assessments.)
* View: An index from 0 to 4 of how good the view of the property was.
* Condition: Condition of the house, ranked from 1 to 5.

In [None]:
# how are the houses with(out) waterfront distributed?
df_housing.waterfront.hist();

In [None]:
counts_waterfront = df_housing.waterfront.value_counts(dropna=False)
counts_waterfront.plot.bar(title='Waterfront Yes/No')
#plt.hist(df_housing.waterfront.replace(np.nan, "unknown"))
plt.show()

In [None]:
# Tabulate the waterfront counts with readable labels.
# The labels are derived from the values themselves (0 -> no, 1 -> yes, NaN -> unknown)
# instead of from the row position: value_counts() orders rows by frequency, so
# position-based labels would silently mislabel the rows if the counts ever changed order.
df_waterfront = counts_waterfront.rename_axis('waterfront').reset_index(name='count')
df_waterfront['waterfront'] = df_waterfront['waterfront'].map({0: 'no', 1: 'yes'}).fillna('unknown')
df_waterfront

In [None]:
sns.barplot(data=df_waterfront, x='waterfront', y='count')


In [None]:
# number of waterfront houses compared to total
# (houses with unknown waterfront status count towards the total, not towards the waterfront houses)
waterfront_houses = int((df_housing["waterfront"] == 1).sum())
total_houses = df_housing.shape[0]
waterfront_share = waterfront_houses / total_houses * 100
# the share is formatted to two decimals instead of printing the raw float
print(
    f"There are {waterfront_houses} waterfront houses and {total_houses} houses in total. "
    f"Hence, only {waterfront_share:.2f} % of the houses are waterfront houses. "
    "This wish highly restricts the available houses."
)

In [None]:
cor_price_water = df_housing.price.corr(df_housing.waterfront)
cor_price_water

In [None]:
# Hypothesis: The more bedrooms a house has, the higher the price (bedrooms)
# Points are coloured by waterfront status; labels are set on the returned Axes.
ax = sns.scatterplot(x='bedrooms', y='price', hue="waterfront", data=df_housing)
ax.set_xlabel('Number of Bedrooms')
ax.set_ylabel('Price')
ax.set_title('Bedrooms vs. Price');

In [None]:
df_housing.groupby("waterfront", dropna=False).count()
#df_housing.query("waterfront == np.NaN")

In [None]:
sns.scatterplot(data=df_housing, x='sqft_living', y='price', hue="waterfront")
plt.xlabel('sqft Living')
plt.ylabel('Price')
plt.title('sqft Living vs. Price');
sns.lmplot(data=df_housing, x='sqft_living', y='price', hue="waterfront")
plt.xlabel('sqft Living')
plt.ylabel('Price')
plt.title('sqft Living vs. Price');

In [None]:
# Summary statistics of price and bedrooms for houses at / away from the waterfront.
# Several columns have to be selected with a list: the tuple form
# groupby(...)["price", "bedrooms"] was deprecated and raises in pandas >= 2.0.
df_housing.groupby("waterfront")[["price", "bedrooms"]].describe()
#bedroom data comparable at/away from waterfront
#price is higher when at waterfront

In [None]:
#pd.plotting.scatter_matrix(df_housing[["price", "waterfront", "condition", "grade", "bedrooms", "sqft_lot"]]);
sns.pairplot(df_housing[["price", "waterfront", "condition", "grade", "bedrooms", "sqft_lot", "sqft_lot15"]], dropna=False)

In [None]:
#add column price per sqft_living
df_housing["price_per_sqft_living"] = df_housing.price / df_housing.sqft_living

In [None]:
# Pairwise correlations, restricted explicitly to the numeric columns.
# Since pandas 2.0, DataFrame.corr() no longer drops non-numeric columns by default,
# so the datetime `date` column must be excluded by hand to keep the table purely numeric.
df_housing.select_dtypes(include="number").corr().style.bar(align='zero',color=["orange"])

In [None]:
# Hypothesis: The better the overall condition /the higher the grade of the house, the higher the price
sns.scatterplot(data=df_housing, x='condition', y='price', hue="grade")
plt.show()
sns.scatterplot(data=df_housing, x='grade', y='price', hue="condition")
plt.show()
sns.lmplot(data=df_housing, x='grade', y='price')#, hue="condition");

In [None]:
# Hypothesis: The better the overall condition /the higher the grade of the house, the higher the price
sns.scatterplot(data=df_housing, x='condition', y='price_per_sqft_living', hue="grade")
plt.show()
sns.scatterplot(data=df_housing, x='grade', y='price_per_sqft_living', hue="condition")
plt.show()
sns.lmplot(data=df_housing, x='grade', y='price_per_sqft_living')#, hue="condition");

In [None]:
df_housing.query("sqft_living > sqft_lot")#.value_counts()

## Answers to Hypotheses

1. the price is higher for houses at the waterfront. Yes.
1. the more bedrooms, the higher the price: true for waterfront housing. Only true up to and including 5 bedrooms in general
1. The better the overall condition /the higher the grade of the house, the higher the price. True for price/grade: correlation of 0.667951. not true for price/condition: correlation of 0.036056
1.

## Assumptions for Larry:

* Condition: An index from 1 to 5 on the condition of the apartment (overall). 
Seems to be no correlation, but for Larry we can decide on only looking at Condition 3 and up
since he wants a nice house

* Grade: An index from 1 to 13, 
where 1-3 falls short of building construction and design, 
7 has an average level of construction and design, 
and 11-13 have a high quality level of construction and design.
Highly correlated to price. For Larry, we will only consider grade 7 and up housing since he wants a nice house

* Isolated housing


In [None]:
#dropping columns not of relevance to Larry's needs
df_Larry = df_housing.drop(['bathrooms', 'floors', 'view', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated'], axis = 1)
df_Larry

In [None]:
# NOTE: the three objects below are not created in this notebook. `%store -r` restores
# variables that were saved earlier with `%store <name>` — presumably by a separate
# preprocessing notebook that has to be run first (TODO: confirm and link it here).
# get schools per zip code
%store -r dict_school_zip
print(dict_school_zip)
# get population density rank of zipcodes
%store -r dict_pop_rk
print(dict_pop_rk)
# population density table; used further down to list the most densely populated zip codes
%store -r df_pop_density_short

In [None]:
pd.Series(dict_school_zip.values()).value_counts().describe()

In [None]:
np.unique(np.array(sorted(dict_school_zip.values())))


In [None]:
sorted(dict_school_zip.values())

In [None]:
df_Larry['density_rank']= df_Larry.zipcode.map(dict_pop_rk)
df_Larry['schools_per_zip'] = df_Larry.zipcode.map(dict_school_zip)
df_Larry

In [None]:
sns.scatterplot(data=df_Larry, x='density_rank', y='price', hue='schools_per_zip');

In [None]:
sns.lineplot(data=df_Larry, x='density_rank', y='schools_per_zip');

In [None]:
# Pairwise correlations, restricted explicitly to the numeric columns.
# Since pandas 2.0, DataFrame.corr() no longer drops non-numeric columns by default,
# so the datetime `date` column must be excluded by hand to keep the table purely numeric.
df_Larry.select_dtypes(include="number").corr().style.bar(align='zero',color=["orange"])

## A Function for Larry
points for:

* waterfront yes
* limited budget: price not larger than median (not mean since too highly influenced by extreme values), i.e. 450000 $
* nice: 
    - Condition: An index from 1 to 5 on the condition of the apartment (overall). Seems to be no correlation, but for Larry we can decide on only looking at Condition 3 and up since he wants a nice house.
    - Grade: An index from 1 to 13, where 1-3 falls short of building construction and design, 7 has an average level of construction and design, and 11-13 have a high quality level of construction and design. Highly correlated to price. For Larry, we will only consider grade 7 and up housing since he wants a nice house.
    - to weigh every wish equally, both measure 0.5
* isolated: 
    - sqft_lot above median so that house has above average distance to neighbor, i.e. 7619 sqft
    - sqft_lot15 above median so that 15 nearest neighbors also have above average distance, i.e. 7620 sqft
    - to weigh every wish equally, both measure 0.5
* central: when in top 15 (or max_rank) population density zip codes, i.e. if density_rank <= 15
* room for kids: bedrooms >= 2
* neighborhood without kids: no more than 2 (1st quartile) schools in the zip code

In [None]:
# collect numbers needed for function
price_limit = df_housing.price.median()
price_limit

In [None]:
lot_min = df_housing.sqft_lot.median()
lot_min

In [None]:
lot15_min = df_housing.sqft_lot15.median()
lot15_min

In [None]:
# list of top 15 population density rank zip codes:
# keep the first 15 rows of the density table and collect their zip entries
# (assumes df_pop_density_short is already ordered by density rank — TODO confirm)
ls_central = df_pop_density_short.head(15)["zip"].tolist()
ls_central

In [None]:
def larrys_wishes(waterfront, price, condition, grade, sqft_lot, sqft_lot15, density_rank, bedrooms, schools_per_zip,
                  price_limit=450000, lot_min=7619, lot15_min=7620.0, max_rank=15, max_schools=2,
                  min_condition=3, min_grade=7, min_bedrooms=2):
    """Score one house against Larry's wish list (strict version).

    Every fulfilled wish adds points; "nice" (condition, grade) and "isolated"
    (sqft_lot, sqft_lot15) are each split into two half-point criteria so that
    every wish carries the same total weight. The maximum score is 7.

    Parameters
    ----------
    waterfront : 1 if the house is at the waterfront; any other value (0, NaN) scores nothing.
    price : sale price; scores if ``price <= price_limit``.
    condition, grade : score 0.5 each if ``>= min_condition`` / ``>= min_grade``.
    sqft_lot, sqft_lot15 : score 0.5 each if strictly greater than ``lot_min`` / ``lot15_min``.
    density_rank : population density rank of the zip code; scores if ``<= max_rank``.
    bedrooms : scores if ``>= min_bedrooms``.
    schools_per_zip : number of schools in the zip code; scores if ``<= max_schools``.
    price_limit, lot_min, lot15_min, max_rank, max_schools, min_condition, min_grade, min_bedrooms :
        thresholds for the criteria above; the defaults reproduce the original hard-coded values.

    Returns
    -------
    int or float
        Sum of the points of all fulfilled criteria (0 if none is fulfilled).

    Notes
    -----
    A NaN input never fulfils its criterion, because every comparison with NaN is False.
    """
    # (criterion fulfilled?, points awarded) — one entry per wish, in scoring order
    criteria = [
        (waterfront == 1, 1),
        (price <= price_limit, 1),
        (condition >= min_condition, 0.5),
        (grade >= min_grade, 0.5),
        (sqft_lot > lot_min, 0.5),
        (sqft_lot15 > lot15_min, 0.5),
        (density_rank <= max_rank, 1),
        (bedrooms >= min_bedrooms, 1),
        (schools_per_zip <= max_schools, 1),
    ]
    return sum(weight for fulfilled, weight in criteria if fulfilled)

In [None]:
df_Larry["wish_points"] = df_Larry.apply(lambda row: larrys_wishes(row.waterfront, row.price, row.condition, row.grade, row.sqft_lot, row.sqft_lot15, row.density_rank, row.bedrooms, row.schools_per_zip), axis=1)

In [None]:
df_Larry.head()

In [None]:
df_Larry.describe()['wish_points']

In [None]:
# df_Larry.sort_values(['wish_points'], ascending=False).head()

In [None]:
(
df_Larry.groupby("zipcode")
    .mean()
    .sort_values(['wish_points'], ascending=False)
)

The maximum value of points attained by houses sold is 5. A house fulfilling all of Larry's wishes should have 7 points. This means that not a single house meeting all of Larry's criteria has been sold.

Hence, we will relax the conditions in the function in order to get a more realistic picture:
points for:

* waterfront yes
* limited budget: price not larger than median (not mean since too highly influenced by extreme values), i.e. 450000 $
* nice: 
    - Condition: An index from 1 to 5 on the condition of the apartment (overall). Seems to be no correlation, but for Larry we can decide on only looking at Condition 2 and up since he wants a nice house.
    - Grade: An index from 1 to 13, where 1-3 falls short of building construction and design, 7 has an average level of construction and design, and 11-13 have a high quality level of construction and design. Highly correlated to price. For Larry, we will only consider grade 6 and up housing since he wants a nice house.
    - to weigh every wish equally, both measure 0.5
* isolated: 
    - sqft_lot above 40th percentile so that house has sufficient distance to neighbor, i.e. >= 6745 sqft
    - sqft_lot15 above 40th percentile so that 15 nearest neighbors also have sufficient distance, i.e. >= 6750 sqft
    - to weigh every wish equally, both measure 0.5
* central: when in top 20 (or max_rank) population density zip codes, i.e. if density_rank <= 20
* room for kids: bedrooms >= 2
* neighborhood without kids: no more than 4 (median) schools in the zip code

In [None]:
#get value of 40th percentile of lot size
df_Larry[['sqft_lot', 'sqft_lot15']].quantile(.4)

In [None]:
def larrys_wishes_rel(waterfront, price, condition, grade, sqft_lot, sqft_lot15, density_rank, bedrooms, schools_per_zip,
                      price_limit=450000, lot_min=6745, lot15_min=6750, max_rank=20, max_schools=4,
                      min_condition=2, min_grade=6, min_bedrooms=2):
    """Score one house against Larry's relaxed wish list.

    Same scoring scheme as the strict version, with looser default thresholds.
    Every fulfilled wish adds points; condition/grade and sqft_lot/sqft_lot15
    are half-point criteria. The maximum score is 7.

    Parameters
    ----------
    waterfront : 1 if the house is at the waterfront; any other value (0, NaN) scores nothing.
    price : sale price; scores if ``price <= price_limit``.
    condition, grade : score 0.5 each if ``>= min_condition`` / ``>= min_grade``.
    sqft_lot, sqft_lot15 : score 0.5 each if strictly greater than ``lot_min`` / ``lot15_min``.
    density_rank : population density rank of the zip code; scores if ``<= max_rank``.
    bedrooms : scores if ``>= min_bedrooms``.
    schools_per_zip : number of schools in the zip code; scores if ``<= max_schools``.
    price_limit, lot_min, lot15_min, max_rank, max_schools, min_condition, min_grade, min_bedrooms :
        thresholds for the criteria above; the defaults reproduce the original hard-coded values.

    Returns
    -------
    int or float
        Sum of the points of all fulfilled criteria (0 if none is fulfilled).

    Notes
    -----
    A NaN input never fulfils its criterion, because every comparison with NaN is False.
    """
    # (criterion fulfilled?, points awarded) — one entry per wish, in scoring order
    criteria = [
        (waterfront == 1, 1),
        (price <= price_limit, 1),
        (condition >= min_condition, 0.5),
        (grade >= min_grade, 0.5),
        (sqft_lot > lot_min, 0.5),
        (sqft_lot15 > lot15_min, 0.5),
        (density_rank <= max_rank, 1),
        (bedrooms >= min_bedrooms, 1),
        (schools_per_zip <= max_schools, 1),
    ]
    return sum(weight for fulfilled, weight in criteria if fulfilled)

In [None]:
df_Larry["wish_points_rel"] = df_Larry.apply(lambda row: larrys_wishes_rel(row.waterfront, row.price, row.condition, row.grade, row.sqft_lot, row.sqft_lot15, row.density_rank, row.bedrooms, row.schools_per_zip), axis=1)

In [None]:
df_Larry.describe()['wish_points_rel']

I talk to Larry because none of the sold houses meet his needs, not even the relaxed needs. 
Since the price restriction is non negotiable (there just is no more money...) this restriction is kept as is.

We decide to omit the requirements for "neighborhood without kids" and to relax the requirements for the lot size of the house and those of the neighbors since he would still have a sufficiently large lot himself and does not have to let anyone in. Values changed to 30th percentile: sqft_lot 5612.000, sqft_lot15 5625.500

We furthermore decide to look somewhat less central since Larry is not too fond of other people anyways. (maximal population density rank raised to 25)

Grade and condition are lowered to 5 and 2, resp.

We remove the waterfront condition since this is simply too big a restriction.
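The new maximum value for his wish function is, hence, 6 as implemented: the function below no longer scores the waterfront, but it still awards one point for the number of schools per zip code. (If the "neighborhood without kids" criterion were dropped from the function as well, the maximum would be 5.)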

In [None]:
#get value of 30th percentile of lot size
df_Larry[['sqft_lot', 'sqft_lot15']].quantile(.3)

In [None]:
def larrys_wishes_rel2(waterfront, price, condition, grade, sqft_lot, sqft_lot15, density_rank, bedrooms, schools_per_zip,
                       price_limit=450000, lot_min=5612.0, lot15_min=5625.5, max_rank=25, max_schools=4,
                       min_condition=2, min_grade=5, min_bedrooms=2):
    """Score one house against Larry's twice-relaxed wish list.

    The waterfront wish is no longer scored; all other criteria are kept with
    looser default thresholds. Condition/grade and sqft_lot/sqft_lot15 are
    half-point criteria. The maximum score is 6.

    Parameters
    ----------
    waterfront : accepted only to keep the call signature identical to the other
        wish functions; it does not influence the score.
    price : sale price; scores if ``price <= price_limit``.
    condition, grade : score 0.5 each if ``>= min_condition`` / ``>= min_grade``.
    sqft_lot, sqft_lot15 : score 0.5 each if strictly greater than ``lot_min`` / ``lot15_min``.
    density_rank : population density rank of the zip code; scores if ``<= max_rank``.
    bedrooms : scores if ``>= min_bedrooms``.
    schools_per_zip : number of schools in the zip code; scores if ``<= max_schools``.
    price_limit, lot_min, lot15_min, max_rank, max_schools, min_condition, min_grade, min_bedrooms :
        thresholds for the criteria above; the defaults reproduce the original hard-coded values.

    Returns
    -------
    int or float
        Sum of the points of all fulfilled criteria (0 if none is fulfilled).

    Notes
    -----
    A NaN input never fulfils its criterion, because every comparison with NaN is False.
    """
    # (criterion fulfilled?, points awarded) — one entry per scored wish, in scoring order;
    # the waterfront criterion is deliberately absent
    criteria = [
        (price <= price_limit, 1),
        (condition >= min_condition, 0.5),
        (grade >= min_grade, 0.5),
        (sqft_lot > lot_min, 0.5),
        (sqft_lot15 > lot15_min, 0.5),
        (density_rank <= max_rank, 1),
        (bedrooms >= min_bedrooms, 1),
        (schools_per_zip <= max_schools, 1),
    ]
    return sum(weight for fulfilled, weight in criteria if fulfilled)

In [None]:
df_Larry["wish_points_rel2"] = df_Larry.apply(lambda row: larrys_wishes_rel2(row.waterfront, row.price, row.condition, row.grade, row.sqft_lot, row.sqft_lot15, row.density_rank, row.bedrooms, row.schools_per_zip), axis=1)

In [None]:
df_Larry.describe()['wish_points_rel2']

In [None]:
df_Larry.query("wish_points_rel2 == 6").groupby("zipcode").count()
#all 26 houses with the maximal possible number of 6 wish points lie in zip code 98136

In [None]:
df_Larry.wish_points_rel2.hist()

In [None]:
# How likely is the average house put on the market to fulfill Larry's wishes?
# we look at the average points per zip code. 
# The top ten thereof are our recommendation as to where to look for houses.
# Average relaxed wish points per zip code, best zip codes first.
# Only the column that is needed is averaged (instead of averaging every column and
# discarding all but one), and the frame is built in one chain without inplace mutation.
df_Larry_rec = (
    df_Larry.groupby("zipcode")["wish_points_rel2"]
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
# the ten best-scoring zip codes are the recommendation
df_Larry_rec.head(10)

In [None]:
df_Larry.query("wish_points_rel2>=5").value_counts("zipcode")
# cannot base our recommendation on this count since absolute values, strongly influenced by total number of houses in those zip codes

In [None]:
# import plotly.graph_objects as go

# #df = pd.read_csv("data_group_work/airports.csv")

# #namelist = [f'IATA: {df["iata"][x]}<br>Name: {df["name"][x]}<br>State: {df["state"][x]}' for x in range(len(df))] # We need this later for the 'text'-argument in go.Scattergeo() to make the labels look nicer.

# fig = go.Figure(

# go.Scattergeo(
#         locationmode = 'USA-states', 
#         lon = df_housing['long'],
#         lat = df_housing['lat'],
# #        text = namelist,
#         mode = 'markers',
#         marker = dict( # controls the points
#             size = 2,
#             color = 'red',
#             opacity = 1
#         )
#     ))

#fig.add_trace() add county/zip bordrs as trace

# fig.update_layout(
#          title_text = 'Houses sold',
#          showlegend = False,
#          margin={"r":0,"t":50,"l":0,"b":0},
#          geo = dict(
#              scope = 'usa',
#              landcolor = 'rgb(217, 217, 217)'
#          )
#      )

# fig.update_geos(fitbounds='locations')

# fig.show()

In [None]:
df_housing.groupby('zipcode').count().reset_index().sort_values('id', ascending=False)

In [None]:
# Choropleth: number of houses sold per zip code.
# Zip code outlines for Washington state come from the OpenDataDE State-zip-code-GeoJSON
# repository; each feature carries its zip code in properties.ZCTA5CE10.
ZIP_GEOJSON_URL = 'https://raw.githubusercontent.com/OpenDataDE/State-zip-code-GeoJSON/master/wa_washington_zip_codes_geo.min.json'
# a timeout keeps the cell from hanging indefinitely if the host does not respond
with urlopen(ZIP_GEOJSON_URL, timeout=30) as response:
    zipcodes = json.load(response)

# count() returns the non-null entries per column; the 'id' column is used as the number of sales
fig = px.choropleth(df_housing.groupby('zipcode').count().reset_index(),
                    geojson=zipcodes,
                    locations='zipcode',
                    color='id',
                    color_continuous_scale="bluyl",
                    featureidkey="properties.ZCTA5CE10",
                    scope="usa",
                    labels={'id':'Number of Houses Sold'}
                    )

fig.update_layout(
    title = dict(text='Houses Sold', y=0.9, yanchor='top'),
    margin={"r":50,"t":50,"l":50,"b":50}
    )
# zoom the map to the zip codes that actually occur in the data
fig.update_geos(fitbounds='locations')
fig.show()

In [None]:
# Choropleth: mean relaxed wish points (wish_points_rel2) per zip code.
# `zipcodes` (Washington zip code outlines, keyed by properties.ZCTA5CE10) was already
# downloaded by the 'Houses Sold' map cell above and is reused instead of fetched again.
fig = px.choropleth(df_Larry_rec,
                    geojson=zipcodes,
                    locations='zipcode',
                    color='wish_points_rel2',
                    color_continuous_scale="bluyl",
                    # colour scale is clipped to 3.5-5 points
                    range_color=(3.5,5),
                    featureidkey="properties.ZCTA5CE10",
                    scope="usa",
                    labels={'wish_points_rel2':'Wish Points'}
                    )

fig.update_layout(
    title = dict(text='Recommended Zipcodes for Larry', y=0.9, yanchor='top'),
    margin={"r":50,"t":50,"l":50,"b":50}
    )
# zoom the map to the zip codes that actually occur in the data
fig.update_geos(fitbounds='locations')
fig.show()