In [None]:
import locale

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Load the house dataset from a CSV file located in the working directory.
data = pd.read_csv('kc_house_data.csv')
# Display the loaded frame as the cell output.
data

In [None]:
# List the column labels available in the loaded frame.
data.columns

### Are there missing values?

In [None]:
# First .any() flags each column that contains a missing value; the second
# collapses those flags into a single answer for the whole frame.
data.isna().any().any()

In [None]:
# Number of rows whose price is exactly 0.
len(data.loc[data['price'] == 0])

In [None]:
# Keep only the columns that contain at least one 0 value: build the
# per-column flags once, then filter the flag series by itself.
(data == 0).any().loc[lambda has_zero: has_zero]

In [None]:
# Some houses report 0 bedrooms -- count how many.
len(data.loc[data['bedrooms'] == 0])

In [None]:
# Inspect the rows that report 0 bedrooms.
data.loc[data['bedrooms'] == 0]

In [None]:
# Some houses report 0 bathrooms -- count how many.
len(data.loc[data['bathrooms'] == 0])

In [None]:
# Inspect the rows that report 0 bathrooms.
data.loc[data['bathrooms'] == 0]

In [None]:
# Drop every row that reports 0 bedrooms, then renumber the rows from 0.
idx_zero_bedrooms = data.index[(data['bedrooms'] == 0).to_numpy()]
data = data.drop(index=idx_zero_bedrooms).reset_index(drop=True)

In [None]:
# Drop every row that reports 0 bathrooms, then renumber the rows from 0.
idx_zero_bathrooms = data.index[(data['bathrooms'] == 0).to_numpy()]
data = data.drop(index=idx_zero_bathrooms).reset_index(drop=True)

In [None]:
# Show the dtype pandas assigned to each column.
data.dtypes

In [None]:
# bathrooms is of float type rather than an integer count; confirm its dtype
data['bathrooms'].dtype

In [None]:
# Distinct bathroom counts found in the data, in ascending order.
bathrooms_values = np.sort(data['bathrooms'].unique())
bathrooms_values

In the United States, there is no single definition of what counts as a bathroom. This commonly results in discrepancies between the advertised and actual number of baths in real estate listings. Bathrooms are generally categorized as a "master bathroom", containing a shower and a bathtub and adjoining the largest bedroom; a "full bathroom" (or "full bath"), containing four plumbing fixtures: a toilet and sink, and either a bathtub with a shower, or a bathtub and a separate shower stall; a "half bath" (or "powder room"), containing just a toilet and sink; and a "3/4 bath", containing a toilet, sink, and shower, although the terms vary from market to market. In some U.S. markets, a toilet, sink, and shower are considered a "full bath." In addition, the word "bathroom" is sometimes used to describe a room containing a toilet and a basin, and nothing else. These fractional categories are presumably why the `bathrooms` column holds non-integer values.

In [None]:
def unique_values(column: str, frame: 'pd.DataFrame | None' = None) -> np.ndarray:
    """Return the distinct values of one column, sorted in ascending order.

    Parameters
    ----------
    column : str
        Label of the column to inspect.
    frame : pd.DataFrame, optional
        Table to read the column from. When omitted, the notebook-level
        ``data`` frame is used (looked up at call time).

    Returns
    -------
    np.ndarray
        The column's unique values, sorted.
    """
    source = data if frame is None else frame
    values: np.ndarray = source[column].unique()
    # `unique()` hands back a new array, so the in-place sort leaves the frame untouched.
    values.sort()
    return values

In [None]:
# floors are also of float type; list the distinct values that occur
unique_values('floors')

In [None]:
# Distinct values that occur in the waterfront column.
unique_values('waterfront')

In [None]:
# NOTE(review): the meaning of view values of 2 or more is not clear from the
# data alone -- TODO confirm against the dataset's documentation.
unique_values('view')

In [None]:
# Distinct values that occur in the grade column.
unique_values('grade')


### How many houses have 1 bedroom?

In [None]:
# Number of houses with exactly one bedroom.
len(data.loc[data['bedrooms'] == 1])

### How many houses have more than 1 floor?

In [None]:
# Number of houses whose floors value is greater than 1.
len(data.loc[data['floors'] > 1])

### How many houses are in good or bad condition?

In [None]:
# Number of houses for each distinct value of the `condition` column,
# indexed by that value.
condition = data.groupby('condition')['condition'].count()
condition

In [None]:
# Bar chart of the number of houses per `condition` value. The axes are
# labelled so the figure can be read on its own, and the trailing semicolon
# keeps the matplotlib object repr out of the cell output.
fig, ax = plt.subplots()
ax.bar(condition.index, condition.values)
ax.set(xlabel='condition', ylabel='number of houses', title='Houses per condition value');

### How many bedrooms, on average, do the houses have?

In [None]:
# Average only the `bedrooms` column. Calling `data.mean()` on the whole frame
# averages every column and fails with a TypeError in pandas >= 2.0 if any
# column is non-numeric.
round(data['bedrooms'].mean(), 2)

### What's the average house price?

In [None]:
# Average only the `price` column (a whole-frame `data.mean()` fails with a
# TypeError in pandas >= 2.0 if any column is non-numeric), rounded to a
# whole number but kept as a float.
avg_price = round(data['price'].mean(), 0)
avg_price

In [None]:
# Select a US English locale so prices can be formatted as currency.
# Locale names are platform-specific (e.g. 'en_US.utf8' on glibc systems,
# 'en_US.UTF-8' on macOS, 'English_United States.1252' on Windows), so try
# each spelling in turn instead of failing on the first one.
for us_locale in ('en_US.utf8', 'en_US.UTF-8', 'English_United States.1252'):
    try:
        locale.setlocale(locale.LC_ALL, us_locale)
        break
    except locale.Error:
        continue
else:
    # Without a US locale, locale.currency() cannot format the prices below.
    raise locale.Error('no en_US locale is installed; it is needed to format prices as currency')
locale.getlocale()

In [None]:
# Format the average price as a currency string in the current locale,
# with thousands grouping enabled.
locale.currency(avg_price, grouping=True)

### What's the price standard deviation?

In [None]:
# Standard deviation of the `price` column only. A whole-frame `data.std()`
# fails with a TypeError in pandas >= 2.0 if any column is non-numeric.
price_std_deviation = round(data['price'].std(), 2)
locale.currency(price_std_deviation, grouping=True)

In [None]:
# Histogram of all prices. The axes are labelled so the figure can be read on
# its own; the trailing semicolon suppresses the matplotlib repr.
fig, ax = plt.subplots()
ax.hist(data['price'], bins=1000)
ax.set(xlabel='price', ylabel='number of houses', title='Distribution of house prices');

### What are the Q1 and Q3 of the prices?

In [None]:
# First and third quartiles of the price, computed in a single quantile call
# and unpacked in order (0.25 -> q1, 0.75 -> q3).
q1, q3 = data['price'].quantile([0.25, 0.75]).to_numpy()

In [None]:
# Q1: the 0.25 quantile of the price, formatted as currency with thousands grouping
locale.currency(q1, grouping=True)

In [None]:
# Q3: the 0.75 quantile of the price, formatted as currency with thousands grouping
locale.currency(q3, grouping=True)

### Are there pricing outliers?


#### Outliers := values below $Q_1 - 1.5 \times IQR$ or above $Q_3 + 1.5 \times IQR$, where $IQR = Q_3 - Q_1$

In [None]:
# Interquartile range of the price: distance between the third and first quartiles.
iqr = q3 - q1
iqr

In [None]:
# Lower outlier fence (Q1 - 1.5 * IQR). Despite its name, this is a threshold,
# not the smallest price in the data.
minimum = q1 - 1.5*iqr
minimum

In [None]:
# Upper outlier fence (Q3 + 1.5 * IQR). Despite its name, this is a threshold,
# not the largest price in the data.
maximum = q3 + 1.5*iqr
maximum

In [None]:
# Count the prices under the lower fence; none are expected,
# since prices can't be negative.
len(data.loc[data['price'] < minimum])

In [None]:
# Count the upper outliers: prices above the upper fence `maximum`
# (Q3 + 1.5 * IQR). Comparing against `minimum` here would count almost
# every house instead.
data[data['price'] > maximum].shape[0]

In [None]:
# histogram of prices without outliers: keep only the prices between the
# lower and upper fences (both inclusive), i.e. everything that is neither
# below `minimum` nor above `maximum`
fig, ax = plt.subplots()
ax.hist(data.loc[data['price'].between(minimum, maximum), 'price'], bins=100)
ax.set(xlabel='price', ylabel='number of houses', title='House prices without outliers');