# Library imports

** **

In [3]:
import pandas as pd
import numpy as np
import warnings
import seaborn as sns

from matplotlib import pyplot as plt
from IPython.core.display import display, HTML

In [9]:
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [20, 12]
    plt.rcParams['font.size'] = 24
    
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False )
    pd.set_option('display.float_format', lambda x: '%.2f' % x)
    
    sns.set()
    
    #warnings.filterwarnings("ignore")

In [10]:
# Apply the notebook-wide plotting and pandas display settings.
jupyter_settings()

Populating the interactive namespace from numpy and matplotlib


** **

# Data Import and Setup

** **

### Import data
** **

In [12]:
# orig_df keeps the CSV exactly as loaded; the analysis works on house_df,
# an independent copy, so later changes never touch the raw data.
orig_df = pd.read_csv('data/kc_house_data.csv')
house_df = orig_df.copy()  # DataFrame.copy() is a deep copy by default

### General Info

** **

In [21]:
# Preview the first rows to check that the file was parsed as expected.
house_df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.51,-122.26,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.72,-122.32,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.74,-122.23,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.52,-122.39,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.62,-122.05,1800,7503


In [16]:
# Column names, non-null counts and dtypes of the loaded frame.
house_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21613 non-null  int64  
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21613 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long  

In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
house_df.describe()

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
count,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0
mean,4580301520.86,540088.14,3.37,2.11,2079.9,15106.97,1.49,0.01,0.23,3.41,7.66,1788.39,291.51,1971.01,84.4,98077.94,47.56,-122.21,1986.55,12768.46
std,2876565571.31,367127.2,0.93,0.77,918.44,41420.51,0.54,0.09,0.77,0.65,1.18,828.09,442.58,29.37,401.68,53.51,0.14,0.14,685.39,27304.18
min,1000102.0,75000.0,0.0,0.0,290.0,520.0,1.0,0.0,0.0,1.0,1.0,290.0,0.0,1900.0,0.0,98001.0,47.16,-122.52,399.0,651.0
25%,2123049194.0,321950.0,3.0,1.75,1427.0,5040.0,1.0,0.0,0.0,3.0,7.0,1190.0,0.0,1951.0,0.0,98033.0,47.47,-122.33,1490.0,5100.0
50%,3904930410.0,450000.0,3.0,2.25,1910.0,7618.0,1.5,0.0,0.0,3.0,7.0,1560.0,0.0,1975.0,0.0,98065.0,47.57,-122.23,1840.0,7620.0
75%,7308900445.0,645000.0,4.0,2.5,2550.0,10688.0,2.0,0.0,0.0,4.0,8.0,2210.0,560.0,1997.0,0.0,98118.0,47.68,-122.12,2360.0,10083.0
max,9900000190.0,7700000.0,33.0,8.0,13540.0,1651359.0,3.5,1.0,4.0,5.0,13.0,9410.0,4820.0,2015.0,2015.0,98199.0,47.78,-121.31,6210.0,871200.0


** **

# Exploratory Data Analysis
** **

### Data Dimensionality
** **

In [31]:
# Report the dataset dimensions: shape is (rows, columns).
n_rows, n_cols = house_df.shape
print(f'Number of Rows:{n_rows}')
print(f'Number of Columns {n_cols}')

Number of Rows:21613
Number of Columns 21


### Attributes Name and Description
** **

**id** - a unique identifier for each house

**date** - the date on which the house was sold

**price** - the price the house was sold for

**bedrooms** - number of bedrooms in the house

**bathrooms** - number of bathrooms in the house, where a fraction like 0.25 represents a bathroom sink, shower or toilet

**sqft_living** - the interior living space of the house in square feet

**sqft_lot** - the size of the land in square feet

**floors** - number of floors in the house

**waterfront** - whether the house has a waterfront view (1) or not (0)

**view** - a rating from 0 to 4 of how good the view from the house is

**condition** - a rating from 1 to 5 of the house's state of preservation

**grade** - a rank from 1 to 13, which ranks the construction quality

**sqft_above** - the size of the house above the ground level in square feet

**sqft_basement** - the size of the house below the ground level in square feet

**yr_built** - the year the house was initially built

**yr_renovated** - the year of the house's last renovation

**zipcode** - what zipcode area the house is in

**lat** - Latitude

**long** - Longitude

**sqft_living15** - The square footage of interior housing living space for the nearest 15 neighbors (possibly)

**sqft_lot15** - The square footage of the land lots of the nearest 15 neighbors

** **

### Attributes Type and Data type conversions
** **

In [27]:
# Inspect the dtype of every column to decide which ones need conversion.
house_df.dtypes

id                 int64
date              object
price            float64
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode            int64
lat              float64
long             float64
sqft_living15      int64
sqft_lot15         int64
dtype: object

#### Convert date to datetime

In [28]:
# Parse the date strings into pandas datetimes; bracket access makes it
# explicit that a column (not an attribute) is being replaced.
house_df['date'] = pd.to_datetime(house_df['date'])

In [29]:
# Spot-check the converted value in the row labelled 0.
house_df.loc[0, 'date']

Timestamp('2014-10-13 00:00:00')

### Attributes Descriptive Statistics
** **

In [33]:
# Keep only the int64 / float64 columns for the descriptive statistics.
num_attributes = house_df.select_dtypes(
    include=['int64', 'float64'],
)

In [74]:
# One row per numeric attribute, one column per statistic.
# describe() already computes every statistic needed here (its 50% percentile
# is the median), so the table is built from it directly. This avoids leaving
# notebook-level variables called `mean`, `std`, `median`, `maximum` and
# `minimum`, which would shadow the numpy functions of the same name.
descriptive_statistics = (
    num_attributes.describe()
    .transpose()
    .loc[:, ['mean', 'std', 'min', '25%', '50%', '75%', 'max']]
    .rename(columns={
        'min': 'minimum',
        '25%': 'q1',
        '50%': 'median',
        '75%': 'q3',
        'max': 'maximum',
    })
)
descriptive_statistics

Unnamed: 0,mean,std,minimum,q1,median,q3,maximum
id,4580301520.86,2876565571.31,1000102.0,2123049194.0,3904930410.0,7308900445.0,9900000190.0
price,540088.14,367127.2,75000.0,321950.0,450000.0,645000.0,7700000.0
bedrooms,3.37,0.93,0.0,3.0,3.0,4.0,33.0
bathrooms,2.11,0.77,0.0,1.75,2.25,2.5,8.0
sqft_living,2079.9,918.44,290.0,1427.0,1910.0,2550.0,13540.0
sqft_lot,15106.97,41420.51,520.0,5040.0,7618.0,10688.0,1651359.0
floors,1.49,0.54,1.0,1.0,1.5,2.0,3.5
waterfront,0.01,0.09,0.0,0.0,0.0,0.0,1.0
view,0.23,0.77,0.0,0.0,0.0,0.0,4.0
condition,3.41,0.65,1.0,3.0,3.0,4.0,5.0


Which houses should be bought and for what price?

In [None]:
# Median selling price per zipcode, as a two-column frame
# (zipcode, median_price) ready to be merged back onto the houses.
zipcode_median = (
    house_df
    .groupby('zipcode', as_index=False)['price']
    .median()
    .rename(columns={'price': 'median_price'})
)

In [None]:
# Attach each house's zipcode median price.
# - Dropping any existing 'median_price' first keeps the cell idempotent:
#   re-running it no longer produces median_price_x / median_price_y columns.
# - A left merge keeps every house in its original row order; every zipcode
#   in house_df is present in zipcode_median because the latter is derived
#   from it, so no row is lost compared with an inner merge.
# - validate='many_to_one' fails loudly if zipcode_median ever contains a
#   repeated zipcode, which would otherwise silently duplicate houses.
house_df = pd.merge(
    house_df.drop(columns=['median_price'], errors='ignore'),
    zipcode_median,
    on='zipcode',
    how='left',
    validate='many_to_one',
)

In [None]:
# Preview only the first rows: display.max_rows is set to None in this
# notebook, so a bare `house_df` would render every row of the frame.
house_df.head()

In [None]:
# Number of houses in each condition rating.
house_df['condition'].value_counts()

#### Check houses sold two or more times

In [None]:
# Boolean mask that is True for every row whose house id appears two or more
# times in the data. keep=False flags all occurrences, not just the repeats,
# and runs in a single pass instead of comparing each id against the whole
# column.
duplicated_ids = house_df['id'].duplicated(keep=False)

In [None]:
# For houses that appear more than once, show a single row per id:
# keep='last' retains the last occurrence in the frame's current row order.
house_df.loc[
    duplicated_ids,
    ['id', 'date', 'price', 'yr_renovated', 'zipcode', 'median_price'],
].drop_duplicates(subset=['id'], keep='last')

## Business Hypothesis
***

**1.** Houses priced below the median for their zipcode and in good condition can be resold at a higher price, so they are good to buy.
    
    
**2.** Houses priced below the median for their zipcode but in bad condition cannot be resold at a higher price, so they are not good to buy.

**3.** Houses priced above the median for their zipcode, regardless of condition, are not good to buy for a profit.


In [None]:
# Label each house according to the hypotheses above: 'buy' when it is priced
# below its zipcode median AND its condition is at least MIN_GOOD_CONDITION,
# otherwise 'do not buy'.
# Vectorized column comparisons replace the row-wise apply, which relied on
# positional x[0] / x[1] / x[2] lookups on a label-indexed row (deprecated in
# recent pandas) and was far slower.
MIN_GOOD_CONDITION = 3
is_below_median = house_df['price'] < house_df['median_price']
is_good_condition = house_df['condition'] >= MIN_GOOD_CONDITION
house_df['status'] = np.where(is_below_median & is_good_condition, 'buy', 'do not buy')

In [None]:
# Preview the recommendation next to the values it was derived from.
# .head() avoids rendering every row (display.max_rows is None here).
house_df[['id','zipcode','price','median_price','condition','status']].head()