## Library imports
***

In [23]:
import pandas as pd
import numpy as np

# Render every float in DataFrame/Series output with exactly two decimals.
pd.set_option('display.float_format', '{:.2f}'.format)

## Data Import and Setup
***

In [3]:
# Read the raw house-sales CSV once and leave `orig_df` untouched as a reference.
orig_df = pd.read_csv('data/kc_house_data.csv')

# All later transformations act on an independent copy
# (DataFrame.copy() is a deep copy by default).
house_df = orig_df.copy()

In [4]:
# Preview the first rows to sanity-check that the file loaded as expected.
house_df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.51,-122.26,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.72,-122.32,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.74,-122.23,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.52,-122.39,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.62,-122.05,1800,7503


In [5]:
# Column dtypes and non-null counts; `date` is still a plain object column here.
house_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21613 non-null  int64  
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21613 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long  

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
house_df.describe()

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
count,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0
mean,4580301520.86,540088.14,3.37,2.11,2079.9,15106.97,1.49,0.01,0.23,3.41,7.66,1788.39,291.51,1971.01,84.4,98077.94,47.56,-122.21,1986.55,12768.46
std,2876565571.31,367127.2,0.93,0.77,918.44,41420.51,0.54,0.09,0.77,0.65,1.18,828.09,442.58,29.37,401.68,53.51,0.14,0.14,685.39,27304.18
min,1000102.0,75000.0,0.0,0.0,290.0,520.0,1.0,0.0,0.0,1.0,1.0,290.0,0.0,1900.0,0.0,98001.0,47.16,-122.52,399.0,651.0
25%,2123049194.0,321950.0,3.0,1.75,1427.0,5040.0,1.0,0.0,0.0,3.0,7.0,1190.0,0.0,1951.0,0.0,98033.0,47.47,-122.33,1490.0,5100.0
50%,3904930410.0,450000.0,3.0,2.25,1910.0,7618.0,1.5,0.0,0.0,3.0,7.0,1560.0,0.0,1975.0,0.0,98065.0,47.57,-122.23,1840.0,7620.0
75%,7308900445.0,645000.0,4.0,2.5,2550.0,10688.0,2.0,0.0,0.0,4.0,8.0,2210.0,560.0,1997.0,0.0,98118.0,47.68,-122.12,2360.0,10083.0
max,9900000190.0,7700000.0,33.0,8.0,13540.0,1651359.0,3.5,1.0,4.0,5.0,13.0,9410.0,4820.0,2015.0,2015.0,98199.0,47.78,-121.31,6210.0,871200.0


#### Convert the date column to datetime

In [7]:
# Parse the compact 'YYYYMMDDTHHMMSS' strings (e.g. '20141013T000000') into datetimes.
# Bracket assignment is used instead of attribute assignment, which only works when the
# column already exists and can silently create a plain attribute otherwise.
# The explicit format makes parsing strict rather than inferred.
house_df['date'] = pd.to_datetime(house_df['date'], format='%Y%m%dT%H%M%S')

In [8]:
# Spot-check the conversion: the row labelled 0 should now hold a Timestamp.
house_df.loc[0, 'date']

Timestamp('2014-10-13 00:00:00')

## Data Analysis
***

Which houses should be bought and for what price?

In [9]:
# Median sale price per zipcode, as a two-column frame: zipcode, median_price.
zipcode_median = (
    house_df
    .groupby('zipcode', as_index=False)['price']
    .median()
    .rename(columns={'price': 'median_price'})
)

In [10]:
# Attach each zipcode's median price to every sale in that zipcode.
# Any `median_price` column left over from an earlier run is dropped first, so
# re-running this cell does not produce `median_price_x` / `median_price_y`.
# `validate` asserts that `zipcode_median` has one row per zipcode, so the merge
# cannot duplicate sales rows.
house_df = pd.merge(
    house_df.drop(columns='median_price', errors='ignore'),
    zipcode_median,
    on='zipcode',
    how='inner',
    validate='many_to_one',
)

In [34]:
# Display the merged frame (pandas truncates the rendering to the first and last rows).
# NOTE(review): the saved output of this cell already contains a `status` column, which is
# only created further down in the notebook, so this cell was last executed out of order.
# Re-run with Restart Kernel -> Run All to refresh the output.
house_df

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,median_price,status
0,7129300520,2014-10-13,221900.00,3,1.00,1180,5650,1.00,0,0,...,0,1955,0,98178,47.51,-122.26,1340,5650,278277.00,buy
1,4060000240,2014-06-23,205425.00,2,1.00,880,6780,1.00,0,0,...,0,1945,0,98178,47.50,-122.25,1190,6780,278277.00,buy
2,4058801670,2014-07-17,445000.00,3,2.25,2100,8201,1.00,0,2,...,480,1967,0,98178,47.51,-122.24,2660,8712,278277.00,do not buy
3,2976800796,2014-09-25,236000.00,3,1.00,1300,5898,1.00,0,0,...,0,1961,0,98178,47.51,-122.25,1320,7619,278277.00,buy
4,6874200960,2015-02-27,170000.00,2,1.00,860,5265,1.00,0,0,...,0,1931,0,98178,47.50,-122.27,1650,8775,278277.00,buy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,2525049086,2014-10-03,2720000.00,4,3.25,3990,18115,2.00,0,0,...,0,1989,0,98039,47.62,-122.23,3450,16087,1892500.00,do not buy
21609,2525049113,2014-07-25,1950000.00,4,3.50,4065,18713,2.00,0,0,...,0,1987,0,98039,47.62,-122.24,3070,18713,1892500.00,do not buy
21610,3262300485,2015-04-21,2250000.00,5,5.25,3410,8118,2.00,0,0,...,0,2006,0,98039,47.63,-122.24,3410,16236,1892500.00,do not buy
21611,6447300365,2014-11-13,2900000.00,5,4.00,5190,14600,2.00,0,1,...,0,2013,0,98039,47.61,-122.22,3840,19250,1892500.00,do not buy


In [12]:
# Number of sales per `condition` value, most frequent first.
house_df['condition'].value_counts()

3    14031
4     5679
5     1701
2      172
1       30
Name: condition, dtype: int64

#### Check houses sold two or more times

In [39]:
# Boolean mask: True for every row whose `id` appears at least twice in the frame.
# `keep=False` flags all occurrences of a repeated id, not just the later ones.
# This gives the same mask as comparing each id against the whole column, but in a
# single vectorised pass instead of one full-column scan per row.
duplicated_ids = house_df['id'].duplicated(keep=False)

Unnamed: 0,id,date,price,yr_renovated,zipcode,median_price
7,3969300030,2014-12-29,239900.00,0,98178,278277.00
191,7961500010,2015-03-04,520000.00,0,98178,278277.00
228,1423049019,2015-03-31,220000.00,0,98178,278277.00
237,7657000540,2015-03-04,260000.00,0,98178,278277.00
273,8820903380,2015-01-02,730000.00,1990,98125,425000.00
...,...,...,...,...,...,...
21446,2892700041,2015-01-28,238000.00,0,98055,294950.00
21471,3185600040,2014-12-24,310000.00,0,98055,294950.00
21493,4202400078,2015-04-28,335000.00,0,98055,294950.00
21511,7200179,2015-04-24,175000.00,0,98055,294950.00


In [40]:
# One row per house that was sold more than once.
# `keep='last'` retains the last occurrence in the frame's current row order, which is
# not necessarily the most recent sale date, because the frame is not sorted by `date`.
(
    house_df
    .loc[duplicated_ids, ['id', 'date', 'price', 'yr_renovated', 'zipcode', 'median_price']]
    .drop_duplicates(subset='id', keep='last')
)

Unnamed: 0,id,date,price,yr_renovated,zipcode,median_price
7,3969300030,2014-12-29,239900.00,0,98178,278277.00
191,7961500010,2015-03-04,520000.00,0,98178,278277.00
228,1423049019,2015-03-31,220000.00,0,98178,278277.00
237,7657000540,2015-03-04,260000.00,0,98178,278277.00
273,8820903380,2015-01-02,730000.00,1990,98125,425000.00
...,...,...,...,...,...,...
21446,2892700041,2015-01-28,238000.00,0,98055,294950.00
21471,3185600040,2014-12-24,310000.00,0,98055,294950.00
21493,4202400078,2015-04-28,335000.00,0,98055,294950.00
21511,7200179,2015-04-24,175000.00,0,98055,294950.00


## Business Hypothesis
***

**1.** Houses priced below the median for their zipcode and in good condition can be sold for a higher price, so they are good to buy.
    
    
**2.** Houses priced below the median for their zipcode but in bad condition cannot be sold for a higher price, so they are not good to buy.

**3.** Houses priced above the median, regardless of condition, are not good to buy for resale at a profit.


In [13]:
# Label each sale as a purchase candidate:
#   'buy'        -> price strictly below the zipcode median AND condition >= 3
#   'do not buy' -> everything else (including price equal to the median)
# Columns are referenced by name and compared vectorially. The earlier row-wise version
# indexed each row positionally (x[0], x[1], x[2]), which is deprecated on label-indexed
# Series in recent pandas and is much slower.
below_median = house_df['price'] < house_df['median_price']
good_condition = house_df['condition'] >= 3
house_df['status'] = np.where(below_median & good_condition, 'buy', 'do not buy')

In [14]:
# Show the recommendation next to the columns that drive it.
house_df[[
    'id',
    'zipcode',
    'price',
    'median_price',
    'condition',
    'status',
]]

Unnamed: 0,id,zipcode,price,median_price,condition,status
0,7129300520,98178,221900.00,278277.00,3,buy
1,4060000240,98178,205425.00,278277.00,4,buy
2,4058801670,98178,445000.00,278277.00,3,do not buy
3,2976800796,98178,236000.00,278277.00,3,buy
4,6874200960,98178,170000.00,278277.00,3,buy
...,...,...,...,...,...,...
21608,2525049086,98039,2720000.00,1892500.00,4,do not buy
21609,2525049113,98039,1950000.00,1892500.00,4,do not buy
21610,3262300485,98039,2250000.00,1892500.00,3,do not buy
21611,6447300365,98039,2900000.00,1892500.00,3,do not buy
