## Preparing the Data
<span  style="color:purple; font-size:25px">
Loading the Data
</span>

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
%matplotlib inline

import matplotlib.pyplot as plt

In [2]:
building = pd.read_csv('Datasets/building.csv')
building.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3957 entries, 0 to 3956
Data columns (total 27 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   strap                   3957 non-null   object 
 1   bld_num                 3957 non-null   int64  
 2   effective_year_built    3957 non-null   int64  
 3   design_code             3957 non-null   int64  
 4   design                  3957 non-null   object 
 5   quality                 3957 non-null   object 
 6   quality_code            3957 non-null   int64  
 7   bldg_class_code         3957 non-null   int64  
 8   bldg_class              3957 non-null   object 
 9   construction_type_code  3743 non-null   float64
 10  construction_type       3736 non-null   object 
 11  nbr_bed_room            3957 non-null   float64
 12  nbr_full_baths          3957 non-null   float64
 13  nbr_three_qtr_baths     3957 non-null   float64
 14  nbr_half_baths          3957 non-null   

In [3]:
land = pd.read_csv('Datasets/land.csv')
land.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3433 entries, 0 to 3432
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   strap            3433 non-null   object 
 1   land_class       3433 non-null   int64  
 2   land_class_dscr  3433 non-null   object 
 3   sqft             3433 non-null   int64  
 4   acreage          3433 non-null   float64
dtypes: float64(1), int64(2), object(2)
memory usage: 134.2+ KB


In [4]:
property = pd.read_csv('Datasets/property.csv')
property.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3431 entries, 0 to 3430
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   strap           3431 non-null   object 
 1   market_area     3431 non-null   int64  
 2   address         3431 non-null   object 
 3   unincorporated  3431 non-null   bool   
 4   sub_code        3431 non-null   int64  
 5   sub_dscr        3431 non-null   object 
 6   section         3431 non-null   int64  
 7   township        3431 non-null   object 
 8   range           3431 non-null   int64  
 9   mill_levy       3431 non-null   float64
 10  folio           3431 non-null   object 
dtypes: bool(1), float64(1), int64(4), object(5)
memory usage: 271.5+ KB


In [5]:
sales = pd.read_csv('Datasets/sales.csv')
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9598 entries, 0 to 9597
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   strap             9598 non-null   object
 1   transaction_date  9598 non-null   object
 2   sales_cd          9598 non-null   object
 3   sales_cd_dscr     9598 non-null   object
 4   price             9598 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 375.0+ KB


In [6]:
time_trend_adj = pd.read_csv('Datasets/time_trend_adjustments.csv')
time_trend_adj.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       60 non-null     int64  
 1   year             60 non-null     int64  
 2   month            60 non-null     int64  
 3   market_area_101  60 non-null     float64
 4   market_area_102  60 non-null     float64
 5   market_area_103  60 non-null     float64
 6   market_area_104  60 non-null     float64
 7   market_area_105  60 non-null     float64
 8   market_area_106  60 non-null     float64
 9   market_area_107  60 non-null     float64
 10  market_area_108  60 non-null     float64
 11  market_area_109  60 non-null     float64
dtypes: float64(9), int64(3)
memory usage: 5.8 KB


In [7]:
valuations = pd.read_csv('Datasets/valuations.csv')
valuations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3431 entries, 0 to 3430
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   strap                3431 non-null   object 
 1   tax_yr               3431 non-null   int64  
 2   bld_appraised_val    3431 non-null   int64  
 3   land_appraised_val   3431 non-null   int64  
 4   total_appraised_val  3431 non-null   int64  
 5   bld_assessed_val     3429 non-null   float64
 6   land_assessed_val    0 non-null      float64
 7   total_assessed_val   3431 non-null   int64  
dtypes: float64(2), int64(5), object(1)
memory usage: 214.6+ KB


<span  style="color:purple; font-size:25px">
Tidying the Data
</span>

#### Datetimes

In [8]:
# Note that the dates in `time_trend_adj` and `sales` are not datetime data types.  Let's correct that using `pd.to_datetime`
# and assign missing day values to 1.

time_trend_adj['date'] = pd.to_datetime(time_trend_adj[['year', 'month']].assign(day = 1))

In [9]:
# Now remove the redundant columns.
time_trend_adj.drop(columns = ['year', 'month', 'Unnamed: 0'], inplace = True)
time_trend_adj.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   market_area_101  60 non-null     float64       
 1   market_area_102  60 non-null     float64       
 2   market_area_103  60 non-null     float64       
 3   market_area_104  60 non-null     float64       
 4   market_area_105  60 non-null     float64       
 5   market_area_106  60 non-null     float64       
 6   market_area_107  60 non-null     float64       
 7   market_area_108  60 non-null     float64       
 8   market_area_109  60 non-null     float64       
 9   date             60 non-null     datetime64[ns]
dtypes: datetime64[ns](1), float64(9)
memory usage: 4.8 KB


In [10]:
# Now do the same for the `sales` data frame.
sales['date'] = pd.to_datetime(sales['transaction_date'])
sales.drop(columns = 'transaction_date', inplace = True)
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9598 entries, 0 to 9597
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   strap          9598 non-null   object        
 1   sales_cd       9598 non-null   object        
 2   sales_cd_dscr  9598 non-null   object        
 3   price          9598 non-null   int64         
 4   date           9598 non-null   datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 375.0+ KB


#### Preparing the Market Adjustments

In [11]:
# Let's look at the `time_trend_adj` data
time_trend_adj.head()

Unnamed: 0,market_area_101,market_area_102,market_area_103,market_area_104,market_area_105,market_area_106,market_area_107,market_area_108,market_area_109,date
0,1.4591,1.607,1.6384,1.5174,1.5532,1.5728,1.5938,1.2996,1.4132,2017-07-01
1,1.4485,1.5989,1.6288,1.5075,1.5481,1.5654,1.583,1.2996,1.4132,2017-08-01
2,1.4379,1.5908,1.6193,1.4977,1.543,1.5579,1.5723,1.2996,1.4132,2017-09-01
3,1.4273,1.5828,1.6099,1.488,1.5379,1.5505,1.5616,1.2996,1.3913,2017-10-01
4,1.4169,1.5748,1.6005,1.4783,1.5329,1.5431,1.551,1.2996,1.384,2017-11-01


In order to merge this data frame with `sales` to create an adjusted sales price, we need to change this so there are `market_area`
and `multiplier` columns

In [12]:
time_trend_adj_clean = time_trend_adj.melt(id_vars = 'date',
                                           var_name = 'market_area',
                                           value_name = 'multiplier')

time_trend_adj_clean

Unnamed: 0,date,market_area,multiplier
0,2017-07-01,market_area_101,1.4591
1,2017-08-01,market_area_101,1.4485
2,2017-09-01,market_area_101,1.4379
3,2017-10-01,market_area_101,1.4273
4,2017-11-01,market_area_101,1.4169
...,...,...,...
535,2022-02-01,market_area_109,1.0296
536,2022-03-01,market_area_109,1.0147
537,2022-04-01,market_area_109,1.0000
538,2022-05-01,market_area_109,1.0000


In [13]:
# Now use regular expressions to extract the market area number
time_trend_adj_clean['market_area'] = time_trend_adj_clean['market_area'].str.extract(r'(\d+)$').astype(int)
time_trend_adj_clean

Unnamed: 0,date,market_area,multiplier
0,2017-07-01,101,1.4591
1,2017-08-01,101,1.4485
2,2017-09-01,101,1.4379
3,2017-10-01,101,1.4273
4,2017-11-01,101,1.4169
...,...,...,...
535,2022-02-01,109,1.0296
536,2022-03-01,109,1.0147
537,2022-04-01,109,1.0000
538,2022-05-01,109,1.0000


#### Adjusting the Sales Prices

In [14]:
# Merge the `sales` and `property` data frames to prepare for a row-wise calculation
sales_w_market_area = sales.merge(property[['strap', 'market_area']],
                                  on = 'strap',
                                  how = 'left')
sales_w_market_area

Unnamed: 0,strap,sales_cd,sales_cd_dscr,price,date,market_area
0,R0000008,Q,qualified,65000,1978-01-03,102
1,R0000019,U,unqualified,75000,1980-08-13,109
2,R0000019,U,unqualified,110600,1985-04-08,109
3,R0000019,Q,qualified,126400,1992-04-22,109
4,R0000019,Q,qualified,332000,2000-06-02,109
...,...,...,...,...,...,...
9593,R0610553,Q,qualified,2872600,2021-07-06,102
9594,R0610553,Q,qualified,3450000,2023-02-09,102
9595,R0612718,U,unqualified,910000,2021-05-10,109
9596,R0613548,Q,qualified,850000,2020-09-14,102


In [15]:
# After attempting to merge this column with `time_trend_adj_clean` using the `date` (as datetime) and `market_area` columns,
# we ran into issues with many new untidy columns being created.

# Therefore, we will create `year` and `month` columns for each data frame and use them to merge our final two data frames

sales_w_market_area['year'] = sales_w_market_area['date'].dt.year
sales_w_market_area['month'] = sales_w_market_area['date'].dt.month
sales_w_market_area

Unnamed: 0,strap,sales_cd,sales_cd_dscr,price,date,market_area,year,month
0,R0000008,Q,qualified,65000,1978-01-03,102,1978,1
1,R0000019,U,unqualified,75000,1980-08-13,109,1980,8
2,R0000019,U,unqualified,110600,1985-04-08,109,1985,4
3,R0000019,Q,qualified,126400,1992-04-22,109,1992,4
4,R0000019,Q,qualified,332000,2000-06-02,109,2000,6
...,...,...,...,...,...,...,...,...
9593,R0610553,Q,qualified,2872600,2021-07-06,102,2021,7
9594,R0610553,Q,qualified,3450000,2023-02-09,102,2023,2
9595,R0612718,U,unqualified,910000,2021-05-10,109,2021,5
9596,R0613548,Q,qualified,850000,2020-09-14,102,2020,9


In [16]:
# Now do the same for `time_trend_adj_clean`

time_trend_adj_clean['year'] = time_trend_adj_clean['date'].dt.year
time_trend_adj_clean['month'] = time_trend_adj_clean['date'].dt.month
time_trend_adj_clean

Unnamed: 0,date,market_area,multiplier,year,month
0,2017-07-01,101,1.4591,2017,7
1,2017-08-01,101,1.4485,2017,8
2,2017-09-01,101,1.4379,2017,9
3,2017-10-01,101,1.4273,2017,10
4,2017-11-01,101,1.4169,2017,11
...,...,...,...,...,...
535,2022-02-01,109,1.0296,2022,2
536,2022-03-01,109,1.0147,2022,3
537,2022-04-01,109,1.0000,2022,4
538,2022-05-01,109,1.0000,2022,5


In [17]:
time_trend_adj_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 540 entries, 0 to 539
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date         540 non-null    datetime64[ns]
 1   market_area  540 non-null    int32         
 2   multiplier   540 non-null    float64       
 3   year         540 non-null    int64         
 4   month        540 non-null    int64         
dtypes: datetime64[ns](1), float64(1), int32(1), int64(2)
memory usage: 19.1 KB


In [18]:
# Now merge this with the `time_trend_adj_clean` data frame to add in the multiplier
sales_w_mult = sales_w_market_area.merge(time_trend_adj_clean[['year', 'month', 'market_area', 'multiplier']],
                                         on = ['year', 'month', 'market_area'],
                                         how = 'left').drop(columns = ['year', 'month'])
sales_w_mult

Unnamed: 0,strap,sales_cd,sales_cd_dscr,price,date,market_area,multiplier
0,R0000008,Q,qualified,65000,1978-01-03,102,
1,R0000019,U,unqualified,75000,1980-08-13,109,
2,R0000019,U,unqualified,110600,1985-04-08,109,
3,R0000019,Q,qualified,126400,1992-04-22,109,
4,R0000019,Q,qualified,332000,2000-06-02,109,
...,...,...,...,...,...,...,...
9593,R0610553,Q,qualified,2872600,2021-07-06,102,1.2167
9594,R0610553,Q,qualified,3450000,2023-02-09,102,
9595,R0612718,U,unqualified,910000,2021-05-10,109,1.1910
9596,R0613548,Q,qualified,850000,2020-09-14,102,1.3453


Now that we have all the necessary columns in one data frame, we can create a column for the adjusted sales price

In [19]:
sales_w_mult['adjusted_sales_price'] = np.where(sales_w_mult['multiplier'].isnull(), 
                                                sales_w_mult['price'],
                                                sales_w_mult['price'] * sales_w_mult['multiplier']).astype(int)
sales_w_mult

Unnamed: 0,strap,sales_cd,sales_cd_dscr,price,date,market_area,multiplier,adjusted_sales_price
0,R0000008,Q,qualified,65000,1978-01-03,102,,65000
1,R0000019,U,unqualified,75000,1980-08-13,109,,75000
2,R0000019,U,unqualified,110600,1985-04-08,109,,110600
3,R0000019,Q,qualified,126400,1992-04-22,109,,126400
4,R0000019,Q,qualified,332000,2000-06-02,109,,332000
...,...,...,...,...,...,...,...,...
9593,R0610553,Q,qualified,2872600,2021-07-06,102,1.2167,3495092
9594,R0610553,Q,qualified,3450000,2023-02-09,102,,3450000
9595,R0612718,U,unqualified,910000,2021-05-10,109,1.1910,1083810
9596,R0613548,Q,qualified,850000,2020-09-14,102,1.3453,1143505


In order to conduct time series analysis, we should set the index to the date

In [20]:
sales_adj = sales_w_mult.set_index('date')
sales_adj

Unnamed: 0_level_0,strap,sales_cd,sales_cd_dscr,price,market_area,multiplier,adjusted_sales_price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1978-01-03,R0000008,Q,qualified,65000,102,,65000
1980-08-13,R0000019,U,unqualified,75000,109,,75000
1985-04-08,R0000019,U,unqualified,110600,109,,110600
1992-04-22,R0000019,Q,qualified,126400,109,,126400
2000-06-02,R0000019,Q,qualified,332000,109,,332000
...,...,...,...,...,...,...,...
2021-07-06,R0610553,Q,qualified,2872600,102,1.2167,3495092
2023-02-09,R0610553,Q,qualified,3450000,102,,3450000
2021-05-10,R0612718,U,unqualified,910000,109,1.1910,1083810
2020-09-14,R0613548,Q,qualified,850000,102,1.3453,1143505


#### Ensuring the Rest of the Data is Tidy

##### `Valuations`

In [21]:
valuations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3431 entries, 0 to 3430
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   strap                3431 non-null   object 
 1   tax_yr               3431 non-null   int64  
 2   bld_appraised_val    3431 non-null   int64  
 3   land_appraised_val   3431 non-null   int64  
 4   total_appraised_val  3431 non-null   int64  
 5   bld_assessed_val     3429 non-null   float64
 6   land_assessed_val    0 non-null      float64
 7   total_assessed_val   3431 non-null   int64  
dtypes: float64(2), int64(5), object(1)
memory usage: 214.6+ KB


`land_assessed_val` has no non-null values, but we know that `total_assessed_val` is the sum of it and `bld_assessed_val`, so we can fill in the NA values with their proper amounts.  There are two buildings that have a non-null `total_assessed_val`, but a null `bld_assessed_val`.  Since we still have the total, we will still keep these records.

In [22]:
valuations['land_assessed_val'] = valuations['total_assessed_val'] - valuations['bld_assessed_val']
valuations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3431 entries, 0 to 3430
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   strap                3431 non-null   object 
 1   tax_yr               3431 non-null   int64  
 2   bld_appraised_val    3431 non-null   int64  
 3   land_appraised_val   3431 non-null   int64  
 4   total_appraised_val  3431 non-null   int64  
 5   bld_assessed_val     3429 non-null   float64
 6   land_assessed_val    3429 non-null   float64
 7   total_assessed_val   3431 non-null   int64  
dtypes: float64(2), int64(5), object(1)
memory usage: 214.6+ KB


In [23]:
# Our code appears to have worked, but let's look at a few examples to be sure
valuations.sample(5)

Unnamed: 0,strap,tax_yr,bld_appraised_val,land_appraised_val,total_appraised_val,bld_assessed_val,land_assessed_val,total_assessed_val
827,R0002794,2024,539000,756700,1295700,36113.0,47014.0,83127
2301,R0007454,2024,1077900,1159800,2237700,72219.0,74022.0,146241
393,R0001238,2024,106570,959130,1065700,7140.0,60577.0,67717
1952,R0006366,2024,155200,939100,1094300,10398.0,59235.0,69633
2137,R0006937,2024,1319600,1071200,2390800,88413.0,68086.0,156499


##### `Property`

In [24]:
property.sample(10)

Unnamed: 0,strap,market_area,address,unincorporated,sub_code,sub_dscr,section,township,range,mill_levy,folio
2936,R0035022,107,"200 GREEN ROCK DR, BOULDER CO",False,4113,KNOLLWOOD 1 - BOV,25,1N,71,104.359,146125308006
459,R0001478,102,"905 LINCOLN PL, BOULDER CO",False,8343,UNIVERSITY PLACE - BO,31,1N,70,86.359,146331311011
187,R0000594,102,"931 LINCOLN PL, BOULDER CO",False,8343,UNIVERSITY PLACE - BO,31,1N,70,86.359,146331311013
448,R0001442,105,"632 UNIVERSITY AVE, BOULDER CO",False,9921,"TR, NBR 158, 160, 162",36,1N,71,86.359,146136100041
2326,R0007513,105,"1004 PLEASANT ST, BOULDER CO",False,1187,CAPITOL HILL - BO,31,1N,70,86.359,146331237001
3318,R0120855,103,"888 7TH ST, BOULDER CO",False,6701,PRENTUPS CORNER - BO,36,1N,71,86.359,146136443002
287,R0000889,102,"770 12TH ST, BOULDER CO",False,8343,UNIVERSITY PLACE - BO,31,1N,70,86.359,146331331002
264,R0000835,101,"2205 WALNUT ST, BOULDER CO",False,5308,MINI-PARK PLACE SUB - BO,30,1N,70,86.359,146330456001
2243,R0007269,103,"1036 8TH ST, BOULDER CO",False,6981,ROSE HILL - BO,36,1N,71,86.359,146136416012
2497,R0008068,102,"811 16TH ST, BOULDER CO",False,8343,UNIVERSITY PLACE - BO,31,1N,70,86.359,146331319013


The `property` data set appears to be tidy

##### `Land`

In [25]:
land.sample(10)

Unnamed: 0,strap,land_class,land_class_dscr,sqft,acreage
1196,R0003962,1112,SINGLE FAM.RES.-LAND,9892,0.227
1193,R0003937,1115,DUP/TRIPLEXES-LAND,5705,0.131
543,R0001819,1112,SINGLE FAM.RES.-LAND,6315,0.145
157,R0000510,1112,SINGLE FAM.RES.-LAND,10207,0.234
860,R0002896,1112,SINGLE FAM.RES.-LAND,2240,0.051
1153,R0003815,1112,SINGLE FAM.RES.-LAND,7280,0.167
642,R0002114,1115,DUP/TRIPLEXES-LAND,6831,0.157
1296,R0004268,1112,SINGLE FAM.RES.-LAND,7732,0.178
2534,R0008166,1112,SINGLE FAM.RES.-LAND,6323,0.145
3121,R0085109,1112,SINGLE FAM.RES.-LAND,9107,0.209


In [26]:
land['land_class_dscr'].value_counts()

SINGLE FAM.RES.-LAND    2927
DUP/TRIPLEXES-LAND       503
MANUF HOME PARK LAND       2
VACANT RES LOTS            1
Name: land_class_dscr, dtype: int64

The `land` data set appears to be tidy

##### `Building`

In [27]:
building.sample(10)

Unnamed: 0,strap,bld_num,effective_year_built,design_code,design,quality,quality_code,bldg_class_code,bldg_class,construction_type_code,...,mainfloor_sqft,bsmt_sqft,bsmt_type_code,bsmt_type,car_storage_type_code,car_storage_type,ext_wall_code,ext_wall,ac,heating
1238,R0003509,1,1982,10,1 Story - Ranch,GOOD,40,1212,SINGLE FAM RES IMPROVEMENTS,310.0,...,1716.0,1716.0,BWU,WALK-OUT BASEMENT UNFINISHED AREA,GRC,CARPORT AREA,10.0,Frame Wood/Shake,False,True
356,R0000909,2,2006,456,TOOL SHED,AVERAGE,30,1212,SINGLE FAM RES IMPROVEMENTS,,...,,,,,,,,,False,False
1607,R0004508,1,2002,10,1 Story - Ranch,AVERAGE ++,32,1212,SINGLE FAM RES IMPROVEMENTS,320.0,...,1442.0,,,,GRA,ATTACHED GARAGE AREA,30.0,Frame Stucco,False,True
2488,R0006887,1,2000,10,1 Story - Ranch,VERY GOOD,50,1212,SINGLE FAM RES IMPROVEMENTS,320.0,...,2959.0,1372.0,BSU,SUBTERRANEAN BASEMENT UNFINISHED AREA,GRB,BASEMENT GARAGE AREA,70.0,Brick on Block,False,True
880,R0002502,1,1985,10,1 Story - Ranch,AVERAGE,30,1212,SINGLE FAM RES IMPROVEMENTS,320.0,...,1328.0,,,,GRD,DETACHED GARAGE,70.0,Brick on Block,False,True
2208,R0006156,1,1990,20,2-3 Story,GOOD +,41,1212,SINGLE FAM RES IMPROVEMENTS,,...,860.0,956.0,BWF,WALK-OUT BASEMENT FINISHED AREA,GRA,ATTACHED GARAGE AREA,10.0,Frame Wood/Shake,False,True
1223,R0003475,1,2000,20,2-3 Story,GOOD,40,1212,SINGLE FAM RES IMPROVEMENTS,320.0,...,1085.0,558.0,BSF,SUBTERRANEAN BASEMENT FINISHED AREA,,,70.0,Brick on Block,False,True
1395,R0003909,1,1975,20,2-3 Story,AVERAGE +,31,1212,SINGLE FAM RES IMPROVEMENTS,320.0,...,1252.0,272.0,BSF,SUBTERRANEAN BASEMENT FINISHED AREA,,,40.0,Brick Veneer,True,True
1416,R0003983,1,1994,10,1 Story - Ranch,VERY GOOD,50,1212,SINGLE FAM RES IMPROVEMENTS,320.0,...,1836.0,1033.0,BWF,WALK-OUT BASEMENT FINISHED AREA,GRA,ATTACHED GARAGE AREA,100.0,Moss Rock/Flagstone,False,True
956,R0002725,2,1986,456,TOOL SHED,LOW,10,1212,SINGLE FAM RES IMPROVEMENTS,,...,,,,,,,,,False,False


Some entries are capitalized title-style and some are all uppercase. For consistancy and regex simplicity, we will make them all uppercase

In [28]:
building = building.applymap(lambda x: x.upper() if isinstance(x, str) else x)
building.sample(5)

Unnamed: 0,strap,bld_num,effective_year_built,design_code,design,quality,quality_code,bldg_class_code,bldg_class,construction_type_code,...,mainfloor_sqft,bsmt_sqft,bsmt_type_code,bsmt_type,car_storage_type_code,car_storage_type,ext_wall_code,ext_wall,ac,heating
1912,R0005311,1,1990,20,2-3 STORY,AVERAGE ++,32,1212,SINGLE FAM RES IMPROVEMENTS,320.0,...,837.0,,,,GRD,DETACHED GARAGE,70.0,BRICK ON BLOCK,False,True
2242,R0006235,1,1970,20,2-3 STORY,AVERAGE,30,1212,SINGLE FAM RES IMPROVEMENTS,310.0,...,897.0,,,,,,10.0,FRAME WOOD/SHAKE,False,True
2363,R0006569,1,1970,20,2-3 STORY,AVERAGE ++,32,1212,SINGLE FAM RES IMPROVEMENTS,310.0,...,960.0,512.0,BSU,SUBTERRANEAN BASEMENT UNFINISHED AREA,GRA,ATTACHED GARAGE AREA,10.0,FRAME WOOD/SHAKE,False,True
154,R0000434,2,1996,456,TOOL SHED,AVERAGE,30,1215,DUP/TRIPLEX IMPROVEMENTS,310.0,...,,,,,,,,,False,False
586,R0001569,1,1970,10,1 STORY - RANCH,AVERAGE,30,1212,SINGLE FAM RES IMPROVEMENTS,320.0,...,972.0,864.0,BSF,SUBTERRANEAN BASEMENT FINISHED AREA,,,100.0,MOSS ROCK/FLAGSTONE,False,True


<span  style="color:purple; font-size:25px">
Exploring the Data
</span>

#### Adding New Features to the Data

##### Finished Basements

One of the central features that the legislators wanted us to look at was whether or not homes have finished basements, and how i could affect home value.

In [29]:
# Let's examine the different values of basement types that have been recorded
building['bsmt_type'].value_counts()

SUBTERRANEAN BASEMENT FINISHED AREA           965
SUBTERRANEAN BASEMENT UNFINISHED AREA         538
WALK-OUT BASEMENT FINISHED AREA               440
GARDEN BASEMENT FINISHED AREA                 184
LOWER LVL GARDEN FINISHED (BI-SPLIT LVL)       54
WALK-OUT BASEMENT UNFINISHED AREA              31
LOWER LVL WALKOUT FINISHED (BI-SPLIT LVL)      15
GARDEN BASEMENT UNFINISHED AREA                12
LOWER LVL GARDEN UNFINISHED (BI-SPLIT LVL)      2
Name: bsmt_type, dtype: int64

In [30]:
building['bsmt_type_code'].value_counts()

BSF    965
BSU    538
BWF    440
BGF    184
LGF     54
BWU     31
LWF     15
BGU     12
LGU      2
Name: bsmt_type_code, dtype: int64

Since basements are explicitly labeled as "finished" or "unfinished", and their codes end in "F" or "U" accordingly, we can create a boolean column to quickly show whether each building has a finished basement.

In [31]:
building['bsmt_finished'] = np.where(building['bsmt_type_code'].str.contains(r'U$', na = False) | building['bsmt_type_code'].isna(),
                                     False,
                                     True)

In [32]:
# Verify that it this column was properly created by running this code multiple times and verifying
building[['bsmt_type_code', 'bsmt_type', 'bsmt_finished']].sample(10)

Unnamed: 0,bsmt_type_code,bsmt_type,bsmt_finished
688,BSF,SUBTERRANEAN BASEMENT FINISHED AREA,True
3729,,,False
246,,,False
1922,BSU,SUBTERRANEAN BASEMENT UNFINISHED AREA,False
845,,,False
628,,,False
2857,BSF,SUBTERRANEAN BASEMENT FINISHED AREA,True
3426,,,False
1891,BSU,SUBTERRANEAN BASEMENT UNFINISHED AREA,False
1604,,,False


##### Number of Houses on the Property

Another factor in determining the value of a home is how many buildings are on the property.  We will create a column to record this data.

In [57]:
# How many occurences are there of multiple buildings on a property? What is the proportion?
print(building['bld_num'].value_counts())
print(building['bld_num'].value_counts().iloc[1:4].sum())
building['bld_num'].value_counts().iloc[1:4].sum() / building['bld_num'].value_counts().sum()

1    3429
2     496
3      30
4       2
Name: bld_num, dtype: int64
528


0.133434420015163

Approximately 13.3% (528 properties) of the properties have multiple buildings, enough that to analyze this further.

In [62]:
# Create the `num_buildings` column and populate it with the count of each property (strap)
building['num_buildings'] = building.groupby('strap')['strap'].transform('count')
building.sample(10)
building['num_buildings'].value_counts()

1    2933
2     932
3      84
4       8
Name: num_buildings, dtype: int64

It should be noted that while we normally would not put aggregate values in each occurence of a property, the goal is to join this aggregate data into another data frame to perform analysis.  So, this is a preliminary step to prevent later complications

#### What Might Be Affecting Price?

In [33]:
building['design'].value_counts()

2-3 STORY                                  2139
1 STORY - RANCH                            1183
STUDIO                                      242
MULTI STORY- TOWNHOUSE                      112
TOOL SHED                                    91
SPLIT-LEVEL                                  79
GARAGE DETACHED RESIDENTIAL                  66
EQUIPMENT SHED                               13
BI-LEVEL                                     11
EQUIPMENT (SHOP) BUILDING                     8
GREENHOUSE                                    6
OTHER STRUCTURE                               2
GENERAL PURPOSE BARN                          1
STORAGE SHED (PREFABRICATED)                  1
1-STORY TWNHM                                 1
GRNHS, HOOP, ARCH-RIB, MED(4500-9000SF)       1
MODULAR                                       1
Name: design, dtype: int64

In [67]:
building.columns

Index(['strap', 'bld_num', 'effective_year_built', 'design_code', 'design',
       'quality', 'quality_code', 'bldg_class_code', 'bldg_class',
       'construction_type_code', 'construction_type', 'nbr_bed_room',
       'nbr_full_baths', 'nbr_three_qtr_baths', 'nbr_half_baths',
       'nbr_rooms_nobath', 'total_finished_sqft', 'mainfloor_sqft',
       'bsmt_sqft', 'bsmt_type_code', 'bsmt_type', 'car_storage_type_code',
       'car_storage_type', 'ext_wall_code', 'ext_wall', 'ac', 'heating',
       'bsmt_finished', 'num_buildings'],
      dtype='object')

In [70]:
sales_total = sales_adj.merge(building[['strap', 'total_finished_sqft', 'effective_year_built', 'bsmt_finished', 'num_buildings']],
                              on = 'strap',
                              how = 'left').merge(land[['strap', 'sqft']],
                                                  on = 'strap',
                                                  how = 'left')
sales_total.rename(columns = {'sqft': 'total_land_sqft'}, inplace = True)
sales_total

Unnamed: 0,strap,sales_cd,sales_cd_dscr,price,market_area,multiplier,adjusted_sales_price,total_finished_sqft,effective_year_built,bsmt_finished,num_buildings,total_land_sqft
0,R0000008,Q,qualified,65000,102,,65000,1558.0,1954.0,False,1.0,6801
1,R0000019,U,unqualified,75000,109,,75000,942.0,1970.0,True,1.0,3606
2,R0000019,U,unqualified,110600,109,,110600,942.0,1970.0,True,1.0,3606
3,R0000019,Q,qualified,126400,109,,126400,942.0,1970.0,True,1.0,3606
4,R0000019,Q,qualified,332000,109,,332000,942.0,1970.0,True,1.0,3606
...,...,...,...,...,...,...,...,...,...,...,...,...
11163,R0610553,Q,qualified,2872600,102,1.2167,3495092,2611.0,2021.0,True,1.0,6987
11164,R0610553,Q,qualified,3450000,102,,3450000,2611.0,2021.0,True,1.0,6987
11165,R0612718,U,unqualified,910000,109,1.1910,1083810,2633.0,2024.0,True,1.0,7730
11166,R0613548,Q,qualified,850000,102,1.3453,1143505,1806.0,1988.0,False,1.0,10972


################## BREAK ########################### </br>
The data frame with all the relevent variables for plotting (what they recommended plus I added a num_buildings column) is in `sales_total`