In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import f_regression, SelectKBest, RFE
import os
from sklearn.impute import SimpleImputer
from env import get_db_url
import warnings
warnings.filterwarnings('ignore')
import math 
#from acquire import acquire_zillow

In [2]:
def acquire_zillow():
    '''
    Return the Zillow property data, preferring a locally cached copy.

    If 'zillow3.csv' exists in the current working directory it is read and
    returned unchanged. Otherwise the query below is run against the 'zillow'
    database (connection obtained from get_db_url), the result is written to
    'zillow3.csv' without the index, and the frame is returned.

    No cleaning happens here: null values and outliers are still present in
    the returned DataFrame.
    '''
    #name of the local cache file
    filename = 'zillow3.csv'
    #query used only when no local copy exists; limited to propertylandusetypeid 261
    query = '''
    SELECT pro.bedroomcnt, pro.bathroomcnt, pro.calculatedfinishedsquarefeet, 
    pro.taxvaluedollarcnt, pro.yearbuilt, pro.fips, pro.lotsizesquarefeet
    FROM properties_2017 AS pro
    JOIN predictions_2017 AS pre USING(parcelid)
    WHERE pro.propertylandusetypeid = 261;
    '''
    if not os.path.exists(filename):
        #no cache yet: pull from the database and save it for next time
        print('Getting a fresh copy from SQL database...')
        df = pd.read_sql(query, get_db_url('zillow'))
        df.to_csv(filename, index=False)
    else:
        #cache found: load it instead of querying the database
        print('Reading from csv file...')
        df = pd.read_csv(filename)
    return df

In [3]:
# Load the Zillow data (read from 'zillow3.csv' when it exists locally) and preview the first rows.
df = acquire_zillow()
df.head()

Reading from csv file...


Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,fips,lotsizesquarefeet
0,4.0,3.5,3100.0,1023282.0,1998.0,6059.0,4506.0
1,2.0,1.0,1465.0,464000.0,1967.0,6111.0,12647.0
2,3.0,2.0,1243.0,564778.0,1962.0,6059.0,8432.0
3,4.0,3.0,2376.0,145143.0,1970.0,6037.0,13038.0
4,4.0,3.0,2962.0,773303.0,1950.0,6037.0,63000.0


In [4]:
# Rows and columns of the frame as loaded, before any null handling.
df.shape

(52442, 7)

In [5]:
# Count missing values per column before cleaning.
df.isna().sum()

bedroomcnt                        0
bathroomcnt                       0
calculatedfinishedsquarefeet     82
taxvaluedollarcnt                 1
yearbuilt                       116
fips                              0
lotsizesquarefeet               369
dtype: int64

In [6]:
# Drop every row that has a null in any column
# (52,442 -> 51,960 rows in the recorded run, per the shape checks around this cell).
df = df.dropna()

In [7]:
# Confirm that no missing values remain in any column.
df.isna().sum()

bedroomcnt                      0
bathroomcnt                     0
calculatedfinishedsquarefeet    0
taxvaluedollarcnt               0
yearbuilt                       0
fips                            0
lotsizesquarefeet               0
dtype: int64

In [8]:
# Row and column count after the null rows were dropped.
df.shape

(51960, 7)

In [9]:
# Number of rows per fips code; each code gets its own frame below.
df['fips'].value_counts()

6037.0    33568
6059.0    14034
6111.0     4358
Name: fips, dtype: int64

In [10]:
# Subset 1: rows whose fips code is 6037.
df1 = df[df['fips'].eq(6037.0)]

In [11]:
# Preview the fips 6037 subset.
df1.head()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,fips,lotsizesquarefeet
3,4.0,3.0,2376.0,145143.0,1970.0,6037.0,13038.0
4,4.0,3.0,2962.0,773303.0,1950.0,6037.0,63000.0
5,2.0,1.0,738.0,218552.0,1922.0,6037.0,4214.0
6,4.0,3.0,3039.0,220583.0,1970.0,6037.0,20028.0
7,4.0,3.0,2540.0,430108.0,1999.0,6037.0,10384.0


In [12]:
# Sanity check: df1 should contain a single fips code.
df1['fips'].value_counts()

6037.0    33568
Name: fips, dtype: int64

In [13]:
# Subset 2: rows whose fips code is 6059.
df2 = df[df['fips'].eq(6059.0)]

In [14]:
# Sanity check: df2 should contain a single fips code.
df2['fips'].value_counts()

6059.0    14034
Name: fips, dtype: int64

In [15]:
# Subset 3: rows whose fips code is 6111.
df3 = df[df['fips'].eq(6111.0)]

In [16]:
# Sanity check: df3 should contain a single fips code.
df3['fips'].value_counts()

6111.0    4358
Name: fips, dtype: int64

---

DataFrame 1 (fips 6037): split into train / validate / test, remove outliers from train, and scale the features

In [21]:
# Two-stage split of the fips 6037 frame: hold out 20% as test, then split the
# remaining 80% into 70% train / 30% validate (roughly 56% / 24% / 20% of all rows).
# The fixed random_state makes both splits repeatable.
train_val, test = train_test_split(df1, train_size = 0.8, random_state=123)
train, validate = train_test_split(train_val, train_size = 0.7, random_state=123)
train.shape, validate.shape, test.shape

((18797, 7), (8057, 7), (6714, 7))

In [22]:
def remove_outliers(df, k, col_list):
    '''
    Drop rows whose value in any listed column lies outside that column's IQR fences.

    The columns are processed in the order given. For each one the quartiles are
    computed on the rows that are still left, and only rows with a value inside
    [q1 - k*iqr, q3 + k*iqr] are kept, so later columns are evaluated on the
    already filtered frame.

    Values exactly on a fence are kept. This matters when a column's IQR is 0:
    both fences then equal the quartile, and an exclusive comparison would
    discard every row.

    Parameters:
        df: DataFrame to filter; it is not modified in place.
        k: multiplier applied to the IQR to set the fence distance (e.g. 1.5).
        col_list: iterable of column names to filter on.

    Returns:
        A DataFrame containing only the rows that passed every column's fences.
        Rows with a missing value in a filtered column are dropped.
    '''
    #loop through the columns in the list
    for col in col_list:
        q1, q3 = df[col].quantile([.25, .75])  # get quartiles
        iqr = q3 - q1   # calculate interquartile range
        lower_bound = q1 - k * iqr   # get lower fence
        upper_bound = q3 + k * iqr   # get upper fence
        # between() is inclusive on both ends, so fence values are retained
        df = df[df[col].between(lower_bound, upper_bound)]
    return df

In [23]:
# Columns to screen for outliers, using fences 1.5 IQRs beyond the quartiles.
# Only train is filtered in this cell; validate and test are left as split.
# NOTE: train is overwritten, so re-running this cell filters the already filtered frame again.
out_columns = ['bedroomcnt', 'bathroomcnt','calculatedfinishedsquarefeet', 'lotsizesquarefeet']
train = remove_outliers(train, 1.5, out_columns)

In [24]:
# Rows remaining in train after outlier removal.
train.shape

(15388, 7)

In [25]:
# Summary statistics of the filtered training set.
train.describe()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,fips,lotsizesquarefeet
count,15388.0,15388.0,15388.0,15388.0,15388.0,15388.0,15388.0
mean,3.084091,1.963088,1565.216012,377071.1,1954.808877,6037.0,6760.021445
std,0.768743,0.746954,543.386291,370037.1,21.523635,0.0,1904.171588
min,2.0,1.0,152.0,3254.0,1882.0,6037.0,1548.0
25%,3.0,1.0,1162.75,149737.0,1944.0,6037.0,5501.0
50%,3.0,2.0,1458.0,293251.5,1953.0,6037.0,6538.5
75%,4.0,2.0,1860.0,475612.2,1963.0,6037.0,7694.0
max,5.0,4.0,3298.0,9837579.0,2016.0,6037.0,12507.0


In [26]:
# Sanity check: train should still contain only one fips code.
train['fips'].value_counts()

6037.0    15388
Name: fips, dtype: int64

In [27]:
# Predictors and target for the training split; y_train stays a one-column DataFrame.
x_train = train[[
    'bedroomcnt',
    'bathroomcnt',
    'calculatedfinishedsquarefeet',
    'yearbuilt',
    'lotsizesquarefeet',
]]
y_train = train[['taxvaluedollarcnt']]
x_train.head()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,yearbuilt,lotsizesquarefeet
17792,4.0,2.0,1864.0,1956.0,10271.0
45565,3.0,1.0,1168.0,1954.0,7238.0
51094,2.0,1.0,880.0,1924.0,3097.0
44347,4.0,2.0,1536.0,1913.0,4878.0
32759,3.0,2.0,1520.0,1954.0,6841.0


In [28]:
# Fit a min-max scaler on the training predictors only.
scaler = MinMaxScaler().fit(x_train)
# Scale train, then re-wrap the array so column names and the row index are kept
# for feature selection and modelling.
x_train_scaled = scaler.transform(x_train)
x_train_scaled_df = pd.DataFrame(
    x_train_scaled,
    index=x_train.index,
    columns=x_train.columns,
)
x_train_scaled_df.head()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,yearbuilt,lotsizesquarefeet
17792,0.666667,0.333333,0.544183,0.552239,0.795967
45565,0.333333,0.0,0.32295,0.537313,0.519208
51094,0.0,0.0,0.231405,0.313433,0.141345
44347,0.666667,0.333333,0.439924,0.231343,0.30386
32759,0.333333,0.333333,0.434838,0.537313,0.482982


In [29]:
# Same predictors and target as for train, taken from the validate split.
x_validate = validate[[
    'bedroomcnt',
    'bathroomcnt',
    'calculatedfinishedsquarefeet',
    'yearbuilt',
    'lotsizesquarefeet',
]]
y_validate = validate[['taxvaluedollarcnt']]
x_validate.head()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,yearbuilt,lotsizesquarefeet
21619,3.0,1.0,1006.0,1955.0,5929.0
9827,2.0,2.0,988.0,1948.0,2639.0
17521,3.0,2.0,1444.0,1955.0,6004.0
26823,3.0,1.0,1478.0,1911.0,5096.0
21839,4.0,5.0,4135.0,1993.0,5059.0


In [30]:
# Scale validate with the scaler that was fitted on train (no refit), so validate
# values beyond the training range land outside [0, 1].
x_validate_scaled = scaler.transform(x_validate)
x_validate_scaled_df = pd.DataFrame(x_validate_scaled, columns=x_train.columns, index=x_validate.index)
x_validate_scaled_df.head()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,yearbuilt,lotsizesquarefeet
21619,0.333333,0.0,0.271456,0.544776,0.399763
9827,0.0,0.333333,0.265734,0.492537,0.099553
17521,0.333333,0.333333,0.41068,0.544776,0.406606
26823,0.333333,0.0,0.421488,0.216418,0.323752
21839,0.666667,1.333333,1.266052,0.828358,0.320376


In [31]:
# Start the comparison table with the true validate targets; each model adds a column.
predictions = pd.DataFrame({'actual': validate['taxvaluedollarcnt']})
predictions.head()

Unnamed: 0,actual
21619,324551.0
9827,271013.0
17521,117015.0
26823,247786.0
21839,2690733.0


---

In [32]:
# Recursive feature elimination with a linear model, keeping the two best features.
x = x_train_scaled_df
y = y_train
model = LinearRegression()
rfe = RFE(model, n_features_to_select=2)
rfe.fit(x, y)
print('Top 2 features according to RFE:')
x.columns[rfe.get_support()]

Top 2 features according to RFE:


Index(['calculatedfinishedsquarefeet', 'yearbuilt'], dtype='object')

In [33]:
# RFE rank of every feature; the selected features carry rank 1.
pd.DataFrame({'rfe_ranking': rfe.ranking_}, index=x.columns)

Unnamed: 0,rfe_ranking
bedroomcnt,3
bathroomcnt,2
calculatedfinishedsquarefeet,1
yearbuilt,1
lotsizesquarefeet,4


---

In [34]:
# Single-feature linear model: finished square feet only.
x = x_train_scaled_df[['calculatedfinishedsquarefeet']]
z = x_validate_scaled_df[['calculatedfinishedsquarefeet']]
lm = LinearRegression().fit(x, y_train)
predictions['simple_lm'] = lm.predict(z)
predictions.head()

Unnamed: 0,actual,simple_lm
21619,324551.0,242309.058958
9827,271013.0,237971.349836
17521,117015.0,347859.980931
26823,247786.0,356053.431495
21839,2690733.0,996347.494698


In [35]:
# Linear model on the two features RFE selected: finished square feet and year built.
x = x_train_scaled_df[['calculatedfinishedsquarefeet', 'yearbuilt']]
z = x_validate_scaled_df[['calculatedfinishedsquarefeet', 'yearbuilt']]
# Use a fresh estimator for this feature set instead of refitting whatever
# model object an earlier cell left in `lm`.
lm = LinearRegression()
lm.fit(x, y_train)
predictions['lm_top2x'] = lm.predict(z)
predictions.head()

Unnamed: 0,actual,simple_lm,lm_top2x
21619,324551.0,242309.058958,208459.0
9827,271013.0,237971.349836,228139.7
17521,117015.0,347859.980931,339986.2
26823,247786.0,356053.431495,507879.0
21839,2690733.0,996347.494698,1011887.0


In [50]:
# Linear model on finished square feet and bedroom count.
x = x_train_scaled_df[['calculatedfinishedsquarefeet', 'bedroomcnt']]
z = x_validate_scaled_df[['calculatedfinishedsquarefeet', 'bedroomcnt']]
# Use a fresh estimator for this feature set instead of refitting whatever
# model object an earlier cell left in `lm`.
lm = LinearRegression()
lm.fit(x, y_train)
predictions['lm1'] = lm.predict(z)
predictions.head()

Unnamed: 0,actual,simple_lm,lm_top2x,lm_top3x,lm_top4x,lm_allx,polynomial degree 2,polynomial_interaction_only,baseline,lm1
21619,324551.0,242309.058958,208459.0,178279.0,144399.0,147394.3,232244.4,241646.0,377071.081102,210322.1
9827,271013.0,237971.349836,228139.7,270605.2,328113.0,368966.5,427572.1,365739.9,377071.081102,285965.5
17521,117015.0,347859.980931,339986.2,350057.6,352437.2,359782.9,339090.4,353712.3,377071.081102,346276.6
26823,247786.0,356053.431495,507879.0,470324.6,441627.1,460159.1,349836.1,354983.6,377071.081102,356830.2
21839,2690733.0,996347.494698,1011887.0,1036285.0,1137089.0,1199510.0,1996239.0,1892350.0,377071.081102,1100328.0


In [51]:
# Linear model on finished square feet and bathroom count.
x = x_train_scaled_df[['calculatedfinishedsquarefeet', 'bathroomcnt']]
z = x_validate_scaled_df[['calculatedfinishedsquarefeet', 'bathroomcnt']]
# Use a fresh estimator for this feature set instead of refitting whatever
# model object an earlier cell left in `lm`.
lm = LinearRegression()
lm.fit(x, y_train)
predictions['lm2'] = lm.predict(z)
predictions.head()

Unnamed: 0,actual,simple_lm,lm_top2x,lm_top3x,lm_top4x,lm_allx,polynomial degree 2,polynomial_interaction_only,baseline,lm1,lm2
21619,324551.0,242309.058958,208459.0,178279.0,144399.0,147394.3,232244.4,241646.0,377071.081102,210322.1,236083.4
9827,271013.0,237971.349836,228139.7,270605.2,328113.0,368966.5,427572.1,365739.9,377071.081102,285965.5,248740.5
17521,117015.0,347859.980931,339986.2,350057.6,352437.2,359782.9,339090.4,353712.3,377071.081102,346276.6,350607.9
26823,247786.0,356053.431495,507879.0,470324.6,441627.1,460159.1,349836.1,354983.6,377071.081102,356830.2,341525.0
21839,2690733.0,996347.494698,1011887.0,1036285.0,1137089.0,1199510.0,1996239.0,1892350.0,377071.081102,1100328.0,1001794.0


In [52]:
# Linear model on finished square feet and lot size.
x = x_train_scaled_df[['calculatedfinishedsquarefeet', 'lotsizesquarefeet']]
z = x_validate_scaled_df[['calculatedfinishedsquarefeet', 'lotsizesquarefeet']]
# Use a fresh estimator for this feature set instead of refitting whatever
# model object an earlier cell left in `lm`.
lm = LinearRegression()
lm.fit(x, y_train)
predictions['lm3'] = lm.predict(z)
predictions.head()

Unnamed: 0,actual,simple_lm,lm_top2x,lm_top3x,lm_top4x,lm_allx,polynomial degree 2,polynomial_interaction_only,baseline,lm1,lm2,lm3
21619,324551.0,242309.058958,208459.0,178279.0,144399.0,147394.3,232244.4,241646.0,377071.081102,210322.1,236083.4,243805.9
9827,271013.0,237971.349836,228139.7,270605.2,328113.0,368966.5,427572.1,365739.9,377071.081102,285965.5,248740.5,291257.0
17521,117015.0,347859.980931,339986.2,350057.6,352437.2,359782.9,339090.4,353712.3,377071.081102,346276.6,350607.9,357315.5
26823,247786.0,356053.431495,507879.0,470324.6,441627.1,460159.1,349836.1,354983.6,377071.081102,356830.2,341525.0,380615.9
21839,2690733.0,996347.494698,1011887.0,1036285.0,1137089.0,1199510.0,1996239.0,1892350.0,377071.081102,1100328.0,1001794.0,1076989.0


In [36]:
# Linear model on the three best-ranked RFE features: square feet, year built, bathroom count.
x = x_train_scaled_df[['calculatedfinishedsquarefeet', 'yearbuilt', 'bathroomcnt']]
z = x_validate_scaled_df[['calculatedfinishedsquarefeet', 'yearbuilt', 'bathroomcnt']]
# Use a fresh estimator for this feature set instead of refitting whatever
# model object an earlier cell left in `lm`.
lm = LinearRegression()
lm.fit(x, y_train)
predictions['lm_top3x'] = lm.predict(z)
predictions.head()

Unnamed: 0,actual,simple_lm,lm_top2x,lm_top3x
21619,324551.0,242309.058958,208459.0,178279.0
9827,271013.0,237971.349836,228139.7,270605.2
17521,117015.0,347859.980931,339986.2,350057.6
26823,247786.0,356053.431495,507879.0,470324.6
21839,2690733.0,996347.494698,1011887.0,1036285.0


In [55]:
# Linear model on finished square feet, year built and bedroom count.
x = x_train_scaled_df[['calculatedfinishedsquarefeet', 'yearbuilt', 'bedroomcnt']]
z = x_validate_scaled_df[['calculatedfinishedsquarefeet', 'yearbuilt', 'bedroomcnt']]
# Use a fresh estimator for this feature set instead of refitting whatever
# model object an earlier cell left in `lm`.
lm = LinearRegression()
lm.fit(x, y_train)
predictions['lm4'] = lm.predict(z)
predictions.head()

Unnamed: 0,actual,simple_lm,lm_top2x,lm_top3x,lm_top4x,lm_allx,polynomial degree 2,polynomial_interaction_only,baseline,lm1,lm2,lm3,lm4
21619,324551.0,242309.058958,208459.0,178279.0,144399.0,147394.3,232244.4,241646.0,377071.081102,210322.1,236083.4,243805.9,187549.3
9827,271013.0,237971.349836,228139.7,270605.2,328113.0,368966.5,427572.1,365739.9,377071.081102,285965.5,248740.5,291257.0,265694.3
17521,117015.0,347859.980931,339986.2,350057.6,352437.2,359782.9,339090.4,353712.3,377071.081102,346276.6,350607.9,357315.5,339583.1
26823,247786.0,356053.431495,507879.0,470324.6,441627.1,460159.1,349836.1,354983.6,377071.081102,356830.2,341525.0,380615.9,492990.2
21839,2690733.0,996347.494698,1011887.0,1036285.0,1137089.0,1199510.0,1996239.0,1892350.0,377071.081102,1100328.0,1001794.0,1076989.0,1089493.0


In [56]:
# Linear model on finished square feet, year built and lot size.
x = x_train_scaled_df[['calculatedfinishedsquarefeet', 'yearbuilt', 'lotsizesquarefeet']]
z = x_validate_scaled_df[['calculatedfinishedsquarefeet', 'yearbuilt', 'lotsizesquarefeet']]
# Use a fresh estimator for this feature set instead of refitting whatever
# model object an earlier cell left in `lm`.
lm = LinearRegression()
lm.fit(x, y_train)
predictions['lm5'] = lm.predict(z)
predictions.head()

Unnamed: 0,actual,simple_lm,lm_top2x,lm_top3x,lm_top4x,lm_allx,polynomial degree 2,polynomial_interaction_only,baseline,lm1,lm2,lm3,lm4,lm5
21619,324551.0,242309.058958,208459.0,178279.0,144399.0,147394.3,232244.4,241646.0,377071.081102,210322.1,236083.4,243805.9,187549.3,210374.6
9827,271013.0,237971.349836,228139.7,270605.2,328113.0,368966.5,427572.1,365739.9,377071.081102,285965.5,248740.5,291257.0,265694.3,274839.7
17521,117015.0,347859.980931,339986.2,350057.6,352437.2,359782.9,339090.4,353712.3,377071.081102,346276.6,350607.9,357315.5,339583.1,348383.3
26823,247786.0,356053.431495,507879.0,470324.6,441627.1,460159.1,349836.1,354983.6,377071.081102,356830.2,341525.0,380615.9,492990.2,526593.9
21839,2690733.0,996347.494698,1011887.0,1036285.0,1137089.0,1199510.0,1996239.0,1892350.0,377071.081102,1100328.0,1001794.0,1076989.0,1089493.0,1082015.0


In [37]:
# Linear model on the four best-ranked RFE features (everything except lot size).
x = x_train_scaled_df[['calculatedfinishedsquarefeet', 'yearbuilt', 'bathroomcnt', 'bedroomcnt']]
z = x_validate_scaled_df[['calculatedfinishedsquarefeet', 'yearbuilt', 'bathroomcnt', 'bedroomcnt']]
# Use a fresh estimator for this feature set instead of refitting whatever
# model object an earlier cell left in `lm`.
lm = LinearRegression()
lm.fit(x, y_train)
predictions['lm_top4x'] = lm.predict(z)
predictions.head()

Unnamed: 0,actual,simple_lm,lm_top2x,lm_top3x,lm_top4x
21619,324551.0,242309.058958,208459.0,178279.0,144399.0
9827,271013.0,237971.349836,228139.7,270605.2,328113.0
17521,117015.0,347859.980931,339986.2,350057.6,352437.2
26823,247786.0,356053.431495,507879.0,470324.6,441627.1
21839,2690733.0,996347.494698,1011887.0,1036285.0,1137089.0


In [59]:
# Linear model on finished square feet, bathroom count and bedroom count.
x = x_train_scaled_df[['calculatedfinishedsquarefeet', 'bathroomcnt', 'bedroomcnt']]
z = x_validate_scaled_df[['calculatedfinishedsquarefeet', 'bathroomcnt', 'bedroomcnt']]
# Use a fresh estimator for this feature set instead of refitting whatever
# model object an earlier cell left in `lm`.
lm = LinearRegression()
lm.fit(x, y_train)
predictions['lm6'] = lm.predict(z)
predictions.head()

Unnamed: 0,actual,simple_lm,lm_top2x,lm_top3x,lm_top4x,lm_allx,polynomial degree 2,polynomial_interaction_only,baseline,lm1,lm2,lm3,lm4,lm5,lm6
21619,324551.0,242309.058958,208459.0,178279.0,144399.0,147394.3,232244.4,241646.0,377071.081102,210322.1,236083.4,243805.9,187549.3,210374.6,190510.2
9827,271013.0,237971.349836,228139.7,270605.2,328113.0,368966.5,427572.1,365739.9,377071.081102,285965.5,248740.5,291257.0,265694.3,274839.7,319482.7
17521,117015.0,347859.980931,339986.2,350057.6,352437.2,359782.9,339090.4,353712.3,377071.081102,346276.6,350607.9,357315.5,339583.1,348383.3,353408.9
26823,247786.0,356053.431495,507879.0,470324.6,441627.1,460159.1,349836.1,354983.6,377071.081102,356830.2,341525.0,380615.9,492990.2,526593.9,318340.9
21839,2690733.0,996347.494698,1011887.0,1036285.0,1137089.0,1199510.0,1996239.0,1892350.0,377071.081102,1100328.0,1001794.0,1076989.0,1089493.0,1082015.0,1125464.0


In [38]:
# Linear model on all five scaled features.
x = x_train_scaled_df[['calculatedfinishedsquarefeet', 'yearbuilt', 'bathroomcnt', 'bedroomcnt', 'lotsizesquarefeet']]
z = x_validate_scaled_df[['calculatedfinishedsquarefeet', 'yearbuilt', 'bathroomcnt', 'bedroomcnt', 'lotsizesquarefeet']]
# Use a fresh estimator for this feature set instead of refitting whatever
# model object an earlier cell left in `lm`.
lm = LinearRegression()
lm.fit(x, y_train)
predictions['lm_allx'] = lm.predict(z)
predictions.head()

Unnamed: 0,actual,simple_lm,lm_top2x,lm_top3x,lm_top4x,lm_allx
21619,324551.0,242309.058958,208459.0,178279.0,144399.0,147394.3
9827,271013.0,237971.349836,228139.7,270605.2,328113.0,368966.5
17521,117015.0,347859.980931,339986.2,350057.6,352437.2,359782.9
26823,247786.0,356053.431495,507879.0,470324.6,441627.1,460159.1
21839,2690733.0,996347.494698,1011887.0,1036285.0,1137089.0,1199510.0


In [39]:
# Expand all five scaled features into degree-2 terms (originals, squares and
# pairwise products; no bias column).
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
poly.fit(x_train_scaled_df)
# get_feature_names was deprecated in scikit-learn 1.0 and removed in 1.2 in favour
# of get_feature_names_out; use whichever this installation provides.
if hasattr(poly, 'get_feature_names_out'):
    poly_feature_names = poly.get_feature_names_out(x_train_scaled_df.columns)
else:
    poly_feature_names = poly.get_feature_names(x_train_scaled_df.columns)
x_train_poly = pd.DataFrame(
    poly.transform(x_train_scaled_df),
    columns=poly_feature_names,
    index=x_train_scaled_df.index,
)
x_train_poly.head()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,yearbuilt,lotsizesquarefeet,bedroomcnt^2,bedroomcnt bathroomcnt,bedroomcnt calculatedfinishedsquarefeet,bedroomcnt yearbuilt,bedroomcnt lotsizesquarefeet,bathroomcnt^2,bathroomcnt calculatedfinishedsquarefeet,bathroomcnt yearbuilt,bathroomcnt lotsizesquarefeet,calculatedfinishedsquarefeet^2,calculatedfinishedsquarefeet yearbuilt,calculatedfinishedsquarefeet lotsizesquarefeet,yearbuilt^2,yearbuilt lotsizesquarefeet,lotsizesquarefeet^2
17792,0.666667,0.333333,0.544183,0.552239,0.795967,0.444444,0.222222,0.362789,0.368159,0.530645,0.111111,0.181394,0.18408,0.265322,0.296135,0.300519,0.433152,0.304968,0.439564,0.633563
45565,0.333333,0.0,0.32295,0.537313,0.519208,0.111111,0.0,0.10765,0.179104,0.173069,0.0,0.0,0.0,0.0,0.104297,0.173525,0.167678,0.288706,0.278977,0.269577
51094,0.0,0.0,0.231405,0.313433,0.141345,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.053548,0.07253,0.032708,0.09824,0.044302,0.019978
44347,0.666667,0.333333,0.439924,0.231343,0.30386,0.444444,0.222222,0.293282,0.154229,0.202573,0.111111,0.146641,0.077114,0.101287,0.193533,0.101773,0.133675,0.05352,0.070296,0.092331
32759,0.333333,0.333333,0.434838,0.537313,0.482982,0.111111,0.111111,0.144946,0.179104,0.160994,0.111111,0.144946,0.179104,0.160994,0.189084,0.233644,0.210019,0.288706,0.259513,0.233272


In [40]:
# Apply the fitted polynomial expansion to validate, fit a linear model on the
# expanded training features, and record its validate predictions.
x_validate_poly = poly.transform(x_validate_scaled_df)

poly_rm = LinearRegression().fit(x_train_poly, y_train)
predictions['polynomial degree 2'] = poly_rm.predict(x_validate_poly)
predictions.head()

Unnamed: 0,actual,simple_lm,lm_top2x,lm_top3x,lm_top4x,lm_allx,polynomial degree 2
21619,324551.0,242309.058958,208459.0,178279.0,144399.0,147394.3,232244.4
9827,271013.0,237971.349836,228139.7,270605.2,328113.0,368966.5,427572.1
17521,117015.0,347859.980931,339986.2,350057.6,352437.2,359782.9,339090.4
26823,247786.0,356053.431495,507879.0,470324.6,441627.1,460159.1,349836.1
21839,2690733.0,996347.494698,1011887.0,1036285.0,1137089.0,1199510.0,1996239.0


In [41]:
# Expand all five scaled features into the originals plus pairwise products only
# (interaction_only=True leaves out the squared terms; no bias column).
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
poly.fit(x_train_scaled_df)
# get_feature_names was deprecated in scikit-learn 1.0 and removed in 1.2 in favour
# of get_feature_names_out; use whichever this installation provides.
if hasattr(poly, 'get_feature_names_out'):
    poly_feature_names = poly.get_feature_names_out(x_train_scaled_df.columns)
else:
    poly_feature_names = poly.get_feature_names(x_train_scaled_df.columns)
x_train_poly = pd.DataFrame(
    poly.transform(x_train_scaled_df),
    columns=poly_feature_names,
    index=x_train_scaled_df.index,
)
x_train_poly.head()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,yearbuilt,lotsizesquarefeet,bedroomcnt bathroomcnt,bedroomcnt calculatedfinishedsquarefeet,bedroomcnt yearbuilt,bedroomcnt lotsizesquarefeet,bathroomcnt calculatedfinishedsquarefeet,bathroomcnt yearbuilt,bathroomcnt lotsizesquarefeet,calculatedfinishedsquarefeet yearbuilt,calculatedfinishedsquarefeet lotsizesquarefeet,yearbuilt lotsizesquarefeet
17792,0.666667,0.333333,0.544183,0.552239,0.795967,0.222222,0.362789,0.368159,0.530645,0.181394,0.18408,0.265322,0.300519,0.433152,0.439564
45565,0.333333,0.0,0.32295,0.537313,0.519208,0.0,0.10765,0.179104,0.173069,0.0,0.0,0.0,0.173525,0.167678,0.278977
51094,0.0,0.0,0.231405,0.313433,0.141345,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07253,0.032708,0.044302
44347,0.666667,0.333333,0.439924,0.231343,0.30386,0.222222,0.293282,0.154229,0.202573,0.146641,0.077114,0.101287,0.101773,0.133675,0.070296
32759,0.333333,0.333333,0.434838,0.537313,0.482982,0.111111,0.144946,0.179104,0.160994,0.144946,0.179104,0.160994,0.233644,0.210019,0.259513


In [42]:
# Apply the fitted interaction expansion to validate, fit a linear model on the
# expanded training features, and record its validate predictions.
x_validate_poly = poly.transform(x_validate_scaled_df)

poly_rm = LinearRegression().fit(x_train_poly, y_train)
predictions['polynomial_interaction_only'] = poly_rm.predict(x_validate_poly)
predictions.head()

Unnamed: 0,actual,simple_lm,lm_top2x,lm_top3x,lm_top4x,lm_allx,polynomial degree 2,polynomial_interaction_only
21619,324551.0,242309.058958,208459.0,178279.0,144399.0,147394.3,232244.4,241646.0
9827,271013.0,237971.349836,228139.7,270605.2,328113.0,368966.5,427572.1,365739.9
17521,117015.0,347859.980931,339986.2,350057.6,352437.2,359782.9,339090.4,353712.3
26823,247786.0,356053.431495,507879.0,470324.6,441627.1,460159.1,349836.1,354983.6
21839,2690733.0,996347.494698,1011887.0,1036285.0,1137089.0,1199510.0,1996239.0,1892350.0


In [65]:
# Full degree-2 polynomial model on three features: square feet, year built, bathroom count.
x_train_poly_cols = x_train_scaled_df[['calculatedfinishedsquarefeet', 'yearbuilt', 'bathroomcnt']]
x_val_poly_cols = x_validate_scaled_df[['calculatedfinishedsquarefeet', 'yearbuilt', 'bathroomcnt']]
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
poly.fit(x_train_poly_cols)
# get_feature_names was deprecated in scikit-learn 1.0 and removed in 1.2 in favour
# of get_feature_names_out; use whichever this installation provides.
if hasattr(poly, 'get_feature_names_out'):
    poly_feature_names = poly.get_feature_names_out(x_train_poly_cols.columns)
else:
    poly_feature_names = poly.get_feature_names(x_train_poly_cols.columns)
x_train_poly = pd.DataFrame(
    poly.transform(x_train_poly_cols),
    columns=poly_feature_names,
    index=x_train_poly_cols.index,
)

# Fit on the expanded training features, then predict the expanded validate features.
poly_rm = LinearRegression()
poly_rm.fit(x_train_poly, y_train)

x_validate_poly = poly.transform(x_val_poly_cols)
predictions['polynomial3'] = poly_rm.predict(x_validate_poly)
predictions.head()

Unnamed: 0,actual,simple_lm,lm_top2x,lm_top3x,lm_top4x,lm_allx,polynomial degree 2,polynomial_interaction_only,baseline,lm1,lm2,lm3,lm4,lm5,lm6,polynomial3
21619,324551.0,242309.058958,208459.0,178279.0,144399.0,147394.3,232244.4,241646.0,377071.081102,210322.1,236083.4,243805.9,187549.3,210374.6,190510.2,244054.9
9827,271013.0,237971.349836,228139.7,270605.2,328113.0,368966.5,427572.1,365739.9,377071.081102,285965.5,248740.5,291257.0,265694.3,274839.7,319482.7,263385.7
17521,117015.0,347859.980931,339986.2,350057.6,352437.2,359782.9,339090.4,353712.3,377071.081102,346276.6,350607.9,357315.5,339583.1,348383.3,353408.9,333321.1
26823,247786.0,356053.431495,507879.0,470324.6,441627.1,460159.1,349836.1,354983.6,377071.081102,356830.2,341525.0,380615.9,492990.2,526593.9,318340.9,395878.2
21839,2690733.0,996347.494698,1011887.0,1036285.0,1137089.0,1199510.0,1996239.0,1892350.0,377071.081102,1100328.0,1001794.0,1076989.0,1089493.0,1082015.0,1125464.0,1501944.0


In [67]:
# Interaction-only degree-2 model on three features: square feet, year built, bathroom count.
x_train_poly_cols = x_train_scaled_df[['calculatedfinishedsquarefeet', 'yearbuilt', 'bathroomcnt']]
x_val_poly_cols = x_validate_scaled_df[['calculatedfinishedsquarefeet', 'yearbuilt', 'bathroomcnt']]
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
poly.fit(x_train_poly_cols)
# get_feature_names was deprecated in scikit-learn 1.0 and removed in 1.2 in favour
# of get_feature_names_out; use whichever this installation provides.
if hasattr(poly, 'get_feature_names_out'):
    poly_feature_names = poly.get_feature_names_out(x_train_poly_cols.columns)
else:
    poly_feature_names = poly.get_feature_names(x_train_poly_cols.columns)
x_train_poly = pd.DataFrame(
    poly.transform(x_train_poly_cols),
    columns=poly_feature_names,
    index=x_train_poly_cols.index,
)

# Fit on the expanded training features, then predict the expanded validate features.
poly_rm = LinearRegression()
poly_rm.fit(x_train_poly, y_train)

x_validate_poly = poly.transform(x_val_poly_cols)
predictions['polynomial3_intsonly'] = poly_rm.predict(x_validate_poly)
predictions.head()

Unnamed: 0,actual,simple_lm,lm_top2x,lm_top3x,lm_top4x,lm_allx,polynomial degree 2,polynomial_interaction_only,baseline,lm1,lm2,lm3,lm4,lm5,lm6,polynomial3,polynomial4,polynomial3_intsonly
21619,324551.0,242309.058958,208459.0,178279.0,144399.0,147394.3,232244.4,241646.0,377071.081102,210322.1,236083.4,243805.9,187549.3,210374.6,190510.2,244054.9,239377.8,244471.0
9827,271013.0,237971.349836,228139.7,270605.2,328113.0,368966.5,427572.1,365739.9,377071.081102,285965.5,248740.5,291257.0,265694.3,274839.7,319482.7,263385.7,310571.8,251894.3
17521,117015.0,347859.980931,339986.2,350057.6,352437.2,359782.9,339090.4,353712.3,377071.081102,346276.6,350607.9,357315.5,339583.1,348383.3,353408.9,333321.1,335431.1,335920.3
26823,247786.0,356053.431495,507879.0,470324.6,441627.1,460159.1,349836.1,354983.6,377071.081102,356830.2,341525.0,380615.9,492990.2,526593.9,318340.9,395878.2,369266.4,397059.9
21839,2690733.0,996347.494698,1011887.0,1036285.0,1137089.0,1199510.0,1996239.0,1892350.0,377071.081102,1100328.0,1001794.0,1076989.0,1089493.0,1082015.0,1125464.0,1501944.0,1727687.0,1480431.0


In [66]:
# Full degree-2 polynomial model on four features: square feet, year built,
# bathroom count, bedroom count.
x_train_poly_cols = x_train_scaled_df[['calculatedfinishedsquarefeet', 'yearbuilt', 'bathroomcnt', 'bedroomcnt']]
x_val_poly_cols = x_validate_scaled_df[['calculatedfinishedsquarefeet', 'yearbuilt', 'bathroomcnt', 'bedroomcnt']]
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
poly.fit(x_train_poly_cols)
# get_feature_names was deprecated in scikit-learn 1.0 and removed in 1.2 in favour
# of get_feature_names_out; use whichever this installation provides.
if hasattr(poly, 'get_feature_names_out'):
    poly_feature_names = poly.get_feature_names_out(x_train_poly_cols.columns)
else:
    poly_feature_names = poly.get_feature_names(x_train_poly_cols.columns)
x_train_poly = pd.DataFrame(
    poly.transform(x_train_poly_cols),
    columns=poly_feature_names,
    index=x_train_poly_cols.index,
)

# Fit on the expanded training features, then predict the expanded validate features.
poly_rm = LinearRegression()
poly_rm.fit(x_train_poly, y_train)

x_validate_poly = poly.transform(x_val_poly_cols)
predictions['polynomial4'] = poly_rm.predict(x_validate_poly)
predictions.head()

Unnamed: 0,actual,simple_lm,lm_top2x,lm_top3x,lm_top4x,lm_allx,polynomial degree 2,polynomial_interaction_only,baseline,lm1,lm2,lm3,lm4,lm5,lm6,polynomial3,polynomial4
21619,324551.0,242309.058958,208459.0,178279.0,144399.0,147394.3,232244.4,241646.0,377071.081102,210322.1,236083.4,243805.9,187549.3,210374.6,190510.2,244054.9,239377.8
9827,271013.0,237971.349836,228139.7,270605.2,328113.0,368966.5,427572.1,365739.9,377071.081102,285965.5,248740.5,291257.0,265694.3,274839.7,319482.7,263385.7,310571.8
17521,117015.0,347859.980931,339986.2,350057.6,352437.2,359782.9,339090.4,353712.3,377071.081102,346276.6,350607.9,357315.5,339583.1,348383.3,353408.9,333321.1,335431.1
26823,247786.0,356053.431495,507879.0,470324.6,441627.1,460159.1,349836.1,354983.6,377071.081102,356830.2,341525.0,380615.9,492990.2,526593.9,318340.9,395878.2,369266.4
21839,2690733.0,996347.494698,1011887.0,1036285.0,1137089.0,1199510.0,1996239.0,1892350.0,377071.081102,1100328.0,1001794.0,1076989.0,1089493.0,1082015.0,1125464.0,1501944.0,1727687.0


In [68]:
# Interaction-only degree-2 model on four features: square feet, year built,
# bathroom count, bedroom count.
x_train_poly_cols = x_train_scaled_df[['calculatedfinishedsquarefeet', 'yearbuilt', 'bathroomcnt', 'bedroomcnt']]
x_val_poly_cols = x_validate_scaled_df[['calculatedfinishedsquarefeet', 'yearbuilt', 'bathroomcnt', 'bedroomcnt']]
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
poly.fit(x_train_poly_cols)
# get_feature_names was deprecated in scikit-learn 1.0 and removed in 1.2 in favour
# of get_feature_names_out; use whichever this installation provides.
if hasattr(poly, 'get_feature_names_out'):
    poly_feature_names = poly.get_feature_names_out(x_train_poly_cols.columns)
else:
    poly_feature_names = poly.get_feature_names(x_train_poly_cols.columns)
x_train_poly = pd.DataFrame(
    poly.transform(x_train_poly_cols),
    columns=poly_feature_names,
    index=x_train_poly_cols.index,
)

# Fit on the expanded training features, then predict the expanded validate features.
poly_rm = LinearRegression()
poly_rm.fit(x_train_poly, y_train)

x_validate_poly = poly.transform(x_val_poly_cols)
predictions['polynomial4_intsonly'] = poly_rm.predict(x_validate_poly)
predictions.head()

Unnamed: 0,actual,simple_lm,lm_top2x,lm_top3x,lm_top4x,lm_allx,polynomial degree 2,polynomial_interaction_only,baseline,lm1,lm2,lm3,lm4,lm5,lm6,polynomial3,polynomial4,polynomial3_intsonly,polynomial4_intsonly
21619,324551.0,242309.058958,208459.0,178279.0,144399.0,147394.3,232244.4,241646.0,377071.081102,210322.1,236083.4,243805.9,187549.3,210374.6,190510.2,244054.9,239377.8,244471.0,238706.8
9827,271013.0,237971.349836,228139.7,270605.2,328113.0,368966.5,427572.1,365739.9,377071.081102,285965.5,248740.5,291257.0,265694.3,274839.7,319482.7,263385.7,310571.8,251894.3,294635.9
17521,117015.0,347859.980931,339986.2,350057.6,352437.2,359782.9,339090.4,353712.3,377071.081102,346276.6,350607.9,357315.5,339583.1,348383.3,353408.9,333321.1,335431.1,335920.3,343548.6
26823,247786.0,356053.431495,507879.0,470324.6,441627.1,460159.1,349836.1,354983.6,377071.081102,356830.2,341525.0,380615.9,492990.2,526593.9,318340.9,395878.2,369266.4,397059.9,378824.6
21839,2690733.0,996347.494698,1011887.0,1036285.0,1137089.0,1199510.0,1996239.0,1892350.0,377071.081102,1100328.0,1001794.0,1076989.0,1089493.0,1082015.0,1125464.0,1501944.0,1727687.0,1480431.0,1677504.0


In [71]:
# Full degree-2 polynomial model on two features: square feet and year built.
x_train_poly_cols = x_train_scaled_df[['calculatedfinishedsquarefeet', 'yearbuilt']]
x_val_poly_cols = x_validate_scaled_df[['calculatedfinishedsquarefeet', 'yearbuilt']]
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
poly.fit(x_train_poly_cols)
# get_feature_names was deprecated in scikit-learn 1.0 and removed in 1.2 in favour
# of get_feature_names_out; use whichever this installation provides.
if hasattr(poly, 'get_feature_names_out'):
    poly_feature_names = poly.get_feature_names_out(x_train_poly_cols.columns)
else:
    poly_feature_names = poly.get_feature_names(x_train_poly_cols.columns)
x_train_poly = pd.DataFrame(
    poly.transform(x_train_poly_cols),
    columns=poly_feature_names,
    index=x_train_poly_cols.index,
)

# Fit on the expanded training features, then predict the expanded validate features.
poly_rm = LinearRegression()
poly_rm.fit(x_train_poly, y_train)

x_validate_poly = poly.transform(x_val_poly_cols)
predictions['polynomial5'] = poly_rm.predict(x_validate_poly)
predictions.head()

Unnamed: 0,actual,simple_lm,lm_top2x,lm_top3x,lm_top4x,lm_allx,polynomial degree 2,polynomial_interaction_only,baseline,lm1,lm2,lm3,lm4,lm5,lm6,polynomial3,polynomial4,polynomial3_intsonly,polynomial4_intsonly,polynomial5
21619,324551.0,242309.058958,208459.0,178279.0,144399.0,147394.3,232244.4,241646.0,377071.081102,210322.1,236083.4,243805.9,187549.3,210374.6,190510.2,244054.9,239377.8,244471.0,238706.8,246926.4
9827,271013.0,237971.349836,228139.7,270605.2,328113.0,368966.5,427572.1,365739.9,377071.081102,285965.5,248740.5,291257.0,265694.3,274839.7,319482.7,263385.7,310571.8,251894.3,294635.9,254691.8
17521,117015.0,347859.980931,339986.2,350057.6,352437.2,359782.9,339090.4,353712.3,377071.081102,346276.6,350607.9,357315.5,339583.1,348383.3,353408.9,333321.1,335431.1,335920.3,343548.6,333944.4
26823,247786.0,356053.431495,507879.0,470324.6,441627.1,460159.1,349836.1,354983.6,377071.081102,356830.2,341525.0,380615.9,492990.2,526593.9,318340.9,395878.2,369266.4,397059.9,378824.6,443715.8
21839,2690733.0,996347.494698,1011887.0,1036285.0,1137089.0,1199510.0,1996239.0,1892350.0,377071.081102,1100328.0,1001794.0,1076989.0,1089493.0,1082015.0,1125464.0,1501944.0,1727687.0,1480431.0,1677504.0,1316588.0


In [72]:
# Interaction-only degree-2 model on two features: square feet and year built.
x_train_poly_cols = x_train_scaled_df[['calculatedfinishedsquarefeet', 'yearbuilt']]
x_val_poly_cols = x_validate_scaled_df[['calculatedfinishedsquarefeet', 'yearbuilt']]
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
poly.fit(x_train_poly_cols)
# get_feature_names was deprecated in scikit-learn 1.0 and removed in 1.2 in favour
# of get_feature_names_out; use whichever this installation provides.
if hasattr(poly, 'get_feature_names_out'):
    poly_feature_names = poly.get_feature_names_out(x_train_poly_cols.columns)
else:
    poly_feature_names = poly.get_feature_names(x_train_poly_cols.columns)
x_train_poly = pd.DataFrame(
    poly.transform(x_train_poly_cols),
    columns=poly_feature_names,
    index=x_train_poly_cols.index,
)

# Fit on the expanded training features, then predict the expanded validate features.
poly_rm = LinearRegression()
poly_rm.fit(x_train_poly, y_train)

x_validate_poly = poly.transform(x_val_poly_cols)
predictions['polynomial5_intsonly'] = poly_rm.predict(x_validate_poly)
predictions.head()

Unnamed: 0,actual,simple_lm,lm_top2x,lm_top3x,lm_top4x,lm_allx,polynomial degree 2,polynomial_interaction_only,baseline,lm1,...,lm3,lm4,lm5,lm6,polynomial3,polynomial4,polynomial3_intsonly,polynomial4_intsonly,polynomial5,polynomial5_intsonly
21619,324551.0,242309.058958,208459.0,178279.0,144399.0,147394.3,232244.4,241646.0,377071.081102,210322.1,...,243805.9,187549.3,210374.6,190510.2,244054.9,239377.8,244471.0,238706.8,246926.4,213119.677583
9827,271013.0,237971.349836,228139.7,270605.2,328113.0,368966.5,427572.1,365739.9,377071.081102,285965.5,...,291257.0,265694.3,274839.7,319482.7,263385.7,310571.8,251894.3,294635.9,254691.8,225031.915294
17521,117015.0,347859.980931,339986.2,350057.6,352437.2,359782.9,339090.4,353712.3,377071.081102,346276.6,...,357315.5,339583.1,348383.3,353408.9,333321.1,335431.1,335920.3,343548.6,333944.4,346418.048322
26823,247786.0,356053.431495,507879.0,470324.6,441627.1,460159.1,349836.1,354983.6,377071.081102,356830.2,...,380615.9,492990.2,526593.9,318340.9,395878.2,369266.4,397059.9,378824.6,443715.8,496376.564926
21839,2690733.0,996347.494698,1011887.0,1036285.0,1137089.0,1199510.0,1996239.0,1892350.0,377071.081102,1100328.0,...,1076989.0,1089493.0,1082015.0,1125464.0,1501944.0,1727687.0,1480431.0,1677504.0,1316588.0,902907.774727


In [75]:
# Full degree-2 expansion of a single feature: square footage and its square.
x_train_poly_cols = x_train_scaled_df[['calculatedfinishedsquarefeet']]
x_val_poly_cols = x_validate_scaled_df[['calculatedfinishedsquarefeet']]
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
poly.fit(x_train_poly_cols)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on releases that predate it.
if hasattr(poly, 'get_feature_names_out'):
    poly_feature_names = poly.get_feature_names_out(x_train_poly_cols.columns)
else:
    poly_feature_names = poly.get_feature_names(x_train_poly_cols.columns)
x_train_poly = pd.DataFrame(
    poly.transform(x_train_poly_cols),
    columns=poly_feature_names,
    index=x_train_poly_cols.index,
)

poly_rm = LinearRegression()
poly_rm.fit(x_train_poly, y_train)

# validate is expanded with the transformer fitted on train
x_validate_poly = poly.transform(x_val_poly_cols)
predictions['polynomial6'] = poly_rm.predict(x_validate_poly)
predictions.head()

Unnamed: 0,actual,simple_lm,lm_top2x,lm_top3x,lm_top4x,lm_allx,polynomial degree 2,polynomial_interaction_only,baseline,lm1,...,lm4,lm5,lm6,polynomial3,polynomial4,polynomial3_intsonly,polynomial4_intsonly,polynomial5,polynomial5_intsonly,polynomial6
21619,324551.0,242309.058958,208459.0,178279.0,144399.0,147394.3,232244.4,241646.0,377071.081102,210322.1,...,187549.3,210374.6,190510.2,244054.9,239377.8,244471.0,238706.8,246926.4,213119.677583,259569.7
9827,271013.0,237971.349836,228139.7,270605.2,328113.0,368966.5,427572.1,365739.9,377071.081102,285965.5,...,265694.3,274839.7,319482.7,263385.7,310571.8,251894.3,294635.9,254691.8,225031.915294,257025.2
17521,117015.0,347859.980931,339986.2,350057.6,352437.2,359782.9,339090.4,353712.3,377071.081102,346276.6,...,339583.1,348383.3,353408.9,333321.1,335431.1,335920.3,343548.6,333944.4,346418.048322,333910.3
26823,247786.0,356053.431495,507879.0,470324.6,441627.1,460159.1,349836.1,354983.6,377071.081102,356830.2,...,492990.2,526593.9,318340.9,395878.2,369266.4,397059.9,378824.6,443715.8,496376.564926,340679.5
21839,2690733.0,996347.494698,1011887.0,1036285.0,1137089.0,1199510.0,1996239.0,1892350.0,377071.081102,1100328.0,...,1089493.0,1082015.0,1125464.0,1501944.0,1727687.0,1480431.0,1677504.0,1316588.0,902907.774727,1314525.0


In [76]:
# NOTE: with a single input column there is no pair of features to interact,
# so interaction_only=True expands to just the original column. The model is
# then a plain one-feature linear regression; the head() output of this cell
# shows the same values as the simple_lm column.
x_train_poly_cols = x_train_scaled_df[['calculatedfinishedsquarefeet']]
x_val_poly_cols = x_validate_scaled_df[['calculatedfinishedsquarefeet']]
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
poly.fit(x_train_poly_cols)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on releases that predate it.
if hasattr(poly, 'get_feature_names_out'):
    poly_feature_names = poly.get_feature_names_out(x_train_poly_cols.columns)
else:
    poly_feature_names = poly.get_feature_names(x_train_poly_cols.columns)
x_train_poly = pd.DataFrame(
    poly.transform(x_train_poly_cols),
    columns=poly_feature_names,
    index=x_train_poly_cols.index,
)

poly_rm = LinearRegression()
poly_rm.fit(x_train_poly, y_train)

# validate is expanded with the transformer fitted on train
x_validate_poly = poly.transform(x_val_poly_cols)
predictions['polynomial6_intsonly'] = poly_rm.predict(x_validate_poly)
predictions.head()

Unnamed: 0,actual,simple_lm,lm_top2x,lm_top3x,lm_top4x,lm_allx,polynomial degree 2,polynomial_interaction_only,baseline,lm1,...,lm5,lm6,polynomial3,polynomial4,polynomial3_intsonly,polynomial4_intsonly,polynomial5,polynomial5_intsonly,polynomial6,polynomial6_intsonly
21619,324551.0,242309.058958,208459.0,178279.0,144399.0,147394.3,232244.4,241646.0,377071.081102,210322.1,...,210374.6,190510.2,244054.9,239377.8,244471.0,238706.8,246926.4,213119.677583,259569.7,242309.058958
9827,271013.0,237971.349836,228139.7,270605.2,328113.0,368966.5,427572.1,365739.9,377071.081102,285965.5,...,274839.7,319482.7,263385.7,310571.8,251894.3,294635.9,254691.8,225031.915294,257025.2,237971.349836
17521,117015.0,347859.980931,339986.2,350057.6,352437.2,359782.9,339090.4,353712.3,377071.081102,346276.6,...,348383.3,353408.9,333321.1,335431.1,335920.3,343548.6,333944.4,346418.048322,333910.3,347859.980931
26823,247786.0,356053.431495,507879.0,470324.6,441627.1,460159.1,349836.1,354983.6,377071.081102,356830.2,...,526593.9,318340.9,395878.2,369266.4,397059.9,378824.6,443715.8,496376.564926,340679.5,356053.431495
21839,2690733.0,996347.494698,1011887.0,1036285.0,1137089.0,1199510.0,1996239.0,1892350.0,377071.081102,1100328.0,...,1082015.0,1125464.0,1501944.0,1727687.0,1480431.0,1677504.0,1316588.0,902907.774727,1314525.0,996347.494698


In [46]:
# Baseline "model": every row is predicted as the mean target value of train.
predictions['baseline'] = train['taxvaluedollarcnt'].mean()
predictions.head(5)

Unnamed: 0,actual,simple_lm,lm_top2x,lm_top3x,lm_top4x,lm_allx,polynomial degree 2,polynomial_interaction_only,baseline
21619,324551.0,242309.058958,208459.0,178279.0,144399.0,147394.3,232244.4,241646.0,377071.081102
9827,271013.0,237971.349836,228139.7,270605.2,328113.0,368966.5,427572.1,365739.9,377071.081102
17521,117015.0,347859.980931,339986.2,350057.6,352437.2,359782.9,339090.4,353712.3,377071.081102
26823,247786.0,356053.431495,507879.0,470324.6,441627.1,460159.1,349836.1,354983.6,377071.081102
21839,2690733.0,996347.494698,1011887.0,1036285.0,1137089.0,1199510.0,1996239.0,1892350.0,377071.081102


In [77]:
def calculate_mse(y_predicted):
    '''
    Return the mean squared error of one column of predictions,
    measured against the `actual` column of the `predictions` frame.
    '''
    y_actual = predictions['actual']
    return mean_squared_error(y_actual, y_predicted)

# score every column; `actual` is compared with itself, hence its 0
predictions.apply(calculate_mse, axis=0)

actual                         0.000000e+00
simple_lm                      3.928276e+11
lm_top2x                       3.740913e+11
lm_top3x                       3.737984e+11
lm_top4x                       3.622688e+11
lm_allx                        1.584328e+12
polynomial degree 2            8.962272e+18
polynomial_interaction_only    3.982990e+12
baseline                       5.412900e+11
lm1                            3.777704e+11
lm2                            3.934137e+11
lm3                            2.303325e+12
lm4                            3.649350e+11
lm5                            1.836986e+12
lm6                            3.775003e+11
polynomial3                    3.278966e+11
polynomial4                    3.400347e+11
polynomial3_intsonly           3.243613e+11
polynomial4_intsonly           3.258536e+11
polynomial5                    3.378582e+11
polynomial5_intsonly           3.832959e+11
polynomial6                    3.359110e+11
polynomial6_intsonly           3

In [31]:
import math 

In [78]:
# Square root of each column's MSE gives RMSE, in the same units as `actual`.
(
    predictions
    .apply(calculate_mse)
    .map(math.sqrt)
)

actual                         0.000000e+00
simple_lm                      6.267596e+05
lm_top2x                       6.116301e+05
lm_top3x                       6.113905e+05
lm_top4x                       6.018877e+05
lm_allx                        1.258701e+06
polynomial degree 2            2.993705e+09
polynomial_interaction_only    1.995743e+06
baseline                       7.357241e+05
lm1                            6.146303e+05
lm2                            6.272270e+05
lm3                            1.517671e+06
lm4                            6.040985e+05
lm5                            1.355354e+06
lm6                            6.144106e+05
polynomial3                    5.726226e+05
polynomial4                    5.831249e+05
polynomial3_intsonly           5.695273e+05
polynomial4_intsonly           5.708359e+05
polynomial5                    5.812557e+05
polynomial5_intsonly           6.191090e+05
polynomial6                    5.795783e+05
polynomial6_intsonly           6

---

**DF2** — the modeling steps repeated on the `df2` subset (every training row has `fips` 6059)

In [17]:
# Hold out 20% of df2 as test, then split the remaining 80% into train (70%) and validate (30%).
train_val, test = train_test_split(df2, random_state=123, train_size=0.8)
train, validate = train_test_split(train_val, random_state=123, train_size=0.7)
tuple(part.shape for part in (train, validate, test))

((7858, 7), (3369, 7), (2807, 7))

In [18]:
def remove_outliers(df, k, col_list):
    '''
    Return the rows of `df` that lie strictly inside the IQR fences
    (q1 - k*iqr, q3 + k*iqr) for every column named in `col_list`.

    Columns are filtered one after another, so each column's quartiles are
    computed on the rows that survived the previous columns. Rows whose value
    is missing, or sits exactly on a fence, are dropped. The frame passed in
    is not modified; a filtered frame is returned.
    '''
    for column in col_list:
        # quartiles of whatever rows are still present
        lower_q, upper_q = df[column].quantile([.25, .75])
        fence = k * (upper_q - lower_q)
        # strict comparisons: a value equal to a fence counts as an outlier
        inside = df[column].gt(lower_q - fence) & df[column].lt(upper_q + fence)
        df = df.loc[inside]
    return df

In [19]:
# Columns to screen for outliers; only train is filtered (k = 1.5 IQR fences).
out_columns = [
    'bedroomcnt',
    'bathroomcnt',
    'calculatedfinishedsquarefeet',
    'lotsizesquarefeet',
]
train = remove_outliers(df=train, k=1.5, col_list=out_columns)

In [20]:
# Row/column count of train after the outlier filter above.
train.shape

(6783, 7)

In [21]:
# Count training rows per fips value.
train['fips'].value_counts()

6059.0    6783
Name: fips, dtype: int64

In [22]:
# Model inputs (five numeric property columns) and the target, both from train.
x_train = train[[
    'bedroomcnt',
    'bathroomcnt',
    'calculatedfinishedsquarefeet',
    'yearbuilt',
    'lotsizesquarefeet',
]]
# double brackets keep the target as a one-column DataFrame rather than a Series
y_train = train[['taxvaluedollarcnt']]
x_train.head(5)

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,yearbuilt,lotsizesquarefeet
34201,4.0,2.5,2269.0,1987.0,3575.0
12805,4.0,2.0,1522.0,1957.0,6865.0
36501,4.0,3.5,2902.0,2004.0,5045.0
40593,5.0,3.0,2396.0,2003.0,4239.0
8354,4.0,2.5,2003.0,1998.0,3842.0


In [23]:
# build the min-max scaler and fit it on train only; later cells reuse it for validate
scaler = MinMaxScaler().fit(x_train)
# scaled train values, re-wrapped as a DataFrame (same columns and index) for feature selection
x_train_scaled = scaler.transform(x_train)
x_train_scaled_df = pd.DataFrame(
    data=x_train_scaled,
    index=x_train.index,
    columns=x_train.columns,
)
x_train_scaled_df.head(5)

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,yearbuilt,lotsizesquarefeet
34201,0.666667,0.5,0.516081,0.770492,0.260292
12805,0.666667,0.333333,0.295662,0.52459,0.542453
36501,0.666667,0.833333,0.702862,0.909836,0.386364
40593,1.0,0.666667,0.553556,0.901639,0.317238
8354,0.666667,0.5,0.437592,0.860656,0.28319


In [24]:
# Same five input columns and target as train, taken from validate.
x_validate = validate[[
    'bedroomcnt',
    'bathroomcnt',
    'calculatedfinishedsquarefeet',
    'yearbuilt',
    'lotsizesquarefeet',
]]
y_validate = validate[['taxvaluedollarcnt']]
x_validate.head(5)

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,yearbuilt,lotsizesquarefeet
26610,4.0,2.0,2286.0,1966.0,5200.0
43427,4.0,2.0,1748.0,1959.0,6120.0
13685,3.0,2.0,1592.0,1960.0,6600.0
27127,3.0,2.5,1801.0,1986.0,4950.0
44264,5.0,4.0,3831.0,1999.0,13005.0


In [25]:
# Reuse the train-fitted scaler; validate values beyond train's min/max land outside [0, 1].
x_validate_scaled = scaler.transform(x_validate)
x_validate_scaled_df = pd.DataFrame(
    data=x_validate_scaled,
    index=x_validate.index,
    columns=x_train.columns,
)
x_validate_scaled_df.head(5)

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,yearbuilt,lotsizesquarefeet
26610,0.666667,0.333333,0.521098,0.598361,0.399657
43427,0.666667,0.333333,0.362349,0.540984,0.478559
13685,0.333333,0.333333,0.316317,0.54918,0.519726
27127,0.333333,0.5,0.377988,0.762295,0.378216
44264,1.0,1.0,0.976984,0.868852,1.069039


In [26]:
# Start the results frame with validate's actual target values; model columns are added later.
predictions2 = pd.DataFrame({'actual': validate['taxvaluedollarcnt']})
predictions2.head(5)

Unnamed: 0,actual
26610,101539.0
43427,497056.0
13685,715000.0
27127,311209.0
44264,1750000.0


In [28]:
# Interaction-only, degree-2 model on three features: the expanded design
# matrix holds the three columns plus their pairwise products.
x_train_poly_cols = x_train_scaled_df[['calculatedfinishedsquarefeet', 'yearbuilt', 'bathroomcnt']]
x_val_poly_cols = x_validate_scaled_df[['calculatedfinishedsquarefeet', 'yearbuilt', 'bathroomcnt']]
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
poly.fit(x_train_poly_cols)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on releases that predate it.
if hasattr(poly, 'get_feature_names_out'):
    poly_feature_names = poly.get_feature_names_out(x_train_poly_cols.columns)
else:
    poly_feature_names = poly.get_feature_names(x_train_poly_cols.columns)
x_train_poly = pd.DataFrame(
    poly.transform(x_train_poly_cols),
    columns=poly_feature_names,
    index=x_train_poly_cols.index,
)

poly_rm = LinearRegression()
poly_rm.fit(x_train_poly, y_train)

# validate is expanded with the transformer fitted on train
x_validate_poly = poly.transform(x_val_poly_cols)
predictions2['polynomial3_intsonly'] = poly_rm.predict(x_validate_poly)
predictions2.head()

Unnamed: 0,actual,polynomial3_intsonly
26610,101539.0,491994.2
43427,497056.0,389856.3
13685,715000.0,368790.4
27127,311209.0,494206.8
44264,1750000.0,1314056.0


In [34]:
# Baseline "model": every validate row is predicted as the mean target value of train.
predictions2['baseline'] = train['taxvaluedollarcnt'].mean()
predictions2.head(5)

Unnamed: 0,actual,polynomial3_intsonly,baseline
26610,101539.0,491994.2,501488.803332
43427,497056.0,389856.3,501488.803332
13685,715000.0,368790.4,501488.803332
27127,311209.0,494206.8,501488.803332
44264,1750000.0,1314056.0,501488.803332


In [35]:
def calculate_mse(y_predicted):
    '''
    Return the mean squared error of one column of predictions,
    measured against the `actual` column of the `predictions2` frame.
    '''
    y_actual = predictions2['actual']
    return mean_squared_error(y_actual, y_predicted)

# score every column; `actual` is compared with itself, hence its 0
predictions2.apply(calculate_mse, axis=0)

actual                  0.000000e+00
polynomial3_intsonly    2.008305e+11
baseline                4.033132e+11
dtype: float64

In [36]:
# Square root of each column's MSE gives RMSE, in the same units as `actual`.
(
    predictions2
    .apply(calculate_mse)
    .map(math.sqrt)
)

actual                       0.000000
polynomial3_intsonly    448141.166948
baseline                635069.479185
dtype: float64

---

**DF3** — the modeling steps repeated on the `df3` subset (every training row has `fips` 6111)

In [17]:
# Hold out 20% of df3 as test, then split the remaining 80% into train (70%) and validate (30%).
train_val, test = train_test_split(df3, random_state=123, train_size=0.8)
train, validate = train_test_split(train_val, random_state=123, train_size=0.7)
tuple(part.shape for part in (train, validate, test))

((2440, 7), (1046, 7), (872, 7))

In [18]:
def remove_outliers(df, k, col_list):
    '''
    Filter `df` down to rows that fall strictly between the IQR fences
    q1 - k*iqr and q3 + k*iqr for each column listed in `col_list`.

    The columns are processed in order and each one is evaluated on the rows
    left by the columns before it. A missing value, or a value equal to a
    fence, removes the row. Returns the filtered frame; the input frame
    itself is left untouched.
    '''
    for name in col_list:
        quartiles = df[name].quantile([.25, .75])
        q1, q3 = quartiles.iloc[0], quartiles.iloc[1]
        spread = q3 - q1  # interquartile range
        values = df[name]
        # both tests are strict, so NaN and on-the-fence values fail them
        above_floor = values > q1 - k * spread
        below_ceiling = values < q3 + k * spread
        df = df[above_floor & below_ceiling]
    return df

In [19]:
# Columns to screen for outliers; only train is filtered (k = 1.5 IQR fences).
out_columns = [
    'bedroomcnt',
    'bathroomcnt',
    'calculatedfinishedsquarefeet',
    'lotsizesquarefeet',
]
train = remove_outliers(df=train, k=1.5, col_list=out_columns)

In [20]:
# Row/column count of train after the outlier filter above.
train.shape

(2013, 7)

In [21]:
# Count training rows per fips value.
train['fips'].value_counts()

6111.0    2013
Name: fips, dtype: int64

In [22]:
# Model inputs (five numeric property columns) and the target, both from train.
x_train = train[[
    'bedroomcnt',
    'bathroomcnt',
    'calculatedfinishedsquarefeet',
    'yearbuilt',
    'lotsizesquarefeet',
]]
# double brackets keep the target as a one-column DataFrame rather than a Series
y_train = train[['taxvaluedollarcnt']]
x_train.head(5)

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,yearbuilt,lotsizesquarefeet
43436,3.0,1.5,1376.0,1962.0,10455.0
2018,4.0,2.0,1701.0,1965.0,6700.0
27370,3.0,2.0,1160.0,1973.0,7095.0
46312,4.0,2.0,1956.0,1964.0,9100.0
13484,4.0,1.5,1145.0,1962.0,10260.0


In [23]:
# build the min-max scaler and fit it on train only; later cells reuse it for validate
scaler = MinMaxScaler().fit(x_train)
# scaled train values, re-wrapped as a DataFrame (same columns and index) for feature selection
x_train_scaled = scaler.transform(x_train)
x_train_scaled_df = pd.DataFrame(
    data=x_train_scaled,
    index=x_train.index,
    columns=x_train.columns,
)
x_train_scaled_df.head(5)

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,yearbuilt,lotsizesquarefeet
43436,0.333333,0.166667,0.274847,0.582677,0.651579
2018,0.666667,0.333333,0.37454,0.606299,0.381377
27370,0.333333,0.333333,0.208589,0.669291,0.409801
46312,0.666667,0.333333,0.452761,0.598425,0.554076
13484,0.666667,0.166667,0.203988,0.582677,0.637548


In [24]:
# Same five input columns and target as train, taken from validate.
x_validate = validate[[
    'bedroomcnt',
    'bathroomcnt',
    'calculatedfinishedsquarefeet',
    'yearbuilt',
    'lotsizesquarefeet',
]]
y_validate = validate[['taxvaluedollarcnt']]
x_validate.head(5)

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,yearbuilt,lotsizesquarefeet
13404,5.0,4.5,3622.0,2006.0,10908.0
2305,4.0,4.5,4007.0,2000.0,25922.0
11329,6.0,7.0,7770.0,2004.0,46980.0
9365,3.0,3.0,1894.0,1959.0,11314.0
7841,4.0,2.0,1961.0,1962.0,9000.0


In [25]:
# Reuse the train-fitted scaler; validate values beyond train's min/max land outside [0, 1].
x_validate_scaled = scaler.transform(x_validate)
x_validate_scaled_df = pd.DataFrame(
    data=x_validate_scaled,
    index=x_validate.index,
    columns=x_train.columns,
)
x_validate_scaled_df.head(5)

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,yearbuilt,lotsizesquarefeet
13404,1.0,1.166667,0.963804,0.929134,0.684176
2305,0.666667,1.166667,1.081902,0.88189,1.764554
11329,1.333333,2.0,2.236196,0.913386,3.279845
9365,0.333333,0.666667,0.433742,0.559055,0.713391
7841,0.666667,0.333333,0.454294,0.582677,0.546881


In [26]:
# Start the results frame with validate's actual target values; model columns are added later.
predictions3 = pd.DataFrame({'actual': validate['taxvaluedollarcnt']})
predictions3.head(5)

Unnamed: 0,actual
13404,849000.0
2305,1298990.0
11329,1636139.0
9365,302961.0
7841,376973.0


In [27]:
# Interaction-only, degree-2 model on three features: the expanded design
# matrix holds the three columns plus their pairwise products.
x_train_poly_cols = x_train_scaled_df[['calculatedfinishedsquarefeet', 'yearbuilt', 'bathroomcnt']]
x_val_poly_cols = x_validate_scaled_df[['calculatedfinishedsquarefeet', 'yearbuilt', 'bathroomcnt']]
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
poly.fit(x_train_poly_cols)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on releases that predate it.
if hasattr(poly, 'get_feature_names_out'):
    poly_feature_names = poly.get_feature_names_out(x_train_poly_cols.columns)
else:
    poly_feature_names = poly.get_feature_names(x_train_poly_cols.columns)
x_train_poly = pd.DataFrame(
    poly.transform(x_train_poly_cols),
    columns=poly_feature_names,
    index=x_train_poly_cols.index,
)

poly_rm = LinearRegression()
poly_rm.fit(x_train_poly, y_train)

# validate is expanded with the transformer fitted on train
x_validate_poly = poly.transform(x_val_poly_cols)
predictions3['polynomial3_intsonly'] = poly_rm.predict(x_validate_poly)
predictions3.head()

Unnamed: 0,actual,polynomial3_intsonly
13404,849000.0,1021642.0
2305,1298990.0,1112475.0
11329,1636139.0,2868169.0
9365,302961.0,388215.9
7841,376973.0,375864.2


In [28]:
# Baseline "model": every validate row is predicted as the mean target value of train.
predictions3['baseline'] = train['taxvaluedollarcnt'].mean()
predictions3.head(5)

Unnamed: 0,actual,polynomial3_intsonly,baseline
13404,849000.0,1021642.0,422171.300546
2305,1298990.0,1112475.0,422171.300546
11329,1636139.0,2868169.0,422171.300546
9365,302961.0,388215.9,422171.300546
7841,376973.0,375864.2,422171.300546


In [29]:
def calculate_mse(y_predicted):
    '''
    Return the mean squared error of one column of predictions,
    measured against the `actual` column of the `predictions3` frame.
    '''
    y_actual = predictions3['actual']
    return mean_squared_error(y_actual, y_predicted)

# score every column; `actual` is compared with itself, hence its 0
predictions3.apply(calculate_mse, axis=0)

actual                  0.000000e+00
polynomial3_intsonly    1.956480e+11
baseline                3.904149e+11
dtype: float64

In [30]:
# Square root of each column's MSE gives RMSE, in the same units as `actual`.
(
    predictions3
    .apply(calculate_mse)
    .map(math.sqrt)
)

actual                       0.000000
polynomial3_intsonly    442321.150865
baseline                624831.936752
dtype: float64