In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import f_regression, SelectKBest, RFE
import os
from sklearn.impute import SimpleImputer
from env import get_db_url
import warnings
warnings.filterwarnings('ignore')

#from acquire import acquire_zillow

In [2]:
def acquire_zillow():
    '''
    This function checks for a copy of the dataset in the local directory 
    and pulls a new copy and saves it if there is not one,
    it then cleans the data by removing significant outliers then
    removing the rows with null values for 'yearbuilt'
    '''
    #assign the file name
    filename = 'zillow2.csv'
    #check if the file exists in the current directory and read it if it is
    if os.path.exists(filename):
        print('Reading from csv file...')
        #read the local .csv into the notebook
        df = pd.read_csv(filename)
        return df
    #assign the sql query to a variable for use in pulling a new copy of the dataset from the database
    query = '''
    SELECT pro.bedroomcnt, pro.bathroomcnt, pro.calculatedfinishedsquarefeet, 
    pro.taxvaluedollarcnt, pro.yearbuilt, pro.fips, pro.lotsizesquarefeet, pro.numberofstories,
    pro.poolcnt, pro.garagecarcnt
    FROM properties_2017 AS pro
    JOIN predictions_2017 AS pre USING(parcelid)
    WHERE pro.propertylandusetypeid = 261;
    '''
    #if needed pull a fresh copy of the dataset from the database
    print('Getting a fresh copy from SQL database...')
    df = pd.read_sql(query, get_db_url('zillow'))
    #save a copy of the dataset to the local directory as a .csv file
    df.to_csv(filename, index=False)
    return df

In [3]:
df = acquire_zillow()
df.head()

Reading from csv file...


Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,fips,lotsizesquarefeet,numberofstories,poolcnt,garagecarcnt
0,4.0,3.5,3100.0,1023282.0,1998.0,6059.0,4506.0,,,2.0
1,2.0,1.0,1465.0,464000.0,1967.0,6111.0,12647.0,1.0,,1.0
2,3.0,2.0,1243.0,564778.0,1962.0,6059.0,8432.0,1.0,1.0,2.0
3,4.0,3.0,2376.0,145143.0,1970.0,6037.0,13038.0,,1.0,
4,4.0,3.0,2962.0,773303.0,1950.0,6037.0,63000.0,,1.0,


---

**Prep data**

In [4]:
imputer = SimpleImputer(strategy='median')

imputer.fit(df[['garagecarcnt']])
df[['garagecarcnt']] = imputer.transform(df[['garagecarcnt']])

imputer.fit(df[['numberofstories']])
df[['numberofstories']] = imputer.transform(df[['numberofstories']])

In [5]:
df['poolcnt'] = df.poolcnt.fillna(0)

In [6]:
df = df.dropna()

In [7]:
df.shape

(51960, 10)

In [8]:
df.isnull().sum()

bedroomcnt                      0
bathroomcnt                     0
calculatedfinishedsquarefeet    0
taxvaluedollarcnt               0
yearbuilt                       0
fips                            0
lotsizesquarefeet               0
numberofstories                 0
poolcnt                         0
garagecarcnt                    0
dtype: int64

----

**Split, remove outliers, and Scale**

In [9]:
train_val, test = train_test_split(df, train_size = 0.8, random_state=123)
train, validate = train_test_split(train_val, train_size = 0.7, random_state=123)
train.shape, validate.shape, test.shape

((29097, 10), (12471, 10), (10392, 10))

In [10]:
def remove_outliers(df, k, col_list):
    ''' this function will remove outliers from a list of columns in a dataframe 
        and return that dataframe. A list of columns with significant outliers is 
        assigned to a variable in the below wrangle function and can be modified if needed
    '''
    #loop throught the columns in the list
    for col in col_list:
        q1, q3 = df[col].quantile([.25, .75])  # get quartiles
        iqr = q3 - q1   # calculate interquartile range
        upper_bound = q3 + k * iqr   # get upper bound
        lower_bound = q1 - k * iqr   # get lower bound
        # return dataframe without outliers
        df = df[(df[col] > lower_bound) & (df[col] < upper_bound)] 
    return df

In [11]:
out_columns = ['bedroomcnt', 'bathroomcnt','calculatedfinishedsquarefeet', 'lotsizesquarefeet']
train = remove_outliers(train, 1.5, out_columns)

In [12]:
train.shape

(24005, 10)

In [13]:
x_train = train[['bedroomcnt', 'bathroomcnt', 'calculatedfinishedsquarefeet', 'yearbuilt', 'lotsizesquarefeet', 
                 'numberofstories', 'poolcnt', 'garagecarcnt']]
y_train = train[['taxvaluedollarcnt']]
x_train.head()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,yearbuilt,lotsizesquarefeet,numberofstories,poolcnt,garagecarcnt
31245,3.0,3.0,2493.0,1976.0,7800.0,3.0,1.0,2.0
34109,4.0,3.0,2460.0,1981.0,7700.0,2.0,0.0,2.0
50986,2.0,1.0,940.0,1950.0,5255.0,1.0,0.0,2.0
27085,3.0,1.0,988.0,1953.0,6341.0,1.0,0.0,2.0
39528,3.0,2.0,1954.0,1970.0,11586.0,1.0,1.0,2.0


In [14]:
#create a min-max scaler object
scaler = MinMaxScaler()
#fit the scaler with the train data
scaler.fit(x_train)
#output the results of the scaler into a new df to use with feature selection
x_train_scaled = scaler.transform(x_train)
x_train_scaled_df = pd.DataFrame(x_train_scaled, columns=x_train.columns, index=x_train.index)
x_train_scaled_df.head()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,yearbuilt,lotsizesquarefeet,numberofstories,poolcnt,garagecarcnt
31245,0.333333,0.666667,0.663043,0.706767,0.582664,0.4,1.0,0.333333
34109,0.666667,0.666667,0.652174,0.744361,0.573634,0.2,0.0,0.333333
50986,0.0,0.0,0.151515,0.511278,0.352867,0.0,0.0,0.333333
27085,0.333333,0.0,0.167325,0.533835,0.450926,0.0,0.0,0.333333
39528,0.333333,0.333333,0.485507,0.661654,0.924515,0.0,1.0,0.333333


In [15]:
x_validate = validate[['bedroomcnt', 'bathroomcnt', 'calculatedfinishedsquarefeet', 'yearbuilt','lotsizesquarefeet', 
                       'numberofstories', 'poolcnt', 'garagecarcnt']]
y_validate = validate[['taxvaluedollarcnt']]
x_validate.head()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,yearbuilt,lotsizesquarefeet,numberofstories,poolcnt,garagecarcnt
14676,4.0,3.0,2994.0,2000.0,9218.0,2.0,1.0,2.0
48753,4.0,3.0,3308.0,1988.0,7351.0,1.0,1.0,2.0
25931,4.0,3.0,2114.0,1942.0,7027.0,1.0,0.0,2.0
31299,2.0,2.5,1300.0,1986.0,784.0,1.0,0.0,2.0
19476,4.0,3.0,3097.0,1980.0,5000.0,2.0,0.0,2.0


In [16]:
x_validate_scaled = scaler.transform(x_validate)
x_validate_scaled_df = pd.DataFrame(x_validate_scaled, columns=x_train.columns, index=x_validate.index)
x_validate_scaled_df.head()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,yearbuilt,lotsizesquarefeet,numberofstories,poolcnt,garagecarcnt
14676,0.666667,0.666667,0.828063,0.887218,0.7107,0.2,1.0,0.333333
48753,0.666667,0.666667,0.931489,0.796992,0.542122,0.0,1.0,0.333333
25931,0.666667,0.666667,0.538208,0.451128,0.512867,0.0,0.0,0.333333
31299,0.0,0.5,0.270092,0.781955,-0.050835,0.0,0.0,0.333333
19476,0.666667,0.666667,0.861989,0.736842,0.329842,0.2,0.0,0.333333


---

In [17]:
predictions = pd.DataFrame({'actual': validate.taxvaluedollarcnt})
predictions.head()

Unnamed: 0,actual
14676,837000.0
48753,1842789.0
25931,555715.0
31299,250791.0
19476,724289.0


---

Auto Feature Selection

In [18]:
x = x_train_scaled_df
y = y_train
kbest = SelectKBest(f_regression, k=2)
kbest.fit(x, y)
print('Top 2 features according to k-best:')
x.columns[kbest.get_support()]

Top 2 features according to k-best:


Index(['bathroomcnt', 'calculatedfinishedsquarefeet'], dtype='object')

In [19]:
x = x_train_scaled_df
y = y_train
kbest = SelectKBest(f_regression, k=1)
kbest.fit(x, y)
print('Top 1 features according to k-best:')
x.columns[kbest.get_support()]

Top 1 features according to k-best:


Index(['calculatedfinishedsquarefeet'], dtype='object')

In [20]:
model = LinearRegression()
rfe = RFE(model, n_features_to_select=2).fit(x, y)
print('Top 2 features according to RFE:')
x.columns[rfe.get_support()]

Top 2 features according to RFE:


Index(['bedroomcnt', 'calculatedfinishedsquarefeet'], dtype='object')

In [21]:
pd.DataFrame({'rfe_ranking': rfe.ranking_}, index=x.columns)

Unnamed: 0,rfe_ranking
bedroomcnt,1
bathroomcnt,3
calculatedfinishedsquarefeet,1
yearbuilt,2
lotsizesquarefeet,6
numberofstories,5
poolcnt,7
garagecarcnt,4


**Model**

In [22]:
x = x_train_scaled_df[['calculatedfinishedsquarefeet']]
z = x_validate_scaled_df[['calculatedfinishedsquarefeet']]
lm = LinearRegression()
lm.fit(x, y_train)
predictions['simple_lm'] = lm.predict(z)
predictions.head()

Unnamed: 0,actual,simple_lm
14676,837000.0,761965.913481
48753,1842789.0,844584.770184
25931,555715.0,530422.62081
31299,250791.0,316245.075089
19476,724289.0,789067.003419


In [23]:
mr = LinearRegression()
rfe_top_two = x_train_scaled_df[['calculatedfinishedsquarefeet', 'bedroomcnt']]
mr.fit(rfe_top_two, y_train)
val_rfe_top_two = x_validate_scaled_df[['calculatedfinishedsquarefeet', 'bedroomcnt']]
predictions['top_two_rfe'] = mr.predict(val_rfe_top_two)
predictions.head()

Unnamed: 0,actual,simple_lm,top_two_rfe
14676,837000.0,761965.913481,780771.807842
48753,1842789.0,844584.770184,883912.609388
25931,555715.0,530422.62081,491714.784399
31299,250791.0,316245.075089,391875.105126
19476,724289.0,789067.003419,814604.61854


In [25]:
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
poly.fit(x_train_scaled_df)
x_train_poly = pd.DataFrame(
    poly.transform(x_train_scaled_df),
    columns=poly.get_feature_names(x_train_scaled_df.columns),
    index=x_train_scaled_df.index,
)
x_train_poly.head()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,yearbuilt,lotsizesquarefeet,numberofstories,poolcnt,garagecarcnt,bedroomcnt^2,bedroomcnt bathroomcnt,...,lotsizesquarefeet^2,lotsizesquarefeet numberofstories,lotsizesquarefeet poolcnt,lotsizesquarefeet garagecarcnt,numberofstories^2,numberofstories poolcnt,numberofstories garagecarcnt,poolcnt^2,poolcnt garagecarcnt,garagecarcnt^2
31245,0.333333,0.666667,0.663043,0.706767,0.582664,0.4,1.0,0.333333,0.111111,0.222222,...,0.339497,0.233065,0.582664,0.194221,0.16,0.4,0.133333,1.0,0.333333,0.111111
34109,0.666667,0.666667,0.652174,0.744361,0.573634,0.2,0.0,0.333333,0.444444,0.444444,...,0.329056,0.114727,0.0,0.191211,0.04,0.0,0.066667,0.0,0.0,0.111111
50986,0.0,0.0,0.151515,0.511278,0.352867,0.0,0.0,0.333333,0.0,0.0,...,0.124515,0.0,0.0,0.117622,0.0,0.0,0.0,0.0,0.0,0.111111
27085,0.333333,0.0,0.167325,0.533835,0.450926,0.0,0.0,0.333333,0.111111,0.0,...,0.203334,0.0,0.0,0.150309,0.0,0.0,0.0,0.0,0.0,0.111111
39528,0.333333,0.333333,0.485507,0.661654,0.924515,0.0,1.0,0.333333,0.111111,0.111111,...,0.854727,0.0,0.924515,0.308172,0.0,0.0,0.0,1.0,0.333333,0.111111


In [26]:
poly_rm = LinearRegression()
poly_rm.fit(x_train_poly, y_train)

x_validate_poly = poly.transform(x_validate_scaled_df)
predictions['polynomial degree 2'] = poly_rm.predict(x_validate_poly)
predictions.head()

Unnamed: 0,actual,simple_lm,top_two_rfe,polynomial degree 2
14676,837000.0,761965.913481,780771.807842,647386.072889
48753,1842789.0,844584.770184,883912.609388,879557.752779
25931,555715.0,530422.62081,491714.784399,562777.021149
31299,250791.0,316245.075089,391875.105126,596488.673883
19476,724289.0,789067.003419,814604.61854,793444.483055


In [27]:
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
poly.fit(x_train_scaled_df)
x_train_poly = pd.DataFrame(
    poly.transform(x_train_scaled_df),
    columns=poly.get_feature_names(x_train_scaled_df.columns),
    index=x_train_scaled_df.index,
)
lm = LinearRegression()
lm.fit(x_train_poly, y_train)

x_validate_poly = poly.transform(x_validate_scaled_df)
predictions['polynomial only interaction'] = lm.predict(x_validate_poly)
predictions.head()

Unnamed: 0,actual,simple_lm,top_two_rfe,polynomial degree 2,polynomial only interaction
14676,837000.0,761965.913481,780771.807842,647386.072889,639552.074233
48753,1842789.0,844584.770184,883912.609388,879557.752779,881353.438781
25931,555715.0,530422.62081,491714.784399,562777.021149,580089.827677
31299,250791.0,316245.075089,391875.105126,596488.673883,511204.675258
19476,724289.0,789067.003419,814604.61854,793444.483055,789859.144639


In [28]:
predictions['baseline'] = train.taxvaluedollarcnt.mean()
predictions.head()

Unnamed: 0,actual,simple_lm,top_two_rfe,polynomial degree 2,polynomial only interaction,baseline
14676,837000.0,761965.913481,780771.807842,647386.072889,639552.074233,416615.043491
48753,1842789.0,844584.770184,883912.609388,879557.752779,881353.438781,416615.043491
25931,555715.0,530422.62081,491714.784399,562777.021149,580089.827677,416615.043491
31299,250791.0,316245.075089,391875.105126,596488.673883,511204.675258,416615.043491
19476,724289.0,789067.003419,814604.61854,793444.483055,789859.144639,416615.043491


In [33]:
mr = LinearRegression()
x = x_train_scaled_df[['bedroomcnt', 'bathroomcnt', 'calculatedfinishedsquarefeet', 'yearbuilt','lotsizesquarefeet', 
                       'numberofstories', 'poolcnt', 'garagecarcnt']]
mr.fit(x, y_train)
z = x_validate_scaled_df[['bedroomcnt', 'bathroomcnt', 'calculatedfinishedsquarefeet', 'yearbuilt','lotsizesquarefeet', 
                       'numberofstories', 'poolcnt', 'garagecarcnt']]
predictions['all_x_mr'] = mr.predict(z)
predictions.head()

Unnamed: 0,actual,simple_lm,top_two_rfe,polynomial degree 2,polynomial only interaction,baseline,all_x_mr
14676,837000.0,761965.913481,780771.807842,647386.072889,639552.074233,416615.043491,687453.380066
48753,1842789.0,844584.770184,883912.609388,879557.752779,881353.438781,416615.043491,900415.885328
25931,555715.0,530422.62081,491714.784399,562777.021149,580089.827677,416615.043491,585779.071238
31299,250791.0,316245.075089,391875.105126,596488.673883,511204.675258,416615.043491,480692.075187
19476,724289.0,789067.003419,814604.61854,793444.483055,789859.144639,416615.043491,787660.933544


In [37]:
mr = LinearRegression()
x = x_train_scaled_df[['bedroomcnt', 'bathroomcnt', 'calculatedfinishedsquarefeet', 'yearbuilt','lotsizesquarefeet']]
mr.fit(x, y_train)
z = x_validate_scaled_df[['bedroomcnt', 'bathroomcnt', 'calculatedfinishedsquarefeet', 'yearbuilt','lotsizesquarefeet']]
predictions['fivex_mr'] = mr.predict(z)
predictions.head()

Unnamed: 0,actual,simple_lm,top_two_rfe,polynomial degree 2,polynomial only interaction,baseline,all_x_mr,fivex_mr
14676,837000.0,761965.913481,780771.807842,647386.072889,639552.074233,416615.043491,687453.380066,700014.977604
48753,1842789.0,844584.770184,883912.609388,879557.752779,881353.438781,416615.043491,900415.885328,847577.169799
25931,555715.0,530422.62081,491714.784399,562777.021149,580089.827677,416615.043491,585779.071238,590236.273567
31299,250791.0,316245.075089,391875.105126,596488.673883,511204.675258,416615.043491,480692.075187,466101.848782
19476,724289.0,789067.003419,814604.61854,793444.483055,789859.144639,416615.043491,787660.933544,834234.857315


In [41]:
mr = LinearRegression()
x = x_train_scaled_df[['bedroomcnt', 'bathroomcnt', 'calculatedfinishedsquarefeet', 'yearbuilt','garagecarcnt']]
mr.fit(x, y_train)
z = x_validate_scaled_df[['bedroomcnt', 'bathroomcnt', 'calculatedfinishedsquarefeet', 'yearbuilt', 'garagecarcnt']]
predictions['top_fivex_mr'] = mr.predict(z)
predictions.head()

Unnamed: 0,actual,simple_lm,top_two_rfe,polynomial degree 2,polynomial only interaction,baseline,all_x_mr,fivex_mr,top_fivex_mr
14676,837000.0,761965.913481,780771.807842,647386.072889,639552.074233,416615.043491,687453.380066,700014.977604,723951.447061
48753,1842789.0,844584.770184,883912.609388,879557.752779,881353.438781,416615.043491,900415.885328,847577.169799,837894.55579
25931,555715.0,530422.62081,491714.784399,562777.021149,580089.827677,416615.043491,585779.071238,590236.273567,582587.160128
31299,250791.0,316245.075089,391875.105126,596488.673883,511204.675258,416615.043491,480692.075187,466101.848782,400838.362155
19476,724289.0,789067.003419,814604.61854,793444.483055,789859.144639,416615.043491,787660.933544,834234.857315,792526.915638


In [46]:
x_cols_p = x_train_scaled_df[['bedroomcnt', 'bathroomcnt', 'calculatedfinishedsquarefeet', 'yearbuilt','garagecarcnt']]
x_val_p = x_validate_scaled_df[['bedroomcnt', 'bathroomcnt', 'calculatedfinishedsquarefeet', 'yearbuilt','garagecarcnt']]


poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
poly.fit(x_cols_p)


x_train_poly = pd.DataFrame(
    poly.transform(x_cols_p),
    columns=poly.get_feature_names(x_cols_p.columns),
    index=x_cols_p.index,
)
lm = LinearRegression()
lm.fit(x_train_poly, y_train)


x_validate_poly = poly.transform(x_val_p)
predictions['polynomial_only_interaction_top5x'] = lm.predict(x_validate_poly)
predictions.head()

Unnamed: 0,actual,simple_lm,top_two_rfe,polynomial degree 2,polynomial only interaction,baseline,all_x_mr,fivex_mr,top_fivex_mr,polynomial_only_interaction_top5x
14676,837000.0,761965.913481,780771.807842,647386.072889,639552.074233,416615.043491,687453.380066,700014.977604,723951.447061,719501.681949
48753,1842789.0,844584.770184,883912.609388,879557.752779,881353.438781,416615.043491,900415.885328,847577.169799,837894.55579,860814.051949
25931,555715.0,530422.62081,491714.784399,562777.021149,580089.827677,416615.043491,585779.071238,590236.273567,582587.160128,577813.091535
31299,250791.0,316245.075089,391875.105126,596488.673883,511204.675258,416615.043491,480692.075187,466101.848782,400838.362155,351509.838755
19476,724289.0,789067.003419,814604.61854,793444.483055,789859.144639,416615.043491,787660.933544,834234.857315,792526.915638,817064.351929


In [50]:
x_cols_p = x_train_scaled_df[['bedroomcnt', 'bathroomcnt', 'calculatedfinishedsquarefeet', 'yearbuilt','garagecarcnt']]
x_val_p = x_validate_scaled_df[['bedroomcnt', 'bathroomcnt', 'calculatedfinishedsquarefeet', 'yearbuilt','garagecarcnt']]


poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
poly.fit(x_cols_p)


x_train_poly = pd.DataFrame(
    poly.transform(x_cols_p),
    columns=poly.get_feature_names(x_cols_p.columns),
    index=x_cols_p.index,
)
lm = LinearRegression()
lm.fit(x_train_poly, y_train)


x_validate_poly = poly.transform(x_val_p)
predictions['polynomial_top5x'] = lm.predict(x_validate_poly)
predictions.head()

Unnamed: 0,actual,simple_lm,top_two_rfe,polynomial degree 2,polynomial only interaction,baseline,all_x_mr,fivex_mr,top_fivex_mr,polynomial_only_interaction_top5x,polynomial_top5x
14676,837000.0,761965.913481,780771.807842,647386.072889,639552.074233,416615.043491,687453.380066,700014.977604,723951.447061,719501.681949,724936.962309
48753,1842789.0,844584.770184,883912.609388,879557.752779,881353.438781,416615.043491,900415.885328,847577.169799,837894.55579,860814.051949,845862.038111
25931,555715.0,530422.62081,491714.784399,562777.021149,580089.827677,416615.043491,585779.071238,590236.273567,582587.160128,577813.091535,582598.12688
31299,250791.0,316245.075089,391875.105126,596488.673883,511204.675258,416615.043491,480692.075187,466101.848782,400838.362155,351509.838755,373619.963863
19476,724289.0,789067.003419,814604.61854,793444.483055,789859.144639,416615.043491,787660.933544,834234.857315,792526.915638,817064.351929,785608.482592


In [54]:
mr = LinearRegression()
rfe_top_two = x_train_scaled_df[['calculatedfinishedsquarefeet', 'bedroomcnt', 'yearbuilt']]
mr.fit(rfe_top_two, y_train)
val_rfe_top_two = x_validate_scaled_df[['calculatedfinishedsquarefeet', 'bedroomcnt', 'yearbuilt']]
predictions['top_three_rfe'] = mr.predict(val_rfe_top_two)
predictions.head()

Unnamed: 0,actual,simple_lm,top_two_rfe,polynomial degree 2,polynomial only interaction,baseline,all_x_mr,fivex_mr,top_fivex_mr,polynomial_only_interaction_top5x,polynomial_top5x,top_three_rfe
14676,837000.0,761965.913481,780771.807842,647386.072889,639552.074233,416615.043491,687453.380066,700014.977604,723951.447061,719501.681949,724936.962309,759733.45147
48753,1842789.0,844584.770184,883912.609388,879557.752779,881353.438781,416615.043491,900415.885328,847577.169799,837894.55579,860814.051949,845862.038111,885191.76024
25931,555715.0,530422.62081,491714.784399,562777.021149,580089.827677,416615.043491,585779.071238,590236.273567,582587.160128,577813.091535,582598.12688,529506.85138
31299,250791.0,316245.075089,391875.105126,596488.673883,511204.675258,416615.043491,480692.075187,466101.848782,400838.362155,351509.838755,373619.963863,346357.262241
19476,724289.0,789067.003419,814604.61854,793444.483055,789859.144639,416615.043491,787660.933544,834234.857315,792526.915638,817064.351929,785608.482592,822165.371395


In [58]:
mr = LinearRegression()
rfe_top_two = x_train_scaled_df[['calculatedfinishedsquarefeet', 'bedroomcnt', 'yearbuilt', 'bathroomcnt']]
mr.fit(rfe_top_two, y_train)
val_rfe_top_two = x_validate_scaled_df[['calculatedfinishedsquarefeet', 'bedroomcnt', 'yearbuilt', 'bathroomcnt']]
predictions['top_four_rfe'] = mr.predict(val_rfe_top_two)
predictions.head()

Unnamed: 0,actual,simple_lm,top_two_rfe,polynomial degree 2,polynomial only interaction,baseline,all_x_mr,fivex_mr,top_fivex_mr,polynomial_only_interaction_top5x,polynomial_top5x,top_three_rfe,top_four_rfe
14676,837000.0,761965.913481,780771.807842,647386.072889,639552.074233,416615.043491,687453.380066,700014.977604,723951.447061,719501.681949,724936.962309,759733.45147,723401.118915
48753,1842789.0,844584.770184,883912.609388,879557.752779,881353.438781,416615.043491,900415.885328,847577.169799,837894.55579,860814.051949,845862.038111,885191.76024,836584.73361
25931,555715.0,530422.62081,491714.784399,562777.021149,580089.827677,416615.043491,585779.071238,590236.273567,582587.160128,577813.091535,582598.12688,529506.85138,583493.158876
31299,250791.0,316245.075089,391875.105126,596488.673883,511204.675258,416615.043491,480692.075187,466101.848782,400838.362155,351509.838755,373619.963863,346357.262241,403014.027593
19476,724289.0,789067.003419,814604.61854,793444.483055,789859.144639,416615.043491,787660.933544,834234.857315,792526.915638,817064.351929,785608.482592,822165.371395,791609.612672


In [61]:
x_cols_p = x_train_scaled_df[['bedroomcnt', 'bathroomcnt', 'calculatedfinishedsquarefeet', 'yearbuilt']]
x_val_p = x_validate_scaled_df[['bedroomcnt', 'bathroomcnt', 'calculatedfinishedsquarefeet', 'yearbuilt']]


poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
poly.fit(x_cols_p)


x_train_poly = pd.DataFrame(
    poly.transform(x_cols_p),
    columns=poly.get_feature_names(x_cols_p.columns),
    index=x_cols_p.index,
)
lm = LinearRegression()
lm.fit(x_train_poly, y_train)


x_validate_poly = poly.transform(x_val_p)
predictions['polynomial_top4x'] = lm.predict(x_validate_poly)
predictions.head()

Unnamed: 0,actual,simple_lm,top_two_rfe,polynomial degree 2,polynomial only interaction,baseline,all_x_mr,fivex_mr,top_fivex_mr,polynomial_only_interaction_top5x,polynomial_top5x,top_three_rfe,top_four_rfe,polynomial_top4x
14676,837000.0,761965.913481,780771.807842,647386.072889,639552.074233,416615.043491,687453.380066,700014.977604,723951.447061,719501.681949,724936.962309,759733.45147,723401.118915,718826.838203
48753,1842789.0,844584.770184,883912.609388,879557.752779,881353.438781,416615.043491,900415.885328,847577.169799,837894.55579,860814.051949,845862.038111,885191.76024,836584.73361,858911.792566
25931,555715.0,530422.62081,491714.784399,562777.021149,580089.827677,416615.043491,585779.071238,590236.273567,582587.160128,577813.091535,582598.12688,529506.85138,583493.158876,578343.973852
31299,250791.0,316245.075089,391875.105126,596488.673883,511204.675258,416615.043491,480692.075187,466101.848782,400838.362155,351509.838755,373619.963863,346357.262241,403014.027593,357401.368436
19476,724289.0,789067.003419,814604.61854,793444.483055,789859.144639,416615.043491,787660.933544,834234.857315,792526.915638,817064.351929,785608.482592,822165.371395,791609.612672,815892.001467


In [64]:
x_cols_p = x_train_scaled_df[['bedroomcnt', 'bathroomcnt', 'calculatedfinishedsquarefeet', 'yearbuilt']]
x_val_p = x_validate_scaled_df[['bedroomcnt', 'bathroomcnt', 'calculatedfinishedsquarefeet', 'yearbuilt']]


poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
poly.fit(x_cols_p)


x_train_poly = pd.DataFrame(
    poly.transform(x_cols_p),
    columns=poly.get_feature_names(x_cols_p.columns),
    index=x_cols_p.index,
)
lm = LinearRegression()
lm.fit(x_train_poly, y_train)


x_validate_poly = poly.transform(x_val_p)
predictions['polynomial_top4x_intonly'] = lm.predict(x_validate_poly)
predictions.head()

Unnamed: 0,actual,simple_lm,top_two_rfe,polynomial degree 2,polynomial only interaction,baseline,all_x_mr,fivex_mr,top_fivex_mr,polynomial_only_interaction_top5x,polynomial_top5x,top_three_rfe,top_four_rfe,polynomial_top4x,polynomial_top4x_intonly
14676,837000.0,761965.913481,780771.807842,647386.072889,639552.074233,416615.043491,687453.380066,700014.977604,723951.447061,719501.681949,724936.962309,759733.45147,723401.118915,718826.838203,725537.807135
48753,1842789.0,844584.770184,883912.609388,879557.752779,881353.438781,416615.043491,900415.885328,847577.169799,837894.55579,860814.051949,845862.038111,885191.76024,836584.73361,858911.792566,845818.405968
25931,555715.0,530422.62081,491714.784399,562777.021149,580089.827677,416615.043491,585779.071238,590236.273567,582587.160128,577813.091535,582598.12688,529506.85138,583493.158876,578343.973852,583637.29795
31299,250791.0,316245.075089,391875.105126,596488.673883,511204.675258,416615.043491,480692.075187,466101.848782,400838.362155,351509.838755,373619.963863,346357.262241,403014.027593,357401.368436,379468.936483
19476,724289.0,789067.003419,814604.61854,793444.483055,789859.144639,416615.043491,787660.933544,834234.857315,792526.915638,817064.351929,785608.482592,822165.371395,791609.612672,815892.001467,785676.634665


In [67]:
x_cols_p = x_train_scaled_df[['bedroomcnt', 'bathroomcnt', 'calculatedfinishedsquarefeet', 'yearbuilt','garagecarcnt']]
x_val_p = x_validate_scaled_df[['bedroomcnt', 'bathroomcnt', 'calculatedfinishedsquarefeet', 'yearbuilt','garagecarcnt']]


poly = PolynomialFeatures(degree=3, include_bias=False, interaction_only=False)
poly.fit(x_cols_p)


x_train_poly = pd.DataFrame(
    poly.transform(x_cols_p),
    columns=poly.get_feature_names(x_cols_p.columns),
    index=x_cols_p.index,
)
lm = LinearRegression()
lm.fit(x_train_poly, y_train)


x_validate_poly = poly.transform(x_val_p)
predictions['polynomial_top5x_3d'] = lm.predict(x_validate_poly)
predictions.head()

Unnamed: 0,actual,simple_lm,top_two_rfe,polynomial degree 2,polynomial only interaction,baseline,all_x_mr,fivex_mr,top_fivex_mr,polynomial_only_interaction_top5x,polynomial_top5x,top_three_rfe,top_four_rfe,polynomial_top4x,polynomial_top4x_intonly,polynomial_top5x_3d
14676,837000.0,761965.913481,780771.807842,647386.072889,639552.074233,416615.043491,687453.380066,700014.977604,723951.447061,719501.681949,724936.962309,759733.45147,723401.118915,718826.838203,725537.807135,718329.946546
48753,1842789.0,844584.770184,883912.609388,879557.752779,881353.438781,416615.043491,900415.885328,847577.169799,837894.55579,860814.051949,845862.038111,885191.76024,836584.73361,858911.792566,845818.405968,790185.930161
25931,555715.0,530422.62081,491714.784399,562777.021149,580089.827677,416615.043491,585779.071238,590236.273567,582587.160128,577813.091535,582598.12688,529506.85138,583493.158876,578343.973852,583637.29795,598579.067897
31299,250791.0,316245.075089,391875.105126,596488.673883,511204.675258,416615.043491,480692.075187,466101.848782,400838.362155,351509.838755,373619.963863,346357.262241,403014.027593,357401.368436,379468.936483,373940.958133
19476,724289.0,789067.003419,814604.61854,793444.483055,789859.144639,416615.043491,787660.933544,834234.857315,792526.915638,817064.351929,785608.482592,822165.371395,791609.612672,815892.001467,785676.634665,723665.611111


In [70]:
x_cols_p = x_train_scaled_df[['bedroomcnt', 'bathroomcnt', 'calculatedfinishedsquarefeet', 'yearbuilt','garagecarcnt']]
x_val_p = x_validate_scaled_df[['bedroomcnt', 'bathroomcnt', 'calculatedfinishedsquarefeet', 'yearbuilt','garagecarcnt']]


poly = PolynomialFeatures(degree=5, include_bias=False, interaction_only=False)
poly.fit(x_cols_p)


x_train_poly = pd.DataFrame(
    poly.transform(x_cols_p),
    columns=poly.get_feature_names(x_cols_p.columns),
    index=x_cols_p.index,
)
lm = LinearRegression()
lm.fit(x_train_poly, y_train)


x_validate_poly = poly.transform(x_val_p)
predictions['polynomial_top5x_5d'] = lm.predict(x_validate_poly)
predictions.head()

Unnamed: 0,actual,simple_lm,top_two_rfe,polynomial degree 2,polynomial only interaction,baseline,all_x_mr,fivex_mr,top_fivex_mr,polynomial_only_interaction_top5x,polynomial_top5x,top_three_rfe,top_four_rfe,polynomial_top4x,polynomial_top4x_intonly,polynomial_top5x_3d,polynomial_top5x_5d
14676,837000.0,761965.913481,780771.807842,647386.072889,639552.074233,416615.043491,687453.380066,700014.977604,723951.447061,719501.681949,724936.962309,759733.45147,723401.118915,718826.838203,725537.807135,718329.946546,695803.043491
48753,1842789.0,844584.770184,883912.609388,879557.752779,881353.438781,416615.043491,900415.885328,847577.169799,837894.55579,860814.051949,845862.038111,885191.76024,836584.73361,858911.792566,845818.405968,790185.930161,749979.043491
25931,555715.0,530422.62081,491714.784399,562777.021149,580089.827677,416615.043491,585779.071238,590236.273567,582587.160128,577813.091535,582598.12688,529506.85138,583493.158876,578343.973852,583637.29795,598579.067897,662187.043491
31299,250791.0,316245.075089,391875.105126,596488.673883,511204.675258,416615.043491,480692.075187,466101.848782,400838.362155,351509.838755,373619.963863,346357.262241,403014.027593,357401.368436,379468.936483,373940.958133,398792.070759
19476,724289.0,789067.003419,814604.61854,793444.483055,789859.144639,416615.043491,787660.933544,834234.857315,792526.915638,817064.351929,785608.482592,822165.371395,791609.612672,815892.001467,785676.634665,723665.611111,674035.043491


In [73]:
x_cols_p = x_train_scaled_df[['bedroomcnt', 'bathroomcnt', 'calculatedfinishedsquarefeet', 'yearbuilt','garagecarcnt']]
x_val_p = x_validate_scaled_df[['bedroomcnt', 'bathroomcnt', 'calculatedfinishedsquarefeet', 'yearbuilt','garagecarcnt']]


poly = PolynomialFeatures(degree=1, include_bias=False, interaction_only=False)
poly.fit(x_cols_p)


x_train_poly = pd.DataFrame(
    poly.transform(x_cols_p),
    columns=poly.get_feature_names(x_cols_p.columns),
    index=x_cols_p.index,
)
lm = LinearRegression()
lm.fit(x_train_poly, y_train)


x_validate_poly = poly.transform(x_val_p)
predictions['polynomial_top5x_1d'] = lm.predict(x_validate_poly)
predictions.head()

Unnamed: 0,actual,simple_lm,top_two_rfe,polynomial degree 2,polynomial only interaction,baseline,all_x_mr,fivex_mr,top_fivex_mr,polynomial_only_interaction_top5x,polynomial_top5x,top_three_rfe,top_four_rfe,polynomial_top4x,polynomial_top4x_intonly,polynomial_top5x_3d,polynomial_top5x_5d,polynomial_top5x_1d
14676,837000.0,761965.913481,780771.807842,647386.072889,639552.074233,416615.043491,687453.380066,700014.977604,723951.447061,719501.681949,724936.962309,759733.45147,723401.118915,718826.838203,725537.807135,718329.946546,695803.043491,723951.447061
48753,1842789.0,844584.770184,883912.609388,879557.752779,881353.438781,416615.043491,900415.885328,847577.169799,837894.55579,860814.051949,845862.038111,885191.76024,836584.73361,858911.792566,845818.405968,790185.930161,749979.043491,837894.55579
25931,555715.0,530422.62081,491714.784399,562777.021149,580089.827677,416615.043491,585779.071238,590236.273567,582587.160128,577813.091535,582598.12688,529506.85138,583493.158876,578343.973852,583637.29795,598579.067897,662187.043491,582587.160128
31299,250791.0,316245.075089,391875.105126,596488.673883,511204.675258,416615.043491,480692.075187,466101.848782,400838.362155,351509.838755,373619.963863,346357.262241,403014.027593,357401.368436,379468.936483,373940.958133,398792.070759,400838.362155
19476,724289.0,789067.003419,814604.61854,793444.483055,789859.144639,416615.043491,787660.933544,834234.857315,792526.915638,817064.351929,785608.482592,822165.371395,791609.612672,815892.001467,785676.634665,723665.611111,674035.043491,792526.915638


In [74]:
def calculate_mse(y_predicted):
    return mean_squared_error(predictions.actual, y_predicted)

predictions.apply(calculate_mse)

actual                               0.000000e+00
simple_lm                            3.822202e+11
top_two_rfe                          3.654268e+11
polynomial degree 2                  4.450791e+18
polynomial only interaction          7.872580e+12
baseline                             5.741742e+11
all_x_mr                             2.172786e+12
fivex_mr                             1.565772e+12
top_fivex_mr                         3.572492e+11
polynomial_only_interaction_top5x    3.287030e+11
polynomial_top5x                     3.609560e+11
top_three_rfe                        3.582925e+11
top_four_rfe                         3.572085e+11
polynomial_top4x                     3.289338e+11
polynomial_top4x_intonly             3.620242e+11
polynomial_top5x_3d                  2.307099e+12
polynomial_top5x_5d                  4.747469e+34
polynomial_top5x_1d                  3.572492e+11
dtype: float64

In [56]:
import math 

In [75]:
predictions.apply(calculate_mse).apply(math.sqrt)

actual                               0.000000e+00
simple_lm                            6.182396e+05
top_two_rfe                          6.045054e+05
polynomial degree 2                  2.109690e+09
polynomial only interaction          2.805812e+06
baseline                             7.577428e+05
all_x_mr                             1.474037e+06
fivex_mr                             1.251308e+06
top_fivex_mr                         5.977033e+05
polynomial_only_interaction_top5x    5.733263e+05
polynomial_top5x                     6.007962e+05
top_three_rfe                        5.985754e+05
top_four_rfe                         5.976692e+05
polynomial_top4x                     5.735275e+05
polynomial_top4x_intonly             6.016845e+05
polynomial_top5x_3d                  1.518914e+06
polynomial_top5x_5d                  2.178869e+17
polynomial_top5x_1d                  5.977033e+05
dtype: float64

---

**rough work**

In [None]:
df.shape

In [None]:
df.info(null_counts=True)

In [None]:
df.isnull().sum()

In [None]:
df.numberofstories.mean()

In [None]:
df.numberofstories.median()

In [None]:
df.numberofstories.value_counts()

In [None]:
df.lotsizesquarefeet.min()

In [None]:
df.lotsizesquarefeet.max()

In [None]:
df.garagecarcnt.mean()

In [None]:
df.garagecarcnt.median()

In [None]:
df.garagecarcnt.value_counts()

In [None]:
imputer = SimpleImputer(strategy='median')

imputer.fit(df[['garagecarcnt']])

df[['garagecarcnt']] = imputer.transform(df[['garagecarcnt']])

In [None]:
imputer.fit(df[['numberofstories']])
df[['numberofstories']] = imputer.transform(df[['numberofstories']])

In [None]:
df['poolcnt'] = df.poolcnt.fillna(0)

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
df = df.dropna()

In [None]:
df.isnull().sum()

In [None]:
df.shape

In [None]:
def remove_outliers(df, k, col_list):
    ''' this function will remove outliers from a list of columns in a dataframe 
        and return that dataframe. A list of columns with significant outliers is 
        assigned to a variable in the below wrangle function and can be modified if needed
    '''
    #loop throught the columns in the list
    for col in col_list:
        q1, q3 = df[col].quantile([.25, .75])  # get quartiles
        iqr = q3 - q1   # calculate interquartile range
        upper_bound = q3 + k * iqr   # get upper bound
        lower_bound = q1 - k * iqr   # get lower bound
        # return dataframe without outliers
        df = df[(df[col] > lower_bound) & (df[col] < upper_bound)] 
    return df

In [None]:
df.numberofstories.value_counts()

In [None]:
df.poolcnt.value_counts()

In [None]:
df.garagecarcnt.value_counts()

In [None]:
out_columns = ['bedroomcnt', 'bathroomcnt','calculatedfinishedsquarefeet', 'lotsizesquarefeet']
df = remove_outliers(df, 1.5, out_columns)
df.head()

In [None]:
df.shape

In [None]:
df.garagecarcnt.value_counts()

In [None]:
df.garagecarcnt.value_counts()

In [None]:
df.shape

In [None]:
df.numberofstories.value_counts()

In [None]:
df.numberofstories.value_counts()

In [None]:
df.bedroomcnt.value_counts()

In [None]:
df.bedroomcnt.value_counts()

In [None]:
df.bathroomcnt.value_counts()

In [None]:
df.bathroomcnt.value_counts()