In [29]:
import pandas as pd
import numpy as np
import wrangle as w
import matplotlib.pyplot as plt
import seaborn as sns
import env

# modeling methods
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score
from sklearn.feature_selection import RFE, SelectKBest, f_regression
import warnings
warnings.filterwarnings("ignore")

from pydataset import data

from sklearn.preprocessing import RobustScaler

import matplotlib as mpl
mpl.rcParams["axes.formatter.useoffset"] = False

### 1. Select a dataset with a continuous target variable.



In [2]:
# Load the Zillow data through the project's `wrangle` helpers.
# NOTE(review): the frame returned by get_zillow_2017() is overwritten by the very
# next line; presumably wrangle_zillow() does its own acquisition — confirm whether
# the first call is still needed (e.g. for a caching side effect) or can be removed.
df = w.get_zillow_2017()
df = w.wrangle_zillow()

In [3]:
# Per-column summary from the project helper: unique values, null counts and dtype.
w.check_columns(df)

Unnamed: 0,Column Name,Number of Unique Values,Unique Values,Number of Null Values,Proportion of Null Values,dtype
6,county,3,"[LA, Orange, Ventura]",0,0.0,object
0,bedrooms,19,"[4, 3, 0, 5, 2, 1, 6, 7, 8, 9, 25, 10, 11, 14, 15, 13, 12, 16, 18]",0,0.0,int64
1,bathrooms,38,"[2.0, 4.0, 0.0, 1.0, 2.5, 3.5, 3.0, 5.5, 4.5, 5.0, 1.5, 7.5, 6.0, 10.0, 8.0, 7.0, 11.0...",0,0.0,float64
4,year_built,153,"[2005, 2011, 1926, 1972, 1973, 1960, 1950, 1951, 2016, 2013, 2014, 2015, 2012, 1944, 1...",0,0.0,int64
2,finished_area,10557,"[3633, 1620, 2077, 1200, 171, 203, 1244, 1300, 1222, 4144, 1821, 2684, 2284, 2342, 266...",0,0.0,int64
3,home_value,588178,"[296425, 847770, 646760, 5328, 6920, 14166, 169471, 233266, 290492, 1303522, 537949, 6...",0,0.0,int64
5,tax_amount,915868,"[6941.39, 10244.94, 7924.68, 91.6, 255.17, 163.79, 2532.88, 3110.99, 3870.25, 14820.1,...",0,0.0,float64


In [4]:
# One-hot encode the categorical column(s) as 0/1 integers.
# drop_first=True omits the first level, so LA rows show up as all zeros
# in county_Orange / county_Ventura (see the preview further down).
dummy_list = ['county']
dummy_df = pd.get_dummies(df[dummy_list], dtype=int, drop_first=True)

In [5]:

# Attach the dummy columns to the original frame side by side (column-wise concat on the shared index).
df_prepd = pd.concat([df, dummy_df], axis=1)

In [9]:
# Preview: the original `county` text column is still present next to its dummies at this point.
df_prepd.head()

Unnamed: 0,bedrooms,bathrooms,finished_area,home_value,year_built,tax_amount,county,county_Orange,county_Ventura
4,4,2.0,3633,296425,2005,6941.39,LA,0,0
6,3,4.0,1620,847770,2011,10244.94,LA,0,0
7,3,2.0,2077,646760,1926,7924.68,LA,0,0
11,0,0.0,1200,5328,1972,91.6,LA,0,0
14,0,0.0,171,6920,1973,255.17,LA,0,0


In [16]:
# Drop the original text column now that its dummy columns are in place.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df_prepd = df_prepd.drop(columns=["county"], errors="ignore")

In [17]:
# Split into train / validate / test with the project helper; it prints each
# split's share of the rows (output below).
train, validate, test = w.split_continuous(df_prepd)

Train: 0.5599997196569536
Validate: 0.24000028034304643
Test: 0.2


### 2. Be sure your data is prepared (no missing values, numeric datatypes) and split into samples.



In [18]:
# Row and column count of the training split.
train.shape

(1198531, 8)

In [19]:
# Peek at the first training rows to confirm every remaining column is numeric.
train.head()

Unnamed: 0,bedrooms,bathrooms,finished_area,home_value,year_built,tax_amount,county_Orange,county_Ventura
1937068,4,2.5,2154,910679,2001,11079.6,1,0
1201703,3,2.0,1224,66845,1952,1198.53,0,0
751562,4,3.0,2358,247161,1972,3060.88,1,0
235077,3,2.0,1513,70917,1965,1119.54,1,0
1991181,3,3.0,2004,711410,2005,7652.36,0,1


In [22]:
# Separate the predictors (X) from the target (y) in every split before scaling.
target = "home_value"

X_train, y_train = train.drop(columns=[target]), train[target]
X_validate, y_validate = validate.drop(columns=[target]), validate[target]
X_test, y_test = test.drop(columns=[target]), test[target]

In [23]:
# Unfitted scaler with default settings; it is fit on the training predictors in the next cell.
scaler = RobustScaler()

In [26]:
# Learn the scaling parameters from the training predictors only, so that
# validate and test do not influence them.
scaler.fit(X_train)

In [27]:
# Apply the scaler that was fit on train to every split.
# Scalers expose `transform` (not `predict`); the results are stored under new
# *_scaled names rather than being added as columns to the original frames.
X_train_scaled = scaler.transform(X_train)
X_validate_scaled = scaler.transform(X_validate)
X_test_scaled = scaler.transform(X_test)

In [38]:
# Wrap the scaled training values in a DataFrame, carrying over X_train's column
# names and row index. A bare pd.DataFrame(...) would relabel columns and rows as
# 0..n-1, which loses the feature names and breaks index alignment with y_train.
# X_validate_scaled / X_test_scaled are left exactly as the previous cell produced them.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)



In [42]:
def rfe(X, y, num_feat):
    '''
    Select features with recursive feature elimination (RFE) wrapped around
    a plain linear regression.

    X: a dataframe of numerical independent features
    y: a pandas Series holding the target variable
    num_feat: how many features RFE should keep

    Prints the chosen feature names.
    return: a list of the selected column names
    '''
    # build the selector around a linear model and fit it on the supplied data
    selector = RFE(LinearRegression(), n_features_to_select=num_feat)
    selector.fit(X, y)

    # support_ is a boolean keep/drop mask in column order
    rfe_feature = X.columns[selector.support_].tolist()

    print(f'The best {num_feat}  are {rfe_feature}')
    return rfe_feature

In [33]:
#now let's put together everything we learned above and make a function to do all of those steps together
def select_kbest(X, y, k=2):
    '''
    Keep the k features that score highest against the target under the
    f_regression scoring function.

    X: a dataframe representing numerical independent features
    y: a pandas Series representing a target variable
    k: number of features to keep (keyword argument, defaults to 2)

    return: the labels of the selected columns of X, as a pandas Index
    '''
    selector = SelectKBest(f_regression, k=k)
    selector.fit(X, y)
    # get_support() is a boolean mask over X's columns
    chosen = selector.get_support()
    return X.columns[chosen]

In [40]:
# Top 4 features by SelectKBest / f_regression on the (unscaled) training predictors.
select_kbest(X_train, y_train, 4)

Index(['bedrooms', 'bathrooms', 'finished_area', 'tax_amount'], dtype='object')

In [44]:
# Same question asked of RFE.
# NOTE(review): this runs on the unscaled X_train; RFE with a linear model ranks by
# coefficient size, which depends on each column's units — consider repeating on the
# scaled predictors before trusting the disagreement with SelectKBest.
rfe(X_train,y_train, 4)

The best 4  are ['bedrooms', 'bathrooms', 'county_Orange', 'county_Ventura']


['bedrooms', 'bathrooms', 'county_Orange', 'county_Ventura']

### 3. Work through all of the steps outlined in the lesson, from setting the baseline to selecting a model and evaluating the final model on your test data.

In [47]:
# Baseline prediction: the mean of the target over the training split.
baseline = y_train.mean()
baseline

459636.36542400654

In [45]:
def metrics_reg(y, yhat):
    """
    Score regression predictions against the observed values.

    y: the true target values
    yhat: the predicted values, same length as y

    return: (rmse, r2) - root mean squared error and R^2 score

    The square root is taken explicitly rather than passing squared=False to
    mean_squared_error: that keyword was deprecated in scikit-learn 1.4 and
    removed in 1.6. For a single target column, as used here, the value is
    the same as before.
    """
    rmse = mean_squared_error(y, yhat) ** 0.5
    r2 = r2_score(y, yhat)
    return rmse, r2

In [52]:
# Constant prediction vector: the baseline value repeated once per training row,
# so it can be scored with the same metric function as a model's predictions.
baseline_array = np.repeat(baseline, len(X_train_scaled))
baseline_array[:5]

array([459636.36542401, 459636.36542401, 459636.36542401, 459636.36542401,
       459636.36542401])

In [50]:
# Score the baseline on train. Predicting the training mean gives r2 = 0 (see output),
# so the RMSE here is the number later models need to beat.
rmse, r2 = metrics_reg(y_train, baseline_array)
rmse, r2

(676257.7795145244, 0.0)

In [51]:
# Results table, starting with a single row for the baseline.
metrics_df = pd.DataFrame(
    {
        'model': ['baseline'],
        'rmse': [rmse],
        'r2': [r2],
    }
)
metrics_df

Unnamed: 0,model,rmse,r2
0,baseline,676257.779515,0.0


In [53]:
#initial ML model
lr1 = LinearRegression()

#make it
# NOTE(review): this rebinds the name `rfe`, which an earlier cell defines as a helper
# function. Once this cell has run, calling rfe(X, y, n) fails until that cell is re-run.
rfe = RFE(lr1, n_features_to_select=1)

#fit it
rfe.fit(X_train, y_train)

#use it on train
# transform keeps only the column(s) RFE selected
X_train_rfe = rfe.transform(X_train)

#use it on validate
X_val_rfe = rfe.transform(X_validate)