In [1]:
# data manipulation
import pandas as pd
import numpy as np

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# data separation/transformation
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE # Recursive Feature Elimination¶

# statistics testing
import scipy.stats as stats

# system manipulation
import itertools
import os
import sys
sys.path.append("./util_")
import prepare_
import explore_

# other
import env
import warnings
warnings.filterwarnings("ignore")

# set the random seed
np.random.seed(95)

**Get data**

In [2]:
# This data is already been split and save
# This is only training data
train_scaled = pd.read_csv("./00_project_data/1-1_training_data.csv", index_col=0)
train_scaled = train_scaled.reset_index(drop=True)
train_scaled.head()

Unnamed: 0,tax_value,county,los_angeles,orange,ventura,bedrooms_scaled,bathrooms_scaled,sqr_feet_scaled,year_built_scaled,tax_amount_scaled
0,160657.0,Los Angeles,1,0,0,0.428571,0.428571,0.320064,0.405172,0.162429
1,243277.0,Orange,0,1,0,0.428571,0.142857,0.195839,0.448276,0.144717
2,285312.0,Los Angeles,1,0,0,0.428571,0.428571,0.411882,0.491379,0.174333
3,268523.0,Ventura,0,0,1,0.428571,0.285714,0.261252,0.706897,0.147458
4,339115.0,Los Angeles,1,0,0,0.428571,0.285714,0.382877,0.655172,0.217292


## Feature Selection

Use recursive feature elimination to select the top 4 features for predicting tax value.

In [3]:
# separate features from target
xtrain = train_scaled.drop(columns=["tax_value", "county", "tax_amount_scaled"])
ytrain = train_scaled.tax_value

In [6]:
# make a model object to use in RFE process.
linear_model = LinearRegression()

# MAKE the RFE object
rfe = RFE(linear_model, n_features_to_select=1)

# FIT the RFE object to the training data
rfe.fit(xtrain, ytrain)

rfe.ranking_

array([1, 3, 2, 5, 6, 4, 7])

In [8]:
# get a dataframe of all the columns orderd by importance
xtrain_selected = xtrain.iloc[:, rfe.ranking_ - 1]
xtrain_selected.head(3)

# More important to least important

Unnamed: 0,los_angeles,ventura,orange,bathrooms_scaled,sqr_feet_scaled,bedrooms_scaled,year_built_scaled
0,1,0,0,0.428571,0.320064,0.428571,0.405172
1,0,0,1,0.142857,0.195839,0.428571,0.448276
2,1,0,0,0.428571,0.411882,0.428571,0.491379


In [10]:
xtrain_selected.columns

Index(['los_angeles', 'ventura', 'orange', 'bathrooms_scaled',
       'sqr_feet_scaled', 'bedrooms_scaled', 'year_built_scaled'],
      dtype='object')

## Stats testing

In [9]:
# This data is already been split and save
# This is only training data
train = pd.read_csv("./00_project_data/01_original_clean_no_dummies_train.csv", index_col=0)
train = train.reset_index(drop=True)
train.head()

Unnamed: 0,bedrooms,bathrooms,sqr_feet,tax_value,year_built,tax_amount,county
0,3,3.0,1601.0,160657.0,1947,3259.81,Los Angeles
1,3,1.0,980.0,243277.0,1952,2905.82,Orange
2,3,3.0,2060.0,285312.0,1957,3497.72,Los Angeles
3,3,2.0,1307.0,268523.0,1982,2960.6,Ventura
4,3,2.0,1915.0,339115.0,1976,4356.32,Los Angeles


**Questions**

- Is there a linear relationship between the square footage of the property and the tax value?
- Do the number of bedrooms or bathrooms impact the tax value?
- Is there any relationship between the year the property was built and the tax value?
- Does the county where the property is located affect the tax value?
- Are there any interactions or combinations of variables that affect the tax value?

## Check Independace

### 1. Is there a linear relationship between the square footage of the property and the tax value?

In [None]:
print("Null_Hyp: There is a linear relationship between the square footage of the property and the tax value?")
print("")
print("Alt_Hyp: There is no a linear relationship between the square footage of the property and the tax value?")


In [None]:
# plot the columns
sns.relplot(data=train, x="sqr_feet", y="tax_value")

