In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import wrangle as w
import model as m
import prepare as p
import preprocess as pp
import random

from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

## Select a dataset with a continuous target variable. Be sure your data is prepared (no missing values, numeric datatypes) and split into samples.

In [2]:
df = w.clean_zillow()

df.head()

found data


Unnamed: 0,bedrooms,bathrooms,square_ft,tax_value,year,tax_amount,county
3,0,0.0,1535,2108,1953,174.21,Los Angeles
4,4,2.0,3633,296425,2005,6941.39,Los Angeles
6,3,4.0,1620,847770,2011,10244.94,Los Angeles
7,3,2.0,2077,646760,1926,7924.68,Los Angeles
8,0,0.0,1535,6730242,1953,80348.13,Los Angeles


In [3]:
df = df.drop(columns = ['tax_amount'])

## Work through all of the steps outlined in the lesson, from setting the baseline to selected a model and evaluating the final model on your test data.

In [4]:
train, val, test = p.split_data(df)

train.shape, val.shape, test.shape

((1503871, 6), (322258, 6), (322259, 6))

In [5]:
to_scale = ['bedrooms', 'bathrooms', 'square_ft', 'year']

train, val, test = w.scale_data(train, val, test, to_scale)

train.head()

Unnamed: 0,bedrooms,bathrooms,square_ft,tax_value,year,county
1911171,0.08,0.03125,0.001238,288951,0.674419,Los Angeles
818602,0.16,0.09375,0.002722,266330,0.8,Ventura
1795717,0.16,0.078125,0.002575,325747,0.967442,Orange
1973528,0.16,0.03125,0.001784,40740,0.511628,Los Angeles
1139959,0.16,0.078125,0.002726,369055,0.75814,Orange


In [6]:
X_train, y_train = pp.xy_split(train)
X_val, y_val = pp.xy_split(val)

y_train.head()

1911171    288951
818602     266330
1795717    325747
1973528     40740
1139959    369055
Name: tax_value, dtype: int64

In [7]:
X_train, X_val = pp.dummies(X_train, X_val)

X_train.head()

Unnamed: 0,bedrooms,bathrooms,square_ft,year,county_Los Angeles,county_Orange,county_Ventura
1911171,0.08,0.03125,0.001238,0.674419,1,0,0
818602,0.16,0.09375,0.002722,0.8,0,0,1
1795717,0.16,0.078125,0.002575,0.967442,0,1,0
1973528,0.16,0.03125,0.001784,0.511628,1,0,0
1139959,0.16,0.078125,0.002726,0.75814,0,1,0


---

In [8]:
y_train.median(), y_train.mean()

(327886.0, 461861.4319020714)

In [9]:
baselines = pd.DataFrame({'y_actual': y_train,
                         'y_mean': y_train.mean(),
                         'y_median': y_train.median()})

baselines.head()

Unnamed: 0,y_actual,y_mean,y_median
1911171,288951,461861.431902,327886.0
818602,266330,461861.431902,327886.0
1795717,325747,461861.431902,327886.0
1973528,40740,461861.431902,327886.0
1139959,369055,461861.431902,327886.0


In [10]:
m.eval_model(baselines.y_actual, baselines.y_mean)

699755.3945711367

In [11]:
m.eval_model(baselines.y_actual, baselines.y_median)

712465.4578186603

---

In [12]:
lm = LinearRegression()

m.train_model(lm, X_train, y_train, X_val, y_val)

The train RMSE is 594841.3116180093.
The validate RMSE is 588528.1648149351.


---

In [13]:
ll = LassoLars(alpha = .5)

m.train_model(ll, X_train, y_train, X_val, y_val)

The train RMSE is 594837.035583792.
The validate RMSE is 588587.29556564.


---

In [14]:
poly = PolynomialFeatures()
X_train_second = poly.fit_transform(X_train)
X_val_second = poly.fit_transform(X_val)

In [15]:
lm = LinearRegression()

m.train_model(lm, X_train_second, y_train, X_val_second, y_val)

The train RMSE is 524051.31250572565.
The validate RMSE is 530415.8817480925.


---

In [16]:
tw = TweedieRegressor()

m.train_model(tw, X_train_second, y_train, X_val_second, y_val)

The train RMSE is 697536.0214151737.
The validate RMSE is 699950.3957296993.


---

In [17]:
rf = RandomForestRegressor()

m.train_model(rf, X_train.sample(2_000), y_train.sample(2_000), X_val.sample(2_000), y_val.sample(2_000))

The train RMSE is 559071.9297311963.
The validate RMSE is 840898.9637131119.


---

In [18]:
X_train_second

array([[1.      , 0.08    , 0.03125 , ..., 0.      , 0.      , 0.      ],
       [1.      , 0.16    , 0.09375 , ..., 0.      , 0.      , 1.      ],
       [1.      , 0.16    , 0.078125, ..., 1.      , 0.      , 0.      ],
       ...,
       [1.      , 0.16    , 0.0625  , ..., 0.      , 0.      , 0.      ],
       [1.      , 0.16    , 0.0625  , ..., 0.      , 0.      , 0.      ],
       [1.      , 0.16    , 0.0625  , ..., 0.      , 0.      , 0.      ]])

In [19]:
rf = RandomForestRegressor()

import random

# Sample a list randomly without replacement
sampled_list = random.choice(X_train_second)
sampled_list2 = random.choice(X_val_second)

# If you want to sample elements within the selected list, you can use random.sample() again:
sampled = random.sample(sampled_list, 2_000)
sampled2 = random.sample(sampled_list2, 2_000)

m.train_model(rf, sampled, y_train.sample(2_000), sampled2, y_val.sample(2_000))

TypeError: Population must be a sequence.  For dicts or sets, use sorted(d).

In [23]:
X_train.shape, y_train.shape, X_val.shape

((1503871, 7), (1503871,), (322258, 7))

In [24]:
xgbr = XGBRegressor()

m.train_model(xgbr, X_train.sample(2_000), y_train.sample(2_000), X_val, y_val)

The train RMSE is 177285.9005421985.
The validate RMSE is 762826.0860758873.
