In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score,explained_variance_score
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.linear_model import LinearRegression
from math import sqrt
import warnings
warnings.filterwarnings("ignore")
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std



plt.rc("axes.spines", top=False, right=False)

#import our scripts that do data science workflow
import wrangle
import split_scale


In [8]:
from env import host, user, password

Wrangle and Clean

In [9]:
# Load the raw property records through the project's wrangle helper
# (presumably a MySQL query, going by the helper's name — confirm in wrangle.py).
df = wrangle.get_data_from_mysql()

In [10]:
# Remove the land-use description column; it is not referenced by any later cell.
df = df.drop(columns=['propertylandusedesc'])

In [11]:
# Replace the raw column names with the short names used in the rest of the notebook.
df = df.rename(
    columns={
        "bedroomcnt": "bedrooms",
        "bathroomcnt": "bathrooms",
        "calculatedfinishedsquarefeet": "squarefeet",
        'taxvaluedollarcnt': 'tax',
    }
)

In [12]:
# Make sure `df` is a pandas DataFrame whatever type the wrangle helper returned.
df = pd.DataFrame(data=df)

In [13]:
# Preview the first rows to sanity-check the renamed columns.
df.head()

Unnamed: 0,id,bedrooms,bathrooms,tax,propertylandusetypeid,squarefeet
0,6926,3.0,1.0,305481.0,261.0,1136.0
1,31568,3.0,2.0,418694.0,261.0,2104.0
2,31569,2.0,1.0,169843.0,261.0,872.0
3,31570,3.0,1.0,194005.0,261.0,1190.0
4,31571,3.0,2.0,288259.0,261.0,1534.0


In [14]:
# Column dtypes and non-null counts; this is where missing values show up.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16035 entries, 0 to 16034
Data columns (total 6 columns):
id                       16035 non-null int64
bedrooms                 16035 non-null float64
bathrooms                16035 non-null float64
tax                      16033 non-null float64
propertylandusetypeid    16035 non-null float64
squarefeet               16035 non-null float64
dtypes: float64(5), int64(1)
memory usage: 751.7 KB


Split my data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Columns used as model inputs and the regression target.
feature_cols = ['bedrooms', 'bathrooms', 'squarefeet']
target_col = 'tax'

# df.info() reports fewer non-null `tax` values than rows. scikit-learn's
# LinearRegression raises on NaN in X or y, so drop incomplete rows before
# building the modelling matrices.
model_df = df.dropna(subset=feature_cols + [target_col])

X = model_df[feature_cols]
# Kept as a one-column DataFrame: later cells rely on y_train.columns and y_train.tax.
y = model_df[[target_col]]

In [None]:
# Feature/target split for modelling: train on 80% of the rows, hold out 20%.
# (test_size=0.8 would leave only a fifth of the data for fitting.)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

# Separate 90/10 split of the whole frame, used by the shape check and the
# residual plot further down. It uses a different seed, so its rows do not
# line up with X_train / X_test.
train, test = train_test_split(df, train_size=.90, random_state=123)

In [None]:
# (rows, columns) of the frame-level train and test splits, one per line.
print(train.shape, test.shape, sep="\n")

In [None]:
#from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler

In [None]:
# Disabled scaling experiment. Both lines must stay commented: a backslash at
# the end of a comment does not continue it, so an uncommented `.fit(...)` line
# below would be parsed as code and raise IndentationError.
#scaler = StandardScaler(copy=True, with_mean=True, with_std=True)\
#            .fit(train) # fit the object

In [None]:
#scaler, train_scaled, test_scaled = split_scale.standard_scaler(train,test)

Data Exploration

In [None]:
import statsmodels.api as sm

# OLS object to analyze features.
# statsmodels does not add an intercept on its own, so prepend a constant
# column; without it the fit is forced through the origin and is not comparable
# to the scikit-learn model below. missing="drop" skips rows containing NaN.
ols_model = sm.OLS(y_train, sm.add_constant(X_train), missing="drop")
fit = ols_model.fit()
fit.summary()

In [None]:
# Pairwise scatter plots (and per-column distributions) for every numeric column.
sns.pairplot(data=df)

In [None]:
# One regression jointplot per feature against the target.
# x / y are passed by keyword: current seaborn releases no longer accept the
# data vectors positionally, while keywords work on old and new versions alike.
joint_features = ("squarefeet", "bedrooms", "bathrooms")
joint_grids = []
for joint_feature in joint_features:
    with sns.axes_style('white'):
        joint_grids.append(
            sns.jointplot(data=df, x=joint_feature, y="tax", kind='reg', height=5)
        )
    plt.show()

# Keep the original per-feature handles available under their previous names.
square, bedroom, bathroom = joint_grids

In [None]:
# Boolean mask: homes with fewer than 5 bedrooms and a tax value under 1,000,000.
df_bedrooms = (df['bedrooms'] < 5) & (df['tax'] < 1000000)

# Plot only the masked rows. The mask was previously computed but never
# applied, so the fit lines were drawn from the unfiltered frame.
g = sns.lmplot(x='bathrooms', y='tax', hue='bedrooms', data=df[df_bedrooms])
plt.xlim(0, 10)

Model and Evaluate

In [None]:
from sklearn.linear_model import LinearRegression

# Model type for the baseline: plain linear regression with default settings.
lm1 = LinearRegression()

In [None]:
# Fit the model on the training split.
lm1.fit(X_train, y_train)
lm1_y_intercept = lm1.intercept_
lm1_coefficients = lm1.coef_

# y_train is a one-column DataFrame, so intercept_ has shape (1,) and coef_ has
# shape (1, n_features). Flatten both so they format as plain floats.
lm1_intercept_value = float(np.ravel(lm1_y_intercept)[0])
lm1_coefficient_values = np.ravel(lm1_coefficients)

# Print the fitted equation with one term per feature, so every coefficient
# the model learned is reported.
equation_terms = ' + '.join(
    'm{} * {}'.format(position, column)
    for position, column in enumerate(X_train.columns, start=1)
)
print('{} = b + {}'.format(y_train.columns[0], equation_terms))
print('    y-intercept  (b): %.2f' % lm1_intercept_value)
for coef_position, coef_value in enumerate(lm1_coefficient_values, start=1):
    print('    coefficient (m%d): %.2f' % (coef_position, coef_value))

In [None]:
# In-sample predictions: the fitted model applied to the rows it was trained on.
y_pred_lm1 = lm1.predict(X=X_train)

In [None]:
# Display the in-sample predictions produced by lm1.
y_pred_lm1

In [None]:
# Out-of-sample predictions on the held-out feature rows.
predictions = lm1.predict(X=X_test)

In [None]:
# Actual vs. predicted target on the held-out rows; a good fit hugs the diagonal.
plt.scatter(x=y_test, y=predictions)

plt.xlabel(xlabel='Y Test')
plt.ylabel(ylabel='Predicted Y')

In [None]:
# Mean squared error of lm1 on the training split.
mse_lm1 = mean_squared_error(y_true=y_train, y_pred=y_pred_lm1)
print(
    "linear model\n  mean squared error: {:.3}".format(mse_lm1)
)

# R^2 of lm1 on the training split, reported as a percentage.
r2_lm1 = r2_score(y_true=y_train, y_pred=y_pred_lm1)
print(
    '  {:.2%} of the variance in the house price can be explained by the num of bathrooms, num of bedrooms, and sq ft of house.'.format(r2_lm1)
)

In [None]:
#establish baseline
from math import sqrt

# Baseline "model": predict the training-set mean of the target for every row.
# The mean is taken from the single target column by position with .iloc;
# `y_train.mean()[0]` indexed a label-indexed Series with an integer, which
# pandas has deprecated.
baseline_value = y_train.iloc[:, 0].mean()
y_pred_baseline = np.full(len(y_train), baseline_value)

MSE = mean_squared_error(y_train, y_pred_baseline)
SSE = MSE * len(y_train)  # sum of squared errors = mean * number of rows
RMSE = sqrt(MSE)


# Explained variance of a constant prediction, computed for comparison.
evs = explained_variance_score(y_train, y_pred_baseline)


In [None]:
# Report the baseline error metrics. `print` must actually be called: a bare
# `print` followed by a parenthesised tuple on the next lines is two separate
# expressions and never invokes the function.
print('baseline MSE:  {}'.format(MSE))
print('baseline SSE:  {}'.format(SSE))
print('baseline RMSE: {}'.format(RMSE))

In [None]:
# Apply the fitted model to the held-out rows and line the predictions up
# against the actual target values.
model = lm1.predict(X_test[['bedrooms', 'bathrooms', 'squarefeet']]).ravel()
y_test1 = np.array(y_test).ravel()
best_model = pd.DataFrame({'predictions': model, 'tax': y_test1})

best_model.head()

In [None]:
#Write a function that creates the model object, fits and predicts, given X_train, X_test, y_train, y_test
X_train1=X_train[['bedrooms','bathrooms', 'squarefeet']]
X_test1=X_test[['bedrooms','bathrooms', 'squarefeet']]
def modeling_function(X_train,X_test,y_train,y_test):
    """Fit a linear regression on the training data and predict on both splits.

    Parameters
    ----------
    X_train, X_test : feature frames for the training and held-out rows.
    y_train, y_test : target frames; each must expose a ``tax`` column.

    Returns
    -------
    (predictions_train, predictions_test) : two DataFrames, each with an
        ``actual`` column (the observed ``tax``) and an ``lm1`` column (the
        model's prediction for the same row).
    """
    predictions_train = pd.DataFrame({'actual': y_train.tax}).reset_index(drop=True)
    predictions_test = pd.DataFrame({'actual': y_test.tax}).reset_index(drop=True)

    # model 1: fitted on the training rows only, so the test rows stay unseen.
    lm1 = LinearRegression()
    lm1.fit(X_train, y_train)

    # predict() returns shape (n, 1) for a one-column target frame; flatten it
    # so each result is stored as an ordinary 1-D column.
    predictions_train['lm1'] = lm1.predict(X_train).ravel()
    predictions_test['lm1'] = lm1.predict(X_test).ravel()

    return predictions_train, predictions_test

In [None]:
# Run the modelling helper on the selected feature columns.
model_train, model_test = modeling_function(
    X_train=X_train1,
    X_test=X_test1,
    y_train=y_train,
    y_test=y_test,
)

In [None]:
# First rows of the test-side frame, then of the train-side frame.
print(model_test.head(), model_train.head(), sep="\n")

In [None]:
# plot_residuals(x, y, dataframe): takes the feature, the target, and
# (optionally) the dataframe as input and returns a residual plot.
def plot_residuals(x, y, dataframe=None):
    '''
    Plot the residuals of a simple linear model that uses x to predict y.

    No predictions are made here: seaborn's `residplot` fits the regression
    and computes the residuals itself.

    Parameters
    ----------
    x, y : the feature and the target. Either the data themselves, or column
        names when `dataframe` is given.
    dataframe : optional DataFrame that `x` and `y` name columns of.

    Returns
    -------
    The matplotlib Axes returned by `sns.residplot`.
    '''
    # Pass everything by keyword: current seaborn treats the first positional
    # argument as `data`, so positional x / y would be misread.
    return sns.residplot(x=x, y=y, data=dataframe)

# Residuals of tax against square footage on the frame-level test split.
# x and y are kept as one-column frames; a later cell reuses them.
x = test.loc[:, ['squarefeet']]
y = test.loc[:, ['tax']]
plot_residuals(x=x, y=y)

In [None]:
# TODO: plot_regression(x, y) — should take a feature and a target and return
# the datapoints, the regression line, and the confidence interval. Not
# implemented yet; this cell only fits the underlying OLS model.
#
# statsmodels does not add an intercept by default, so a constant column is
# prepended; missing="drop" skips rows where the target or feature is NaN.
res = sm.OLS(y, sm.add_constant(x), missing="drop").fit()

In [None]:
# Show the regression summary table for the fitted `res` model.
res.summary()