In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
plt.style.use('ggplot')

In [2]:
df = pd.read_csv('kc_house_data.csv')
df.head(2)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639


In [3]:
sf = df[['sqft_living', 'price']]
sf.head()

Unnamed: 0,sqft_living,price
0,1180,221900.0
1,2570,538000.0
2,770,180000.0
3,1960,604000.0
4,1680,510000.0


In [4]:
sf.sort_values(by = ['sqft_living', 'price'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [5]:
sf.reset_index(drop=True, inplace=True)

In [6]:
sf.head(2)

Unnamed: 0,sqft_living,price
0,290,142000.0
1,370,276000.0


##### Independent and dependent variable converted to np arrays

In [7]:
X = sf.iloc[:, 0:1].values
y = sf.iloc[:, -1].values

##### the 15th-order polynomial model using the 'sqft_living', use an L2 penalty of `1e-5`

In [8]:
l2_small_penalty = 1e-5

When there are many features and so few data points, the solution can become highly numerically unstable, which can sometimes lead to strange unpredictable results

rather than no regularization, a tiny amount of regularization (`l2_penalty=1e-5`)

regularization can also help with numerical stability

##### Polynomial Ridge Regression for Numpy Coefficients function

In [9]:
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.preprocessing import PolynomialFeatures

def polynomial_ridge_regression(x, y, deg, l2_penalty):
    poly_reg = PolynomialFeatures(degree=deg)
    X_poly = poly_reg.fit_transform(x)
    
    ridge = RidgeCV(alphas = [l2_penalty])
    ridge.fit(X_poly, y)
    


    

    model = ridge
    
    return model

In [10]:
def print_coefficients(model):    
    # Get the degree of the polynomial
    deg = len(model.coef_)-1

    # Get learned parameters as a list
    w = list(model.coef_)

    # Numpy has a nifty function to print out polynomials in a pretty way
    # (We'll use it, but it needs the parameters in the reverse order)
    print('Learned polynomial for degree ' + str(deg) + ':')
    w.reverse()
    print(np.poly1d(w))

In [11]:
model = polynomial_ridge_regression(X, y, 15, l2_small_penalty)


##### Print Coefficients

In [12]:
print_coefficients(model)

Learned polynomial for degree 15:
            15             14             13            12
-3.111e+59 x  - 2.361e+55 x  - 1.799e+51 x  - 1.38e+47 x 
              11             10             9             8
 - 1.065e+43 x  - 8.309e+38 x  - 6.551e+34 x - 5.292e+30 x
              7             6             5             4             3
 - 2.226e+31 x + 5.864e+27 x + 2.706e+25 x + 2.342e+22 x + 1.385e+19 x
              2
 + 6.959e+15 x + 3.011e+12 x


##### Ridge_Poly Scores

In [13]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

def score_ridge_poly_predictions(x, y, deg, l2_penalty):

 
    model = make_pipeline(PolynomialFeatures(deg), Ridge(l2_penalty))
    model.fit(x, y)
    
    ypred = model.predict(x)
    
  
    #Mean Squared Error
    from sklearn.metrics import mean_squared_error
    mse = mean_squared_error(y, ypred)
    print(f"MSE: ${mse:,.0f}")
    
    #Root Mean Squared Error
    rmse = np.sqrt(mse)
    print(f"RMSE: ${rmse:,.0f}")
    
    #R-squared
    from sklearn.metrics import r2_score
    rSquared = r2_score(y, ypred)
    print(f'R_squared: {rSquared:.3f}')

    
    return model

In [14]:
model = score_ridge_poly_predictions(X, y, 15, l2_small_penalty)
model

MSE: $60,275,745,719
RMSE: $245,511
R_squared: 0.553


Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 1.0194288896779384e-18


Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=15, include_bias=True, interaction_only=False)), ('ridge', Ridge(alpha=1e-05, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))])

In [15]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

def test_score_ridge_poly_predictions(x, y, deg, l2_penalty):
    
    polynomial_features = PolynomialFeatures(degree= deg,
                                             include_bias=False)
    
    ridge_regression = Ridge(alpha=l2_penalty)
    
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression", ridge_regression)])
    
    pipeline.fit(x, y)
    

    
    ypred = pipeline.predict(x)

    
  
    #Mean Squared Error
    from sklearn.metrics import mean_squared_error
    mse = mean_squared_error(y, ypred)
    print(f"MSE: ${mse:,.0f}")
    
    #Root Mean Squared Error
    rmse = np.sqrt(mse)
    print(f"RMSE: ${rmse:,.0f}")
    
    #R-squared
    from sklearn.metrics import r2_score
    rSquared = r2_score(y, ypred)
    print(f'R_squared: {rSquared:.3f}')

    
    return pipeline

In [16]:
testModel = test_score_ridge_poly_predictions(X, y, 15, l2_small_penalty)

MSE: $60,259,152,361
RMSE: $245,477
R_squared: 0.553


Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 1.576250458690452e-18


##### kfold function (needs to be adjusted for pandas)

In [17]:
testModel = test_score_ridge_poly_predictions(X, y, 15, 1.21192264451e+14)

MSE: $60,277,719,215
RMSE: $245,515
R_squared: 0.553


Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 2.3257993270555133e-18


##### Testing kfold with: scoring = 'neg_mean_squared_error'

In [134]:
X

array([[   290.],
       [   370.],
       [   380.],
       ..., 
       [ 10040.],
       [ 12050.],
       [ 13540.]])

In [21]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

poly_reg = PolynomialFeatures(degree=15)
X_poly = poly_reg.fit_transform(X)

poly_reg_2 = LinearRegression()
poly_reg_2.fit(X_poly,y)

# Poly Prediction
yPoly_2 = poly_reg_2.predict(poly_reg.fit_transform(X))

from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=poly_reg_2, X = X, y = y,scoring='neg_mean_squared_error', cv = 10)

In [24]:
#convert score to positive number
score = -accuracies
score

array([  3.07492579e+10,   2.36700318e+10,   2.11115248e+10,
         2.57831236e+10,   2.81560314e+10,   3.62028842e+10,
         4.79567807e+10,   6.30281695e+10,   9.53494278e+10,
         4.12405095e+11])

In [27]:
#Convert MSE to RMSE
score = np.sqrt(score)
score

array([ 418.75370227,  392.23804014,  381.17981963,  400.7134155 ,
        409.63104205,  436.20013208,  467.96406564,  501.05300788,
        555.68608645,  801.36617197])

In [28]:
avg = score.mean()
var = score.std()
print(f"The Mean of  Mean Square Error : {avg}, std: {var}")

The Mean of  Mean Square Error : 476.4785483611584, std: 119.93496845915742


##### Tesing K_Fold scoring on Ridge_Polynomial

In [29]:
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score

def score_polynomial_ridge_regression(x, y, deg, l2_penalty):
    poly_reg = PolynomialFeatures(degree=deg)
    X_poly = poly_reg.fit_transform(x)
    
    ridge = RidgeCV(alphas = [l2_penalty])
    ridge.fit(X_poly, y)
    
    scoreMSE = cross_val_score(estimator=ridge, X = x, y = y,scoring='neg_mean_squared_error', cv = 10)


    

    score = scoreMSE
    
    return score

In [36]:
mse = score_polynomial_ridge_regression(X, y, 15, 0.000000001)
mse

array([ -5.31139925e+17,  -8.86369196e+17,  -1.97551096e+18,
        -1.26421920e+18,  -2.49386585e+16,  -1.43285034e+16,
        -8.34309895e+16,  -5.40387157e+17,  -8.70387958e+18,
        -3.72058669e+17])

In [39]:
#convert score to positive number
score = -mse
#Convert MSE to RMSE
score = np.sqrt(score)
avg = score.mean()
var = score.std()
print(f"The Mean of  Root Mean Square Error : {avg:,.02f}, std: {var}")

The Mean of  Root Mean Square Error : 906,194,501.89, std: 786408169.1504933
