In [67]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Summary

This notebook loads the cleaned training and test data sets, compares Lasso and Ridge regression on the training data, and fits a RidgeCV model that is used to generate the final sale-price predictions.

In [5]:
# Load the cleaned training data (relative path: run the notebook from its own folder).
train = pd.read_csv('../datasets/train_final.csv')

In [8]:
# Load the cleaned test data that the final predictions are made on.
test = pd.read_csv('../datasets/test_final.csv')

In [12]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,lot_area,year_built,year_remod/add,bedroom_abvgr,kitchen_abvgr,fireplaces,wood_deck_sf,open_porch_sf,enclosed_porch,...,garage_type_Detchd,garage_type_None,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_WD
0,109,13517,1976,2005,3,1,0,0,44,0,...,0,0,0,0,0,0,0,0,0,1
1,544,11492,1996,1997,4,1,1,0,74,0,...,0,0,0,0,0,0,0,0,0,1
2,153,7922,1953,2007,3,1,0,0,52,0,...,1,0,0,0,0,0,0,0,0,1
3,318,9802,2006,2007,3,1,0,100,0,0,...,0,0,0,0,0,0,0,0,0,1
4,255,14235,1900,1993,3,1,0,0,59,0,...,1,0,0,0,0,0,0,0,0,1


In [10]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,id,lot_frontage,lot_area,lot_shape,utilities,land_slope,overall_qual,overall_cond,year_built,year_remod/add,...,garage_type_Detchd,garage_type_None,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_WD
0,2658,69.0,9142,1,10,3,6,8,1910,1950,...,1,0,0,0,0,0,0,0,0,1
1,2718,69.545961,9662,2,10,3,5,4,1977,1977,...,0,0,0,0,0,0,0,0,0,1
2,2414,58.0,17104,2,10,3,7,5,2006,2006,...,0,0,0,0,0,0,0,1,0,0
3,1989,60.0,8520,1,10,3,5,6,1923,2006,...,1,0,0,0,0,0,0,0,0,1
4,625,69.545961,9500,2,10,3,6,5,1963,1963,...,0,0,0,0,0,0,0,0,0,1


In [13]:
# (rows, columns) of the training frame.
train.shape

(2048, 158)

In [14]:
# (rows, columns) of the test frame.
test.shape

(878, 157)

In [20]:
# Predictor columns: everything in train except the target ('saleprice') and the 'id' column.
features = train.drop(columns = ['saleprice', 'id'])

### Create X and y

In [36]:
# Design matrix and target used for fitting.
X, y = features, train['saleprice']

# Rows from the test file, restricted to the same columns (in the same order) as X.
Final_test = test[features.columns]

In [37]:
# (rows, columns) of the design matrix.
X.shape

(2048, 156)

In [38]:
# Check the holdout features just built; X_test does not exist until the split cell below,
# so referencing it here fails on a fresh kernel.
Final_test.shape

(878, 156)

### Train/Test/Split

In [39]:
# Hold out part of the labelled data for validation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

### Scale
* Scaling training and testing data

In [40]:
# Fit the scaler on the training split only, then reuse that same transform for
# every other split so their statistics never influence the scaling.
ss = StandardScaler()
Z_train = ss.fit_transform(X_train)
Z_test = ss.transform(X_test)        # validation split produced by train_test_split
Z_final = ss.transform(Final_test)   # test-file rows used for the final predictions
# NOTE(review): the modelling cells in this notebook fit and score on the unscaled
# X_train / X_test / Final_test frames, so these Z_* arrays are currently unused.

### Instantiate Ridge and Lasso Models

In [41]:
# Lasso with built-in cross-validation over a path of 200 candidate alphas.
lasso = LassoCV(n_alphas = 200)

In [42]:
# Ridge with built-in cross-validation over 100 evenly spaced alphas from 0.1 to 10.
ridge = RidgeCV(alphas = np.linspace(.1, 10, 100))

### Cross Validation
* Evaluate Ridge versus Lasso

In [43]:
# 3-fold cross-validated scores for the lasso on the training split; show their mean.
lasso_scores = cross_val_score(
    lasso,
    X_train,
    y_train,
    cv=3,
)
np.mean(lasso_scores)

0.7898380435133343

In [44]:
# 3-fold cross-validated scores for the ridge on the training split; show their mean.
ridge_scores = cross_val_score(
    ridge,
    X_train,
    y_train,
    cv=3,
)
np.mean(ridge_scores)

0.8995813216038742

### Fit for Ridge CV
* Since R2 for Ridge is outperforming lasso I will move forward with Ridge. 

In [45]:
# Fit the ridge model on the (unscaled) training split.
ridge.fit(X_train, y_train)

RidgeCV(alphas=array([ 0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ,  1.1,
        1.2,  1.3,  1.4,  1.5,  1.6,  1.7,  1.8,  1.9,  2. ,  2.1,  2.2,
        2.3,  2.4,  2.5,  2.6,  2.7,  2.8,  2.9,  3. ,  3.1,  3.2,  3.3,
        3.4,  3.5,  3.6,  3.7,  3.8,  3.9,  4. ,  4.1,  4.2,  4.3,  4.4,
        4.5,  4.6,  4.7,  4.8,  4.9,  5. ,  5.1,  5.2,  5.3,  5.4,  5.5,
        5.6,  5.7,  5.8,  5.9,  6. ,  6.1,  6.2,  6.3,  6.4,  6.5,  6.6,
        6.7,  6.8,  6.9,  7. ,  7.1,  7.2,  7.3,  7.4,  7.5,  7.6,  7.7,
        7.8,  7.9,  8. ,  8.1,  8.2,  8.3,  8.4,  8.5,  8.6,  8.7,  8.8,
        8.9,  9. ,  9.1,  9.2,  9.3,  9.4,  9.5,  9.6,  9.7,  9.8,  9.9,
       10. ]))

In [46]:
# Score of the fitted ridge on the training split.
ridge.score(X_train, y_train)

0.9237802036974629

In [47]:
# Score of the fitted ridge on the held-out validation split.
ridge.score(X_test, y_test)

0.8898957789648204

Ridge scored a slightly lower R² on the test split (0.890) than on the training split (0.924), indicating potential mild overfitting.

### Regression Metrics on Ridge Model

In [None]:
def r2_adj(y, resids, k=None):
    """Return the adjusted R-squared for a set of residuals.

    Parameters
    ----------
    y : array-like
        Observed values of the dependent variable.
    resids : array-like
        Residuals (observed minus predicted), same length as ``y``.
    k : int
        Number of predictors used by the model. It must be supplied: the
        sample size is now taken from ``y`` and the predictor count from this
        argument, instead of from notebook globals that are not defined here.

    Returns
    -------
    float
        ``1 - (1 - R^2) * (n - 1) / (n - k - 1)`` with ``n = len(y)``.

    Raises
    ------
    ValueError
        If ``k`` is not given, or if ``n - k - 1`` is not positive (the
        adjustment is undefined in that case).
    """
    if k is None:
        raise ValueError('k (the number of predictors) must be supplied')

    y = np.asarray(y, dtype=float)
    resids = np.asarray(resids, dtype=float)

    n = len(y)
    dof = n - k - 1
    if dof <= 0:
        raise ValueError(f'adjusted R-squared needs n - k - 1 > 0 (n={n}, k={k})')

    # Baseline model: always predict the mean of y.
    null_resids = y - y.mean()
    null_sse = (null_resids ** 2).sum()

    # Sum of squared errors of the actual model.
    sse = (resids ** 2).sum()

    # Plain R-squared, then penalise it for the number of predictors.
    r_sq = 1 - sse / null_sse
    r_sq_adj = 1 - (1 - r_sq) * (n - 1) / dof

    return r_sq_adj

In [74]:
def regression_metrics(lr_instant, X, y):
    """Print a summary of regression metrics for a fitted model on ``(X, y)``.

    Parameters
    ----------
    lr_instant : fitted estimator
        Any object exposing ``predict(X)``.
    X : pandas.DataFrame or 2-D array
        Feature matrix; its row and column counts are used as ``n`` and ``k``
        in the adjusted R-squared.
    y : array-like
        True target values, aligned with the rows of ``X``.

    Returns
    -------
    None
        The predictions, residuals and metrics are printed, not returned.
    """
    y_pred = lr_instant.predict(X)
    resids = y - y_pred

    # n observations and k predictors, needed for the adjusted R-squared.
    n, k = X.shape

    # Score against the y that was passed in (not a notebook-level variable),
    # so the function gives correct results for any split.
    r_sq = r2_score(y, y_pred)

    # Mean squared error and its root.
    mse = mean_squared_error(y, y_pred)
    rmse = mse ** .5

    # Mean absolute error, rounded to a whole number; np.round works whether
    # the metric comes back as a NumPy scalar or a built-in float.
    mae = np.round(mean_absolute_error(y, y_pred))

    # Adjusted R-squared: penalise R-squared for the number of predictors.
    r_sq_adj = 1 - (1 - r_sq) * (n - 1) / (n - k - 1)

    print(f'y Predictor: {y_pred}')
    print(f'residuals: {resids}')
    print(f'Coefficient of Determination (R Squared): {r_sq}')
    print(f'Mean Squared Error (MSE): {mse}')
    print(f'Root of Mean Squared Error (RMSE): {rmse}')
    print(f'Mean Absolute Error (MAE): {mae}')
    print(f'Adjusted R Squared: {r_sq_adj}')
    print(f'Null Mean: {y.mean()}')

In [75]:
# Report the metrics for the fitted ridge on the training split.
regression_metrics(ridge, X_train, y_train)

y Predictor: [120650.29852263 140393.89088135 195719.51835421 ... 138984.32739693
 116173.61244255 141517.28152415]
residuals: 415     10349.701477
273    -11393.890881
759    -22719.518354
250    -17069.782888
413      4411.883599
            ...     
1130   -25687.898573
1294   -30058.640663
860     -6484.327397
1459    22826.387557
1126      482.718476
Name: saleprice, Length: 1536, dtype: float64
Coefficient of Determination (R Squared): 0.9237802036974629
Mean Squared Error (MSE): 500979834.9110214
Root of Mean Squared Error (RMSE): 22382.578826199213
Mean Absolute Error (MAE): 15803.0
Adjusted R Squared: 0.9151578046958706
Null Mean: 181974.43294270834


### Interpret Ridge Regression Metrics

* The adjusted coefficient of determination (0.915) falls only slightly below the unadjusted R² (0.924), indicating that the fit is not being inflated much by the number of features in the model. 
* The training RMSE is about 22,382, which is lower than the roughly 28,000 scored by the submission on Kaggle. The model therefore performs better on the training data than on unseen test data, which is likely an overfitting issue.  

# Test Ridge on Final Testing Data
Final_test is the test data

In [49]:
# Predict sale prices for the test-file rows with the fitted ridge model.
pred = ridge.predict(Final_test)

In [50]:
# Attach the predictions to the test frame (adds or overwrites its 'saleprice' column in place).
test['saleprice'] = pred

In [51]:
# Keep only the two columns written to the submission file.
submission = test[['id','saleprice']]

In [52]:
# Preview the submission frame before writing it out.
submission.head()

Unnamed: 0,id,saleprice
0,2658,137621.019125
1,2718,166726.05643
2,2414,215162.839535
3,1989,99748.479613
4,625,178277.127014


In [56]:
# Write the submission without the DataFrame index.
submission.to_csv('../datasets/a5_sub_ridge.csv', index = False )

In [76]:
# Writes the same submission frame again under a second filename.
submission.to_csv('../datasets/a5_sub_ridge2.csv', index = False )