In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas_profiling as pp
import scipy.stats as stats
from sklearn import metrics
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from math import sqrt


%matplotlib inline

In [2]:
# Experiment: fit models on some of the variables that are highly correlated with saleprice.

In [3]:
# Raise the pandas column-display limit to 999 so wide DataFrames are shown without column truncation.
pd.set_option('display.max_columns', 999)

In [4]:
# Load both data sets (train and test)

In [5]:
# Load the training set from train_clean.csv (path is relative to the notebook's working directory).
train = pd.read_csv('../datasets/train_clean.csv')

In [6]:
# Load the test set from test_clean.csv (path is relative to the notebook's working directory).
test = pd.read_csv('../datasets/test_clean.csv')

In [7]:
# Predictor columns used for every model in this notebook.
features = [
    'overall_qual',
    'gr_liv_area',
    'garage_area',
    'garage_cars',
    'tot_bath',
    'total_bsmt_sf',
    '1st_flr_sf',
    'house_age',
]

# Select the same predictor columns from both frames; the target is taken
# from the training frame.
X_train, X_test = train[features], test[features]
y_train_overfit = train['saleprice']

In [8]:
#Create PolynomialFeatures

In [9]:
# Instantiate the degree-2 polynomial feature transformer (the regression
# models are created in later cells). interaction_only=False keeps the squared
# terms as well as the pairwise products; include_bias=False omits the
# constant column.
poly= PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)

In [10]:
# Fit the polynomial expansion on the training features only, then apply that
# same fitted transformer to the test features. Calling transform (not
# fit_transform) on the test set keeps the fitted state tied to the training
# data, so both matrices share one column layout.
X_train_overfit = poly.fit_transform(X_train)
X_test_overfit = poly.transform(X_test)

In [11]:
# Standardize the polynomial features. The scaler is fitted on the training
# matrix only; the test matrix is transformed with those same training
# statistics. Re-fitting on the test set would scale the two splits
# differently and leak test-set information into preprocessing.
ss_overfit = StandardScaler()
Z_train_overfit = pd.DataFrame(ss_overfit.fit_transform(X_train_overfit))
Z_test_overfit = pd.DataFrame(ss_overfit.transform(X_test_overfit))
# NOTE(review): the model cells visible in this notebook fit on the unscaled
# X_*_overfit matrices, not on these Z_* frames -- confirm whether that is intended.

In [12]:
# Ordinary least-squares linear regression with default settings; it is fitted on the polynomial features in the next cell.
lr=LinearRegression()

In [13]:
# Fit the linear regression on the (unscaled) polynomial training features.
lr.fit(X=X_train_overfit, y=y_train_overfit)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [14]:
# In-sample predictions of the linear regression (training features only).
predictions = lr.predict(X=X_train_overfit)

In [15]:
# Training-set R^2 of the linear regression (measured on the data it was fitted to).
r2_score(y_true=y_train_overfit, y_pred=predictions)

0.8838137050186942

In [16]:
#create Ridge Model

In [17]:
# Ridge regression with a fixed penalty strength alpha=10.
# NOTE: the name ridge_mod is rebound to a RidgeCV estimator in a later cell.
ridge_mod = Ridge(alpha=10)

In [18]:
# Fit the fixed-alpha ridge model on the (unscaled) polynomial training features.
ridge_mod.fit(X=X_train_overfit, y=y_train_overfit)

Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [19]:
# In-sample predictions of the fixed-alpha ridge model.
ridge_predictions = ridge_mod.predict(X=X_train_overfit)

In [20]:
# Training-set R^2 of the fixed-alpha ridge model.
r2_score(y_true=y_train_overfit, y_pred=ridge_predictions)

0.8836536063268811

In [21]:
# Candidate ridge penalties: 200 values spaced evenly on a log10 scale from 10**0 to 10**10.
r_alph = np.logspace(start=0, stop=10, num=200)

In [22]:
# Cross-validated ridge: chooses alpha from r_alph using 5-fold CV scored by R^2.
# This rebinds ridge_mod, replacing the fixed-alpha Ridge created in an earlier cell.
ridge_mod = RidgeCV(alphas=r_alph, scoring='r2', cv=5)

In [23]:
# Run the cross-validated alpha search and fit on the polynomial training features.
# The result is assigned back to ridge_mod, which also keeps the cell from echoing the estimator.
ridge_mod = ridge_mod.fit(X=X_train_overfit, y=y_train_overfit)

In [24]:
# Show the ridge penalty strength selected by cross-validation.
ridge_mod.alpha_

5.052631065335681

In [25]:
# Predictions of the cross-validated ridge model on the training and test
# polynomial feature matrices.
y_train_ridge_pred, y_test_ridge_pred = (
    ridge_mod.predict(X=X_train_overfit),
    ridge_mod.predict(X=X_test_overfit),
)

In [26]:
# Training-set R^2 of the cross-validated ridge model.
r2_score(y_true=y_train_overfit, y_pred=y_train_ridge_pred)

0.8837343473048963

In [27]:
#create Lasso model

In [28]:
# Candidate lasso penalties: 100 values spaced evenly on a log10 scale from 10**-3 to 10**0.
# NOTE(review): the alpha reported further down (~0.93) sits near the top of this
# grid, so a wider upper bound may be worth trying.
las_alph = np.logspace(start=-3, stop=0, num=100)

In [29]:
# Cross-validated lasso: chooses alpha from las_alph using 5-fold CV.
las_mod = LassoCV(alphas=las_alph, cv=5)

In [30]:
# Run the cross-validated alpha search and fit the lasso on the (unscaled)
# polynomial training features.
las_mod = las_mod.fit(X=X_train_overfit, y=y_train_overfit)























In [31]:
# Lasso predictions on the training and test polynomial feature matrices.
y_train_las_pred, y_test_las_pred = (
    las_mod.predict(X=X_train_overfit),
    las_mod.predict(X=X_test_overfit),
)

# Report the lasso's training-set R^2.
print(r2_score(y_true=y_train_overfit, y_pred=y_train_las_pred))

0.8833489335309317


In [32]:
# Show the lasso penalty strength selected by cross-validation.
las_mod.alpha_

0.9326033468832199

In [33]:
# Training-set R^2 of the cross-validated ridge model (same expression as an
# earlier cell; presumably repeated here to compare against the lasso score).
r2_score(y_train_overfit, y_train_ridge_pred)

0.8837343473048963

In [34]:
# Coefficients of the fitted ridge model: one per polynomial feature
# (44 for the 8 input features expanded to degree 2 without a bias column).
ridge_mod.coef_

array([-1.76537682e+04,  1.62636096e+00, -2.43062957e+01, -1.00680566e+04,
        2.67507813e+03, -4.53596947e+00, -1.42124409e+01, -3.82157484e+01,
       -9.92092924e+02,  9.65518073e+00,  1.41139316e+01,  2.55035653e+02,
        9.65687945e+02,  1.12890051e+01,  8.43498687e+00,  5.11619359e+01,
        4.00283754e-03, -2.11468068e-02,  1.00459584e+01,  1.04976289e+01,
        1.71158276e-02, -6.68341344e-02,  6.65028414e-03,  4.51641087e-02,
        3.27215541e-01, -1.05779448e+00, -3.51402687e-02, -1.60506546e-03,
       -3.76803798e-01, -4.61342406e+03, -4.25920430e+03,  8.83085464e+00,
        8.51134337e+00,  1.23020304e+02, -4.37619521e+03,  7.51110615e+00,
        1.24347360e+01, -2.19343288e+02, -3.39064229e-03, -5.48247359e-02,
        2.55928954e-02,  3.77093650e-02, -1.13796078e-02, -1.28839486e+00])

In [35]:
# Coefficients of the fitted lasso model: one per polynomial feature
# (44 for the 8 input features expanded to degree 2 without a bias column).
las_mod.coef_

array([-1.63527849e+04,  1.18519917e+00,  4.23624758e+00, -2.22172937e+04,
        3.62300287e+02, -3.67513182e+01,  1.19614297e+01, -2.01170723e+02,
       -1.37723260e+03,  1.05950099e+01,  1.13565323e+01,  1.79848894e+03,
        1.18095142e+03,  1.55204719e+01,  4.52885889e+00,  5.56162273e+01,
        3.47195559e-03, -6.17039106e-03,  4.57925846e+00,  1.04528767e+01,
       -1.77831497e-03, -4.89348646e-02, -3.25226318e-02,  5.49479539e-02,
       -7.70136058e+00, -1.01152760e+01, -2.70634382e-02, -1.06964777e-02,
       -5.57574906e-01, -3.28459528e+03, -1.77313694e+03,  8.17191631e+00,
        1.02670120e+01,  1.97706212e+02, -4.47533961e+03,  1.40098867e+01,
        6.99162990e+00, -2.00324952e+02, -8.41730944e-03, -3.77664899e-02,
        2.33600037e-01,  2.58557708e-02, -1.57934833e-01, -5.73136662e-01])

In [36]:
# Create a new DataFrame for my submission

In [37]:
# Start from an empty DataFrame; its columns are assigned in a later cell.
submission = pd.DataFrame(columns=[])

In [38]:
#assigned data to dataframe

In [39]:
# Build the ridge submission in one step: the ids come from the test frame
# and SalePrice holds the cross-validated ridge predictions for the test set.
# Constructing the frame here (instead of mutating one created in another
# cell) makes this cell idempotent: re-running it always restores the ridge
# predictions, whatever was written to `submission` since.
submission = pd.DataFrame({
    'Id': test['id'],
    'SalePrice': y_test_ridge_pred,
})

In [40]:
# Preview the first rows of the submission frame (ridge predictions, in cell order).
submission.head()

Unnamed: 0,Id,SalePrice
0,2658,165910.901551
1,2718,151550.397228
2,2414,167991.155028
3,1989,119027.650644
4,625,183622.386421


In [41]:
# Write the submission to CSV without the index column. In cell order, `submission`
# holds the ridge predictions at this point.
submission.to_csv('../submissions/submission_twelve.csv',index=False)

In [42]:
# Overwrite the SalePrice column in place with the lasso test predictions (the Id column is kept).
# NOTE(review): after this cell runs, `submission` no longer holds the ridge predictions, so
# re-running the earlier to_csv cell out of order would write lasso values to submission_twelve.csv.
submission['SalePrice'] = y_test_las_pred

In [43]:
# Preview the first rows of the submission frame (lasso predictions, in cell order).
submission.head()

Unnamed: 0,Id,SalePrice
0,2658,166197.343332
1,2718,149195.569527
2,2414,168594.878147
3,1989,120596.898096
4,625,184783.149324


In [44]:
# Write the submission to CSV without the index column. In cell order, `submission`
# holds the lasso predictions at this point.
submission.to_csv('../submissions/submission_thirteen.csv',index=False)