# Import Libraries and Data

In [26]:
import helper

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [28]:
real_estate = pd.read_csv('../datasets/processed_train.csv')

In [29]:
def simple_interaction_feature(df, list_of_old_features):
    new_feature_val = 1
    for feature in list_of_old_features:
        new_feature_val = new_feature_val * df[feature]
    return new_feature_val

In [30]:
real_estate['garage_score'] = simple_interaction_feature(real_estate, ['garage_area','garage_qual','garage_cars','garage_cond'])

In [31]:
real_estate = real_estate[['overall_qual', 'year_built', 'year_remod/add', 'mas_vnr_area',
       'exter_qual', 'bsmt_qual', 'total_bsmt_sf', '1st_flr_sf', 'gr_liv_area',
       'kitchen_qual', 'saleprice', 'garage_score']]

In [32]:
dummy_neighborhood = pd.read_csv('../datasets/dummies/neighborhood_dummies.csv')

In [33]:
real_estate = pd.concat([real_estate,dummy_neighborhood],axis=1)

In [34]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import Lasso, Ridge, LassoCV, RidgeCV

In [35]:
X = real_estate.drop(columns=['saleprice'])
y = real_estate['saleprice']

In [36]:
from sklearn.model_selection import train_test_split

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                   random_state=42,
                                                   test_size = .2)

In [38]:
from sklearn.preprocessing import StandardScaler

In [39]:
ss = StandardScaler()
X_train = pd.DataFrame(ss.fit_transform(X_train),)

In [40]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29,30,31,32,33,34,35,36,37,38
0,-0.081747,1.097701,0.990598,0.807516,1.016168,0.563143,0.656487,0.478981,-0.279707,0.723473,...,-0.160128,4.010403,-0.288484,-0.119264,-0.240966,-0.212725,-0.269437,-0.136505,-0.147671,-0.085855
1,1.322072,1.130806,1.038255,1.578313,1.016168,0.563143,1.929778,2.000532,0.936761,0.723473,...,-0.160128,4.010403,-0.288484,-0.119264,-0.240966,-0.212725,-0.269437,-0.136505,-0.147671,-0.085855
2,-0.081747,0.170771,-0.343799,-0.562790,-0.693079,0.563143,0.404919,0.474033,-0.283663,-0.782232,...,-0.160128,-0.249351,-0.288484,-0.119264,-0.240966,-0.212725,-0.269437,-0.136505,-0.147671,-0.085855
3,1.322072,0.865969,0.656999,3.148455,1.016168,0.563143,0.177624,0.001486,1.510380,0.723473,...,6.244998,-0.249351,-0.288484,-0.119264,-0.240966,-0.212725,-0.269437,-0.136505,-0.147671,-0.085855
4,-0.081747,0.137667,0.466370,1.492669,1.016168,-0.561771,-1.175110,-0.174173,0.703357,-0.782232,...,-0.160128,-0.249351,-0.288484,-0.119264,-0.240966,-0.212725,-0.269437,-0.136505,-0.147671,11.647603
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1635,1.322072,1.163910,1.085912,0.122363,2.725415,1.688058,-0.314480,-0.510646,1.187967,2.229178,...,-0.160128,4.010403,-0.288484,-0.119264,-0.240966,-0.212725,-0.269437,-0.136505,-0.147671,-0.085855
1636,-0.081747,-0.524426,-1.344596,-0.140279,-0.693079,0.563143,0.298995,0.078182,-0.600142,-0.782232,...,-0.160128,-0.249351,-0.288484,-0.119264,-0.240966,-0.212725,-0.269437,-0.136505,-0.147671,-0.085855
1637,-0.081747,0.071458,0.704656,0.002461,-0.693079,-0.561771,-0.045257,0.357752,-0.376629,0.723473,...,-0.160128,-0.249351,-0.288484,-0.119264,-0.240966,-0.212725,-0.269437,-0.136505,-0.147671,-0.085855
1638,-0.081747,-0.060961,-0.677398,0.162330,-0.693079,-0.561771,-1.274413,-1.685828,-1.013543,-0.782232,...,-0.160128,-0.249351,-0.288484,-0.119264,-0.240966,-0.212725,-0.269437,-0.136505,-0.147671,-0.085855


In [41]:
from sklearn.linear_model import LinearRegression

In [42]:
lasso = Lasso()
ridge = Ridge()

# Ridge

In [43]:
ridge_alphas = np.logspace(0,5,100)
ridge_cv = RidgeCV(alphas = ridge_alphas, scoring = 'r2', cv=5)
ridge_cv.fit(X_train, y_train);

In [44]:
ridge_cv;

In [45]:
ridge_cv.alpha_

148.4968262254465

In [46]:
ridge = Ridge()

In [47]:
ridge.fit(X_train, y_train)

Ridge()

In [36]:
scores = cross_val_score(ridge, X_train, y_train, cv=5)

In [37]:
scores.mean()

0.8245907919779585

In [47]:
X_test = ss.fit_transform(X_test)

In [48]:
predicts = ridge.predict(X_test)

In [49]:
r2_score(y_test,predicts)

0.8646298260099378

In [50]:
mean_squared_error(y_test,predicts,squared=False)

28360.25363372494

In [51]:
mean_squared_error(y_test,predicts)

804303986.1692086

# Lasso

In [52]:
lasso_alphas = np.logspace(-3,0,100)
lasso_cv = LassoCV(alphas=lasso_alphas,
                  cv = 5,
                  n_jobs=-1,
                  max_iter = 100_000)

lasso_cv.fit(X_train, y_train);

In [53]:
lasso_cv.alpha_

1.0

In [54]:
lasso_cv.score(X_train, y_train)

0.8424391744672053

In [55]:
lasso_cv.score(X_test, y_test)

0.8648631163607015

In [56]:
predicts = lasso_cv.predict(X_test)
r2_score(y_test,predicts)

0.8648631163607015

In [57]:
mean_squared_error(y_test,predicts,squared=False)

28335.8057576554

# Conclusion

In [25]:
list(zip(X_train.columns,ridge.coef_))

[(0, 14909.164995928902),
 (1, 2420.2324098655063),
 (2, 2887.6419301205224),
 (3, 4286.233403739601),
 (4, 6191.903737141519),
 (5, 6238.578967707001),
 (6, 1558.2243236014208),
 (7, 5551.367094958319),
 (8, 18833.784471085335),
 (9, 7251.374926900733),
 (10, 12893.671014968315),
 (11, -2130.6878108053047),
 (12, -1502.6285782915731),
 (13, -2931.025488483259),
 (14, -261.17707535609435),
 (15, 2341.1077010155145),
 (16, -1510.2127293316207),
 (17, 4187.016679940612),
 (18, -3440.872420099574),
 (19, -759.9379993411811),
 (20, -869.5595347662908),
 (21, 3412.846479628183),
 (22, -2639.8007847223066),
 (23, -541.5625789916048),
 (24, -1747.6452744219891),
 (25, -560.1819968359973),
 (26, -238.73913906779495),
 (27, -1722.4460681509715),
 (28, -911.4837065035784),
 (29, 4949.312595757364),
 (30, 8769.60137220074),
 (31, -3964.369710940551),
 (32, -950.4346380827681),
 (33, -89.48228783320556),
 (34, -2537.187990764177),
 (35, -1310.7443273049707),
 (36, 7710.188112812017),
 (37, 2128.14