In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import skew
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet, Lasso
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
import warnings

# deal with potential warnings
warnings.filterwarnings(action='ignore')

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [4]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'],
                  test.loc[:,'MSSubClass':'SaleCondition']))

In [6]:
data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [7]:
train["SalePrice"] = np.log1p(train["SalePrice"])

In [8]:
numeric_feats = data.dtypes[data.dtypes != "object"].index
skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index

data[skewed_feats] = np.log1p(data[skewed_feats])

In [9]:
data = pd.get_dummies(data, drop_first=True)

In [10]:
data = data.fillna(data.mean())
scaler = StandardScaler()
data[skewed_feats] = scaler.fit_transform(data[skewed_feats])
data.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0.419418,-0.020358,-0.103719,7,5,2003,2003,1.21942,0.783352,-0.362698,...,0,0,0,0,1,0,0,0,1,0
1,-1.120845,0.619103,0.146544,6,8,1976,1976,-0.80177,0.89257,-0.362698,...,0,0,0,0,1,0,0,0,1,0
2,0.419418,0.11844,0.457629,7,5,2001,2002,1.146941,0.658276,-0.362698,...,0,0,0,0,1,0,0,0,1,0
3,0.638691,-0.266348,0.136301,7,5,1915,1970,-0.80177,0.387039,-0.362698,...,0,0,0,0,1,0,0,0,0,0
4,0.419418,0.769612,0.922662,8,5,2000,2000,1.440386,0.758231,-0.362698,...,0,0,0,0,1,0,0,0,1,0


In [11]:
#creating matrices for sklearn:
x_train = data[:train.shape[0]]
x_test = data[train.shape[0]:]
y = pd.DataFrame(train.SalePrice)

#### Linear regression
#### It doesn't need a random state as sklearn uses closed-form solution instead of  gradient descent

In [12]:
model_Linear = LinearRegression()
model_Linear.fit(x_train, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [13]:
print('mean squared error:')
-np.mean(cross_val_score(model_Linear, x_train, y, cv=5, scoring='neg_mean_squared_error'))

mean squared error:


0.025433746615611586

In [14]:
y_pre = model_Linear.predict(x_test).reshape(len(x_test), )

#### Ridge

In [15]:
param_grid = {'alpha':np.arange(9.9, 10.1, 0.01)}
model_Ridge = Ridge(random_state=42)
Ridge_clf = GridSearchCV(model_Ridge, param_grid, cv=5, scoring='neg_mean_squared_error')
Ridge_clf.fit(x_train, y)

GridSearchCV(cv=5, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=42, solver='auto', tol=0.001),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': array([ 9.9 ,  9.91,  9.92,  9.93,  9.94,  9.95,  9.96,  9.97,  9.98,
        9.99, 10.  , 10.01, 10.02, 10.03, 10.04, 10.05, 10.06, 10.07,
       10.08, 10.09])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [16]:
print('Best alpha for Ridge regression is:', Ridge_clf.best_estimator_.alpha)
print('Score for best Ridge model is:', -Ridge_clf.best_score_)

Best alpha for Ridge regression is: 10.089999999999996
Score for best Ridge model is: 0.016426495697956345


In [None]:
y_pre = Ridge_clf.best_estimator_.predict(x_test).reshape(len(x_test), )

#### Lasso

In [17]:
param_grid = {'alpha':np.arange(0.00068, 0.00071, 0.000001)}
model_Lasso = Lasso(random_state=42)
Lasso_clf = GridSearchCV(model_Lasso, param_grid, cv=5, scoring='neg_mean_squared_error')
Lasso_clf.fit(x_train, y)

GridSearchCV(cv=5, error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=42,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': array([0.00068, 0.00068, 0.00068, 0.00068, 0.00068, 0.00069, 0.00069,
       0.00069, 0.00069, 0.00069, 0.00069, 0.00069, 0.00069, 0.00069,
       0.00069, 0.0007 , 0.0007 , 0.0007 , 0.0007 , 0.0007 , 0.0007 ,
       0.0007 , 0.0007 , 0.0007 , 0.0007 , 0.00071, 0.00071, 0.00071,
       0.00071, 0.00071])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [18]:
print('Best alpha for Lasso regression is:', Lasso_clf.best_estimator_.alpha)
print('Score for best Lasso model is:', -Lasso_clf.best_score_)

Best alpha for Lasso regression is: 0.0007000000000000005
Score for best Lasso model is: 0.016143514666083943


In [None]:
y_pre = Lasso_clf.best_estimator_.predict(x_test).reshape(len(x_test), )

#### ElasticNet

In [32]:
model_ENet = ElasticNet(random_state=42)
param_grid = {'alpha':np.arange(0.0009, 0.0019, 0.00001)}
Enet_clf = GridSearchCV(model_ENet, param_grid, cv=5, scoring='neg_mean_squared_error')
Enet_clf.fit(x_train, y)

GridSearchCV(cv=5, error_score='raise',
       estimator=ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=42, selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': array([0.0009 , 0.00091, ..., 0.00188, 0.00189])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [33]:
print('Best alpha for Enet regression is:', Enet_clf.best_estimator_.alpha)
print('Score for best Enet model is:', -Enet_clf.best_score_)

Best alpha for Enet regression is: 0.0011800000000000007
Score for best Enet model is: 0.01610486259429746


In [None]:
y_pre = Enet_clf.best_estimator_.predict(x_test).reshape(len(x_test), )

#### submission

In [21]:
test_id = pd.read_csv('test.csv').Id
price = np.expm1(y_pre)
solution = pd.DataFrame({"id":test_id, "SalePrice":price})
solution.to_csv("final_submission.csv", index = False)