In [1]:
# Dependencies
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Load data into dataframes
df = pd.read_csv('resources/train.csv', index_col=0)
df.head(10)

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000
6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,Inside,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,8,2007,WD,Normal,307000
8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,Shed,350,11,2009,WD,Normal,200000
9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2008,WD,Abnorml,129900
10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,Corner,...,0,,,,0,1,2008,WD,Normal,118000


In [3]:
df['LotFrontage'].fillna(value=0, inplace=True)
df['MasVnrArea'].fillna(value=0, inplace=True)

df.drop(columns='GarageYrBlt', inplace=True)

In [4]:
# Insert dummy variables for categorical data
df_num = pd.get_dummies(df)

In [5]:
# Split train and testing data and separate X, y
train_df, test_df = train_test_split(df_num, random_state=42)

X_train = train_df.drop(columns='SalePrice')
X_test = test_df.drop(columns='SalePrice')

y_train = train_df['SalePrice'].values
y_test = test_df['SalePrice'].values

In [6]:
# Scale data
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
# fit linear regression model
model = Ridge(alpha=20)
model.fit(X_train_scaled, y_train)

Ridge(alpha=20)

In [8]:
model.score(X_train_scaled, y_train)

0.9355823690264344

In [9]:
model.score(X_test_scaled, y_test)

0.8986651721073147

In [10]:
predicted = model.predict(X_test_scaled)

In [11]:
pct_diff = []
for i in range(len(predicted)):
    p = predicted[i]
    a = y_test[i]
    
    pct_diff.append(round((p - a)/a * 100))

In [12]:
pd.DataFrame({'predicted': predicted, 'actual': y_test, 'pct_diff': pct_diff}).head(50)

Unnamed: 0,predicted,actual,pct_diff
0,158004.920094,154500,2.0
1,349711.527694,325000,8.0
2,83647.727158,115000,-27.0
3,182029.517831,159000,14.0
4,322763.700252,315500,2.0
5,67916.845469,75500,-10.0
6,235644.679645,311500,-24.0
7,146776.289348,146000,1.0
8,59080.170064,84500,-30.0
9,150485.460598,135500,11.0


In [13]:
# Match model coefficients with features
coef_df = pd.DataFrame({'Feature': X_train.columns.values, 
                        'Coef': model.coef_.round(),
                        'Coef_Abs': abs(model.coef_.round())})
coef_df

Unnamed: 0,Feature,Coef,Coef_Abs
0,MSSubClass,-1439.0,1439.0
1,LotFrontage,346.0,346.0
2,LotArea,6516.0,6516.0
3,OverallQual,9361.0,9361.0
4,OverallCond,5641.0,5641.0
...,...,...,...
282,SaleCondition_AdjLand,55.0,55.0
283,SaleCondition_Alloca,1425.0,1425.0
284,SaleCondition_Family,-776.0,776.0
285,SaleCondition_Normal,-164.0,164.0


In [14]:
# Sort by largest coefficients
coef_df.sort_values(by=['Coef_Abs'], ascending=False).head(50)

Unnamed: 0,Feature,Coef,Coef_Abs
123,RoofMatl_ClyTile,-17206.0,17206.0
15,GrLivArea,12511.0,12511.0
13,2ndFlrSF,10555.0,10555.0
3,OverallQual,9361.0,9361.0
11,TotalBsmtSF,8717.0,8717.0
100,Condition2_PosN,-8144.0,8144.0
5,YearBuilt,7812.0,7812.0
8,BsmtFinSF1,7525.0,7525.0
248,GarageQual_Ex,7429.0,7429.0
2,LotArea,6516.0,6516.0


In [15]:
# tune alpha
alpha = [1, 5, 10, 20, 50, 100, 1000, 10000]

for a in alpha:
    model = Ridge(alpha=a)
    model.fit(X_train_scaled, y_train)
    
    print(f'alpha: {a}')
    print(f'model score: {model.score(X_test_scaled, y_test)}')
    print('---------------------------------------')

alpha: 1
model score: 0.8925199599683801
---------------------------------------
alpha: 5
model score: 0.8971931607876975
---------------------------------------
alpha: 10
model score: 0.897990554442712
---------------------------------------
alpha: 20
model score: 0.8986651721073147
---------------------------------------
alpha: 50
model score: 0.8984116890185245
---------------------------------------
alpha: 100
model score: 0.89622214040623
---------------------------------------
alpha: 1000
model score: 0.86653235783008
---------------------------------------
alpha: 10000
model score: 0.6703865716342648
---------------------------------------


## Simplified Model

In [16]:
df.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageFinish',
       'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive',
       'WoodDeckSF', 'Ope

In [17]:
# columns to keep
cols = ['GrLivArea', 'Neighborhood', 'OverallQual', 'OverallCond', 'YearBuilt', 'ExterQual', 'TotalBsmtSF', 
       'BsmtFinSF1', 'LotArea', 'TotRmsAbvGrd', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenQual', 
        'BldgType', 'HouseStyle', 'GarageCars', 'SalePrice']

In [18]:
df_simple = df[cols]
df_simple.head()

Unnamed: 0_level_0,GrLivArea,Neighborhood,OverallQual,OverallCond,YearBuilt,ExterQual,TotalBsmtSF,BsmtFinSF1,LotArea,TotRmsAbvGrd,FullBath,HalfBath,BedroomAbvGr,KitchenQual,BldgType,HouseStyle,GarageCars,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,1710,CollgCr,7,5,2003,Gd,856,706,8450,8,2,1,3,Gd,1Fam,2Story,2,208500
2,1262,Veenker,6,8,1976,TA,1262,978,9600,6,2,0,3,TA,1Fam,1Story,2,181500
3,1786,CollgCr,7,5,2001,Gd,920,486,11250,6,2,1,3,Gd,1Fam,2Story,2,223500
4,1717,Crawfor,7,5,1915,TA,756,216,9550,7,1,0,3,Gd,1Fam,2Story,3,140000
5,2198,NoRidge,8,5,2000,Gd,1145,655,14260,9,2,1,4,Gd,1Fam,2Story,3,250000


In [19]:
# Change quality categorical data to numerical (eg. 0-5)
qual = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}

df_simple['ExterQual'] = [qual[x] if x is not np.nan else 1 for x in df_simple['ExterQual']]
df_simple['KitchenQual'] = [qual[x] if x is not np.nan else 1 for x in df_simple['KitchenQual']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [20]:
# Combine full bath and half bath
df_simple['Bath'] = df_simple['FullBath'] + df_simple['HalfBath'] * 0.5

df_simple.drop(columns=['FullBath', 'HalfBath'], inplace=True)

df_simple

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0_level_0,GrLivArea,Neighborhood,OverallQual,OverallCond,YearBuilt,ExterQual,TotalBsmtSF,BsmtFinSF1,LotArea,TotRmsAbvGrd,BedroomAbvGr,KitchenQual,BldgType,HouseStyle,GarageCars,SalePrice,Bath
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,1710,CollgCr,7,5,2003,4,856,706,8450,8,3,4,1Fam,2Story,2,208500,2.5
2,1262,Veenker,6,8,1976,3,1262,978,9600,6,3,3,1Fam,1Story,2,181500,2.0
3,1786,CollgCr,7,5,2001,4,920,486,11250,6,3,4,1Fam,2Story,2,223500,2.5
4,1717,Crawfor,7,5,1915,3,756,216,9550,7,3,4,1Fam,2Story,3,140000,1.0
5,2198,NoRidge,8,5,2000,4,1145,655,14260,9,4,4,1Fam,2Story,3,250000,2.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,1647,Gilbert,6,5,1999,3,953,0,7917,7,3,3,1Fam,2Story,2,175000,2.5
1457,2073,NWAmes,6,6,1978,3,1542,790,13175,7,3,3,1Fam,1Story,2,210000,2.0
1458,2340,Crawfor,7,9,1941,5,1152,275,9042,9,4,4,1Fam,2Story,1,266500,2.0
1459,1078,NAmes,5,6,1950,3,1078,49,9717,5,2,4,1Fam,1Story,1,142125,1.0


In [21]:
# Insert dummy variables for categorical data
df_simple_num = pd.get_dummies(df_simple)

# Split train and testing data and separate X, y
train_df, test_df = train_test_split(df_simple_num, random_state=42)

X_train = train_df.drop(columns='SalePrice')
X_test = test_df.drop(columns='SalePrice')

y_train = train_df['SalePrice'].values
y_test = test_df['SalePrice'].values

In [22]:
# Scale data
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [23]:
# fit linear regression model
model = Ridge(alpha=1)
model.fit(X_train_scaled, y_train)

Ridge(alpha=1)

In [24]:
model.score(X_train_scaled, y_train)

0.8411858205687788

In [25]:
model.score(X_test_scaled, y_test)

0.864767217574047

In [26]:
predicted = model.predict(X_test_scaled)

pct_diff = []
for i in range(len(predicted)):
    p = predicted[i]
    a = y_test[i]
    
    pct_diff.append(round((p - a)/a * 100))

pd.DataFrame({'predicted': predicted, 'actual': y_test, 'pct_diff': pct_diff}).head(50)

Unnamed: 0,predicted,actual,pct_diff
0,147950.026535,154500,-4.0
1,331210.136027,325000,2.0
2,110928.002216,115000,-4.0
3,181445.778118,159000,14.0
4,287775.770297,315500,-9.0
5,71874.879156,75500,-5.0
6,251532.971742,311500,-19.0
7,132255.738264,146000,-9.0
8,57572.970013,84500,-32.0
9,131655.632758,135500,-3.0


In [27]:
# Match model coefficients with features
coef_df = pd.DataFrame({'Feature': X_train.columns.values, 
                        'Coef': model.coef_.round(),
                        'Coef_Abs': abs(model.coef_.round())})

# Sort by largest coefficients
coef_df.sort_values(by=['Coef_Abs'], ascending=False)

Unnamed: 0,Feature,Coef,Coef_Abs
0,GrLivArea,24725.0,24725.0
1,OverallQual,16937.0,16937.0
29,Neighborhood_NridgHt,10753.0,10753.0
3,YearBuilt,10421.0,10421.0
35,Neighborhood_StoneBr,7683.0,7683.0
28,Neighborhood_NoRidge,7409.0,7409.0
6,BsmtFinSF1,7338.0,7338.0
11,GarageCars,6930.0,6930.0
38,BldgType_1Fam,5998.0,5998.0
10,KitchenQual,5864.0,5864.0
