In [1]:
# Dependencies
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Read the training CSV; the first column of the file becomes the row index
df = pd.read_csv(filepath_or_buffer='Raw_Data/train.csv', index_col=0)
df.iloc[:10]

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000
6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,Inside,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,8,2007,WD,Normal,307000
8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,Shed,350,11,2009,WD,Normal,200000
9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2008,WD,Abnorml,129900
10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,Corner,...,0,,,,0,1,2008,WD,Normal,118000


In [3]:
# Replace missing LotFrontage / MasVnrArea values with 0 and drop GarageYrBlt.
# The result is assigned back to `df` rather than calling
# fillna(inplace=True) on a column selection: that chained in-place form
# emits a FutureWarning in pandas 2.x and no longer updates `df` once
# Copy-on-Write is enabled.
# NOTE(review): filling with 0 presumably encodes "feature not present" -- confirm.
df = (
    df.fillna({'LotFrontage': 0, 'MasVnrArea': 0})
      # errors='ignore' keeps the cell safe to re-run after the column is gone
      .drop(columns='GarageYrBlt', errors='ignore')
)

In [4]:
# One-hot encode the categorical columns; numeric columns pass through unchanged
df = pd.get_dummies(data=df)

In [5]:
# Partition the rows into train / test parts with a fixed seed, then peel the
# SalePrice target off each part (features stay DataFrames, targets become arrays)
train_df, test_df = train_test_split(df, random_state=1)

X_train, y_train = train_df.drop(columns='SalePrice'), train_df['SalePrice'].to_numpy()
X_test, y_test = test_df.drop(columns='SalePrice'), test_df['SalePrice'].to_numpy()

In [6]:
# Standardise the features; the scaler is fitted on the training split only
# and the same fitted transform is then applied to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
# Fit a ridge regression (default hyperparameters) on the scaled training data
model = Ridge().fit(X_train_scaled, y_train)
model

Ridge()

In [8]:
# Model score on the data it was fitted to
model.score(X=X_train_scaled, y=y_train)

0.9348390726503102

In [9]:
# Model score on the held-out test split
model.score(X=X_test_scaled, y=y_test)

0.8723834034368453

In [10]:
# Predicted SalePrice for every row of the scaled test split
predicted = model.predict(X=X_test_scaled)

In [11]:
# Signed percentage error of each prediction relative to the actual price,
# rounded to the nearest whole percent (negative = model under-predicted).
# Computed in one vectorised expression instead of an index loop, so no
# loop temporaries leak into the kernel namespace and the result is always
# floating point regardless of how the installed numpy rounds scalars.
# .tolist() keeps `pct_diff` a plain list, as before.
pct_diff = np.round((predicted - y_test) / y_test * 100).tolist()

In [12]:
# Side-by-side view of predictions, true prices and percentage error (first 50 rows)
pd.DataFrame(dict(predicted=predicted, actual=y_test, pct_diff=pct_diff)).iloc[:50]

Unnamed: 0,predicted,actual,pct_diff
0,213591.805504,231500,-8.0
1,155481.392712,179500,-13.0
2,111881.950511,122000,-8.0
3,79963.638837,84500,-5.0
4,134055.535566,142000,-6.0
5,356884.651546,325624,10.0
6,310511.029754,285000,9.0
7,147286.660598,151000,-2.0
8,208231.915317,195000,7.0
9,248850.840578,275000,-10.0


In [51]:
# Pair each feature name with its rounded model coefficient and add the
# coefficient magnitude as a separate column. The model was fitted on the
# standardised features, so the coefficients refer to scaled inputs.
coef_df = pd.DataFrame(
    {'Feature': X_train.columns.to_numpy(), 'Coef': np.round(model.coef_)}
).assign(Coef_Abs=lambda frame: frame['Coef'].abs())
coef_df

Unnamed: 0,Feature,Coef,Coef_Abs
0,MSSubClass,-1333.0,1333.0
1,LotFrontage,-303.0,303.0
2,LotArea,8159.0,8159.0
3,OverallQual,7551.0,7551.0
4,OverallCond,5896.0,5896.0
...,...,...,...
282,SaleCondition_AdjLand,-329.0,329.0
283,SaleCondition_Alloca,1455.0,1455.0
284,SaleCondition_Family,26.0,26.0
285,SaleCondition_Normal,2360.0,2360.0


In [53]:
# Show the 50 features whose coefficients have the greatest magnitude
coef_df.sort_values('Coef_Abs', ascending=False).iloc[:50]

Unnamed: 0,Feature,Coef,Coef_Abs
123,RoofMatl_ClyTile,-19003.0,19003.0
15,GrLivArea,14857.0,14857.0
13,2ndFlrSF,11901.0,11901.0
100,Condition2_PosN,-8892.0,8892.0
2,LotArea,8159.0,8159.0
8,BsmtFinSF1,7943.0,7943.0
84,Neighborhood_StoneBr,7766.0,7766.0
11,TotalBsmtSF,7658.0,7658.0
3,OverallQual,7551.0,7551.0
25,GarageArea,7366.0,7366.0
