In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import sem
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor

# Read CSV file

In [2]:
data = "Data/train.csv"
df = pd.read_csv(data)
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# Clean Data

In [3]:
df = df.drop(columns=["FireplaceQu", "Fence", "Alley", "MiscFeature", "PoolQC"]).dropna()
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [4]:
df = pd.get_dummies(df, prefix_sep='_', drop_first=True)
df.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,0,0,0,1,0,0,0,1,0
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,0,0,0,0,1,0,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,0,0,0,1,0,0,0,1,0
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,0,0,0,1,0,0,0,0,0
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,0,0,0,0,1,0,0,0,1,0


In [5]:
X = df.drop(columns = ["SalePrice","Id"])
Y = df["SalePrice"]

In [12]:
rf = RandomForestRegressor()
rf = rf.fit(X, Y)
rf.score(X, Y)

0.979012962640126

In [13]:
srtd_fi = sorted(zip(rf.feature_importances_, X.columns), reverse=True)
srtd_fi

[(0.6096068011663658, 'OverallQual'),
 (0.08672944226165362, 'GrLivArea'),
 (0.04025351937732423, '2ndFlrSF'),
 (0.0266631618715602, '1stFlrSF'),
 (0.026120029858384464, 'BsmtFinSF1'),
 (0.02452598873077822, 'TotalBsmtSF'),
 (0.02399779806279528, 'FullBath'),
 (0.014540976376787423, 'LotArea'),
 (0.010229035718796561, 'GarageArea'),
 (0.009870726514603639, 'TotRmsAbvGrd'),
 (0.009499933183843002, 'GarageCars'),
 (0.009245747008900568, 'YearRemodAdd'),
 (0.008742296886419173, 'YearBuilt'),
 (0.007861664638568038, 'LotFrontage'),
 (0.006076963798691014, 'OpenPorchSF'),
 (0.005356668133515764, 'BsmtUnfSF'),
 (0.004841522932756324, 'BsmtQual_Gd'),
 (0.004774237929413179, 'MasVnrArea'),
 (0.004754751953391492, 'GarageYrBlt'),
 (0.00471156119331117, 'WoodDeckSF'),
 (0.0045032288661078865, 'OverallCond'),
 (0.003767003873472983, 'MoSold'),
 (0.0030948366688023483, 'GarageType_Detchd'),
 (0.0025896736427806362, 'Fireplaces'),
 (0.0018574511043616335, 'ExterQual_Gd'),
 (0.0016647241985763995, '

In [14]:
# keep values above 0.01 
cols = []

for fi, column in srtd_fi: 
    if fi >= 0.01:
        cols.append(column)

cols

['OverallQual',
 'GrLivArea',
 '2ndFlrSF',
 '1stFlrSF',
 'BsmtFinSF1',
 'TotalBsmtSF',
 'FullBath',
 'LotArea',
 'GarageArea']

In [6]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X,Y,random_state=1)

In [7]:
len(x_train)

820

In [8]:
len(x_test)

274

In [9]:
from sklearn.linear_model import LinearRegression

In [10]:
model = LinearRegression()
model.fit(X,Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [11]:
model.score(X,Y)

0.9322311970220337

# Create Train Test Split