In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import sem
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor

# Read CSV file

In [2]:
# Location of the raw training set, relative to the notebook's working directory.
data = "Data/train.csv"
# Parse the CSV into a DataFrame using pandas' default settings.
df = pd.read_csv(filepath_or_buffer=data)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# Clean Data

In [3]:
# Remove five columns outright, then discard every row that still has a missing
# value in any remaining column.
# NOTE(review): these columns are presumably dropped because they are mostly
# empty (the preview above shows NaN for several of them) -- confirm.
# errors="ignore" keeps the cell re-runnable: `df` is reassigned here, so a second
# execution would otherwise raise KeyError because the columns are already gone.
df = df.drop(
    columns=["FireplaceQu", "Fence", "Alley", "MiscFeature", "PoolQC"],
    errors="ignore",
).dropna()
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [4]:
# One-hot encode the categorical columns. Indicator columns are named
# "<column>_<level>" ("_" is the default separator), and drop_first=True omits
# the first level of each encoded column.
df = pd.get_dummies(data=df, drop_first=True)
df.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,0,0,0,1,0,0,0,1,0
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,0,0,0,0,1,0,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,0,0,0,1,0,0,0,1,0
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,0,0,0,1,0,0,0,0,0
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,0,0,0,0,1,0,0,0,1,0


In [5]:
# Write the cleaned, encoded frame to disk. The row index is left out; the
# header row is written by default.
df.to_csv("Data/cleaned_data.csv", index=False)

In [6]:
# Target vector: the sale price column.
Y = df["SalePrice"]
# Feature matrix: every column except the target and "Id"
# (presumably a row identifier with no predictive meaning -- confirm).
X = df.drop(["SalePrice", "Id"], axis="columns")

In [7]:
# Fit a random forest on all rows to obtain feature importances.
# random_state is pinned because the forest's sampling is randomised: without a
# seed the importances (and any feature selection built on them) change on
# every run, so the notebook would not reproduce its own outputs.
rf = RandomForestRegressor(random_state=1).fit(X, Y)
# R^2 on the same rows the forest was trained on -- an optimistic, in-sample
# figure, not an estimate of performance on unseen data.
rf.score(X, Y)

0.9779108481435719

In [8]:
# Pair each importance with its column name, then order the pairs from most to
# least important (ties fall back to comparing the column names).
srtd_fi = list(zip(rf.feature_importances_, X.columns))
srtd_fi.sort(reverse=True)
srtd_fi

[(0.6132365538280576, 'OverallQual'),
 (0.09279866253462685, 'GrLivArea'),
 (0.03703729077307207, '2ndFlrSF'),
 (0.026632646705958735, 'BsmtFinSF1'),
 (0.026213564114611548, 'TotalBsmtSF'),
 (0.025630576061429135, '1stFlrSF'),
 (0.020818407029797936, 'FullBath'),
 (0.012652092646263425, 'LotArea'),
 (0.012414574552681917, 'GarageCars'),
 (0.00953107888033426, 'TotRmsAbvGrd'),
 (0.009066202561318758, 'GarageArea'),
 (0.008427429365096812, 'YearBuilt'),
 (0.008363602098874627, 'YearRemodAdd'),
 (0.008146871155616534, 'LotFrontage'),
 (0.005892431834947901, 'BsmtUnfSF'),
 (0.005207566567443543, 'OpenPorchSF'),
 (0.005065590206842762, 'GarageYrBlt'),
 (0.004570833046609516, 'MasVnrArea'),
 (0.004239482755482633, 'OverallCond'),
 (0.004069203621247924, 'WoodDeckSF'),
 (0.0037596960192672023, 'MoSold'),
 (0.002968030341610701, 'GarageType_Detchd'),
 (0.00280439610911334, 'Fireplaces'),
 (0.0026540671091294767, 'BsmtQual_Gd'),
 (0.001965977332462864, 'GarageFinish_Unf'),
 (0.00192205505279874

In [9]:
# Keep the names of features whose importance is at least 0.01, preserving the
# descending-importance order of srtd_fi.
cols = [column for fi, column in srtd_fi if fi >= 0.01]

cols

['OverallQual',
 'GrLivArea',
 '2ndFlrSF',
 'BsmtFinSF1',
 'TotalBsmtSF',
 '1stFlrSF',
 'FullBath',
 'LotArea',
 'GarageCars']

In [None]:
from sklearn.model_selection import train_test_split

# Split features and target into train and test partitions. No test_size is
# given, so scikit-learn's default proportions apply; random_state=1 makes the
# shuffle repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=1,
)

In [None]:
# Number of rows in the training partition.
x_train.shape[0]

In [None]:
# Number of rows in the held-out test partition.
x_test.shape[0]

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit the linear baseline on the training partition only. Fitting on the full
# X, Y (as before) left the train/test split unused and made any score an
# in-sample figure.
model = LinearRegression()
model.fit(x_train, y_train)
# R^2 on the held-out rows the model has not seen during fitting.
model.score(x_test, y_test)

# Create Train Test Split