In [179]:
import pandas as pd                 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model

In [180]:
# Load the labelled training table and the unlabelled test table from input/.
train = pd.read_csv(filepath_or_buffer='input/train.csv')
test = pd.read_csv(filepath_or_buffer='input/test.csv')

In [181]:
# Dimensions of the training frame as a (rows, columns) tuple.
train.shape

(1460, 81)

In [182]:
# Dimensions of the test frame as a (rows, columns) tuple.
test.shape

(1459, 80)

In [183]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [184]:
# Summary statistics of the raw SalePrice column.
train.SalePrice.describe()

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

In [185]:
# Natural log of the sale price; its skew is inspected in the next cell.
target = np.log(train['SalePrice'])

In [186]:
# Skewness of the log-transformed sale price.
target.skew()

0.12133506220520406

In [187]:
# Keep only the numeric columns ('number' is the string alias for np.number).
numeric_features = train.select_dtypes(include='number')

In [188]:
# Column-by-column dtypes of the numeric subset.
numeric_features.dtypes

Id                 int64
MSSubClass         int64
LotFrontage      float64
LotArea            int64
OverallQual        int64
OverallCond        int64
YearBuilt          int64
YearRemodAdd       int64
MasVnrArea       float64
BsmtFinSF1         int64
BsmtFinSF2         int64
BsmtUnfSF          int64
TotalBsmtSF        int64
1stFlrSF           int64
2ndFlrSF           int64
LowQualFinSF       int64
GrLivArea          int64
BsmtFullBath       int64
BsmtHalfBath       int64
FullBath           int64
HalfBath           int64
BedroomAbvGr       int64
KitchenAbvGr       int64
TotRmsAbvGrd       int64
Fireplaces         int64
GarageYrBlt      float64
GarageCars         int64
GarageArea         int64
WoodDeckSF         int64
OpenPorchSF        int64
EnclosedPorch      int64
3SsnPorch          int64
ScreenPorch        int64
PoolArea           int64
MiscVal            int64
MoSold             int64
YrSold             int64
SalePrice          int64
dtype: object

In [189]:
# Pairwise Pearson correlation matrix of the numeric columns.
corr = numeric_features.corr(method='pearson')

In [190]:
# First five entries of the descending correlation ranking against SalePrice
# (SalePrice itself ranks first with correlation 1).
corr['SalePrice'].sort_values(ascending=False).head(5)

SalePrice      1.000000
OverallQual    0.790982
GrLivArea      0.708624
GarageCars     0.640409
GarageArea     0.623431
Name: SalePrice, dtype: float64

In [191]:
# Last five entries of the descending correlation ranking against SalePrice.
corr['SalePrice'].sort_values(ascending=False).tail(5)

YrSold          -0.028923
OverallCond     -0.077856
MSSubClass      -0.084284
EnclosedPorch   -0.128578
KitchenAbvGr    -0.135907
Name: SalePrice, dtype: float64

In [192]:
# Table of the 25 columns with the most missing values: one 'Null Count'
# column, indexed by feature name.
nulls = (
    train.isnull()
    .sum()
    .sort_values(ascending=False)
    .head(25)
    .to_frame(name='Null Count')
    .rename_axis('Feature')
)

In [193]:
# Distinct values found in MiscFeature, including NaN for missing entries.
train.MiscFeature.unique()

array([nan, 'Shed', 'Gar2', 'Othr', 'TenC'], dtype=object)

In [194]:

# Everything that is not numeric is treated as categorical; describe() then
# reports count / unique / top / freq per column.
categoricals = train.select_dtypes(exclude='number')
categoricals.describe()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
count,1460,1460,91,1460,1460,1460,1460,1460,1460,1460,...,1379,1379,1379,1379,1460,7,281,54,1460,1460
unique,5,2,2,4,4,2,5,3,25,9,...,6,3,5,5,3,3,4,4,9,6
top,RL,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,...,Attchd,Unf,TA,TA,Y,Gd,MnPrv,Shed,WD,Normal
freq,1151,1454,50,925,1311,1459,1052,1382,225,1260,...,870,605,1311,1326,1340,3,157,49,1267,1198


In [195]:
# Frequency of each Street category in the training frame.
train.Street.value_counts()

Pave    1454
Grvl       6
Name: Street, dtype: int64

In [196]:

# Binary-encode Street: 1 for 'Pave', 0 for anything else (including missing).
#
# An explicit comparison is used instead of pd.get_dummies(..., drop_first=True)
# because get_dummies picks the surviving column from the categories present in
# each frame, so train and test could end up encoded differently (and a frame
# with a single Street value would yield no column at all). On pandas >= 2.0
# get_dummies also returns bool columns, which the later
# select_dtypes(include=[np.number]) step would drop from the feature matrix.
train['enc_street'] = train.Street.eq('Pave').astype(int)
test['enc_street'] = test.Street.eq('Pave').astype(int)

In [197]:
# Sanity check: the encoded counts should mirror the Street counts above.
train.enc_street.value_counts()

1    1454
0       6
Name: enc_street, dtype: int64

In [198]:
def encode(x):
    """Return 1 when ``x`` equals the string 'Partial', otherwise 0."""
    if x == 'Partial':
        return 1
    return 0
# Flag 'Partial' sale conditions with 1 (all other values 0) in both frames.
train['enc_condition'] = train['SaleCondition'].map(encode)
test['enc_condition'] = test['SaleCondition'].map(encode)

In [199]:
# Modelling frame: numeric columns only, gaps filled by interpolation, and any
# row that still contains a missing value removed.
data = (
    train
    .select_dtypes(include='number')
    .interpolate()
    .dropna()
)

In [200]:
# Check that every column has 0 null values (the expression below should be 0).
# sum(data.isnull().sum() != 0)


In [201]:
# Build the target and the feature matrix from the SAME cleaned frame.
# `data` has been through dropna(), so taking y from `train` instead could
# leave X and y with different row counts whenever a row was dropped.
y = np.log(data['SalePrice'])
# Drop the target itself and the Id column from the features.
X = data.drop(columns=['SalePrice', 'Id'])

In [202]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [203]:
# Inspect the training split of the feature matrix.
X_train

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,enc_street,enc_condition
615,85,80.000000,8800,6,7,1963,1963,156.0,763,0,...,0,0,0,0,0,0,5,2010,1,0
613,20,70.000000,8402,5,5,2007,2007,0.0,206,0,...,30,0,0,0,0,0,12,2007,1,1
1303,20,73.000000,8688,7,5,2005,2005,228.0,0,0,...,59,0,0,0,0,0,4,2006,1,0
486,20,79.000000,10289,5,7,1965,1965,168.0,836,0,...,0,0,0,0,0,0,6,2007,1,0
561,20,77.000000,10010,5,5,1974,1975,0.0,1071,123,...,38,0,0,0,0,0,4,2006,1,0
308,30,98.666667,12342,4,5,1940,1950,0.0,262,0,...,0,0,0,0,0,0,3,2009,1,0
461,70,60.000000,7200,7,9,1936,2007,0.0,350,210,...,0,0,0,0,0,0,4,2009,1,0
1142,60,77.000000,9965,8,5,2006,2007,340.0,1150,0,...,144,0,0,0,0,0,4,2007,1,1
730,120,39.000000,5389,8,5,1995,1996,0.0,1180,0,...,152,0,0,0,0,0,3,2010,1,0
1155,20,90.000000,10768,5,8,1976,2004,0.0,1157,0,...,21,0,0,180,0,0,7,2007,1,0


In [204]:
 # Unfitted scikit-learn LinearRegression estimator with default settings.
 lr = linear_model.LinearRegression()

In [205]:

# Fit on the training split. scikit-learn's fit() returns the estimator
# itself, so `model` and `lr` refer to the same fitted object.
model = lr.fit(X_train, y_train)

In [206]:
 # R^2 of the fitted model on the held-out split (the target is log SalePrice).
 model.score(X_test, y_test)

0.864746341057823

In [207]:
# from sklearn.linear_model import LogisticRegression

In [208]:
# logr=LogisticRegression()

In [209]:
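# NOTE: LogisticRegression is a classifier; fitting it on the continuous
# log-price target would fail, so this experiment is left disabled.
# logr.fit(X_train,y_train)
# logr.score(X_test, y_test)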