In [1]:
#import required packages
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train.shape

(1460, 81)

In [4]:
test.shape

(1459, 80)

In [5]:
#log transforming sale price to transform it into gaussian distribution

target = np.log(train.SalePrice)

In [6]:
#seeking only the numeric features from the data
numeric_features = train.select_dtypes(include = [np.number])
numeric_features.dtypes

In [7]:
#features with the most correlation with the predictor variable
corr = numeric_features.corr()
print(corr['SalePrice'].sort_values(ascending = False)[:5], '\n')
print(corr['SalePrice'].sort_values(ascending = False)[-5:])

SalePrice      1.000000
OverallQual    0.790982
GrLivArea      0.708624
GarageCars     0.640409
GarageArea     0.623431
Name: SalePrice, dtype: float64 

YrSold          -0.028923
OverallCond     -0.077856
MSSubClass      -0.084284
EnclosedPorch   -0.128578
KitchenAbvGr    -0.135907
Name: SalePrice, dtype: float64


In [8]:
train.OverallQual.unique()

array([ 7,  6,  8,  5,  9,  4, 10,  3,  1,  2])

In [9]:
#removing the outliers
train = train[train['GarageArea'] < 1200]

In [10]:
#checking the null values
nulls = pd.DataFrame(train.isnull().sum().sort_values(ascending = False)[:25])
nulls.columns = ['Null Count']
nulls.index.name = 'Feature'
nulls[:5]

Unnamed: 0_level_0,Null Count
Feature,Unnamed: 1_level_1
PoolQC,1449
MiscFeature,1402
Alley,1364
Fence,1174
FireplaceQu,689


In [11]:
print('Unique values are:', train.MiscFeature.unique())

Unique values are: [nan 'Shed' 'Gar2' 'Othr' 'TenC']


In [12]:
#analysing the categorical data
categoricals = train.select_dtypes(exclude= [np.number])
categoricals.describe()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
count,1455,1455,91,1455,1455,1455,1455,1455,1455,1455,...,1374,1374,1374,1374,1455,6,281,53,1455,1455
unique,5,2,2,4,4,2,5,3,25,9,...,6,3,5,5,3,3,4,4,9,6
top,RL,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,...,Attchd,Unf,TA,TA,Y,Gd,MnPrv,Shed,WD,Normal
freq,1147,1450,50,921,1309,1454,1048,1378,225,1257,...,867,605,1306,1321,1335,2,157,48,1266,1196


In [13]:
print ("Original: \n") 
print (train.Street.value_counts(), "\n")

Original: 

Pave    1450
Grvl       5
Name: Street, dtype: int64 



In [14]:
#One-hot encoding to convert the categorical data into integer data
train['enc_street'] = pd.get_dummies(train.Street, drop_first= True)
test['enc_street'] = pd.get_dummies(test.Street, drop_first= True)

In [15]:
print('Encoded: \n')
print(train.enc_street.value_counts())

Encoded: 

1    1450
0       5
Name: enc_street, dtype: int64


In [16]:
def encode(x): 
    if x == 'Partial':
        return 1
    else:
        return 0        

In [17]:
#Treating partial as one class and other all sale condition as other
train['enc_condition'] = train.SaleCondition.apply(encode)
test['enc_condition'] = test.SaleCondition.apply(encode)

In [18]:
#Handling the missing values by interpolation
data = train.select_dtypes(include= [np.number]).interpolate().dropna()



In [32]:
data1 = train.select_dtypes(include= [np.number]).interpolate().dropna()

In [19]:
#Verifying missing values
sum(data.isnull().sum() != 0)

0

In [33]:
#Verifying missing values
sum(data1.isnull().sum() != 0)

0

In [20]:
#log transforming the target variable to improve the linearity of the regression
y = np.log(train.SalePrice)
#dropping the target variable and the index from the training set
X = data.drop(['SalePrice', 'Id'], axis = 1)

In [21]:
#splitting the data into training and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = .33)

In [22]:
#Linear regression model
from sklearn import linear_model
lr = linear_model.LinearRegression()

In [23]:
#fitting linear regression on the data
model = lr.fit(X_train, y_train)

In [25]:
#predicting on the test set
predictions = model.predict(X_test)

In [26]:
#evaluating the model on mean square error
from sklearn.metrics import mean_squared_error, accuracy_score
print('RMSE is {}'.format(mean_squared_error(y_test, predictions)))

RMSE is 0.017841794519568334


In [38]:
for p, y in zip(predictions, y_test):
    print(p, y)

11.913904366252964 12.122691036591123
12.049030471963505 12.122147410592412
11.773519707714978 11.884489021402711
11.791837536602713 11.652687407345388
11.269447229386294 11.33857207782545
11.3625933578852 11.231887935237117
12.336240695061015 12.649154617209645
11.706926212246858 11.560762794384388
12.02044887632714 12.078239274020289
11.89282643153847 11.816726919301892
11.80303488152218 11.813030057420567
12.525491235898748 12.791356180214175
12.235830764415901 12.271392111650817
12.662959872018778 12.691580461311874
11.902966354591179 11.881034786534624
11.621284175561044 11.693161515273161
12.19191550275396 12.119969946476761
11.619693169213493 11.84653646931041
12.37146944687832 12.373703486914124
12.273163058557568 12.328290278254423
11.333279417885343 11.289781913656018
12.363036703890721 12.542544882151386
11.448048588678922 11.831379196088763
11.721340092351369 11.792449349720545
12.191047163874327 12.165250651009918
11.911617692567713 11.94405831982697
11.643382601881584 11.