In [1]:
# importing basic libraries for analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train=pd.read_csv("train.csv") #training data

In [3]:
test=pd.read_csv("test.csv")    #test data

In [4]:
train.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


### Preprocessing the datasets

In [5]:
# Checking for null values
train.isnull().sum()

Id                  0
MSSubClass          0
MSZoning            0
LotFrontage       259
LotArea             0
Street              0
Alley            1369
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType          8
MasVnrArea          8
ExterQual           0
ExterCond           0
Foundation          0
                 ... 
BedroomAbvGr        0
KitchenAbvGr        0
KitchenQual         0
TotRmsAbvGrd        0
Functional          0
Fireplaces          0
FireplaceQu       690
GarageType         81
GarageYrBlt        81
GarageFinish       81
GarageCars          0
GarageArea          0
GarageQual         81
GarageCond         81
PavedDrive

In [6]:
# dropping columns with too many null values
col_drop = ['LotFrontage','Alley','FireplaceQu','PoolQC','Fence','MiscFeature']
train.drop(col_drop, axis=1, inplace=True)
test.drop(col_drop, axis=1, inplace=True)

In [7]:
col_median = ['MasVnrArea','GarageYrBlt','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','GarageArea']
for col in col_median:
    train[col].fillna(train[col].median(), inplace=True)
    test[col].fillna(test[col].median(), inplace=True)

In [8]:
col_mode = ['MasVnrType','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2',
                    'Electrical','GarageType','GarageFinish','GarageQual','GarageCond','MSZoning',
                   'Utilities','Exterior1st','Exterior2nd','KitchenQual','Functional','SaleType',
                   'BsmtFullBath','BsmtHalfBath','GarageCars']
for col in col_mode:
    train[col].fillna(train[col].mode()[0], inplace=True)
    test[col].fillna(test[col].mode()[0], inplace=True)

In [9]:
# Encoding column datatypes
from sklearn.preprocessing import LabelEncoder
col_label = []
for col in train.columns:
    if train[col].dtypes == 'object':
        col_label.append(col)
for col in col_label:
    train[col]= LabelEncoder().fit_transform(train[col]) 
    test[col]= LabelEncoder().fit_transform(test[col]) 

In [10]:
train.shape

(1460, 75)

In [12]:
X=train.drop('SalePrice',axis=1) # non-target attributes
y=train['SalePrice']             #target attributes

In [13]:
from sklearn.model_selection import train_test_split as tts
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.3,random_state=0)

## Building our model and predicting prices

In [14]:
from sklearn.ensemble import RandomForestRegressor as rfr
model=rfr()
model.fit(X_train,y_train)

RandomForestRegressor()

In [15]:
y_pred=model.predict(X_test)

### Checking the performance of our model

In [22]:
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
print(f'Mean Squared error: {mse(y_pred,y_test)} ')
print(f'Accuracy score of our model: {r2_score(y_pred,y_test)} ')


Mean Squared error: 901420720.3043108 
Accuracy score of our model: 0.8354475542789652 


<br><br><br><br><b>The link to my collab notebook:</b><br>https://colab.research.google.com/drive/1narmjgQPYVnqeOBwlH9mOXVnV86ELmTQ?usp=sharing