In [1]:
import pandas as pd

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
train.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object

In [5]:
s = (train.dtypes == 'object')
object_cols = list(s[s].index)

object_cols

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [6]:
e = (train.isnull().any() == True)
missing_value_columns = e[e].index.tolist()
missing_value_columns

['LotFrontage',
 'Alley',
 'MasVnrType',
 'MasVnrArea',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [7]:
dropped_train = train.drop(missing_value_columns, axis=1)
dropped_test = test.drop(missing_value_columns, axis=1)

In [8]:
dtypes_of_missing_values = train[missing_value_columns].dtypes
dtypes_of_missing_values

LotFrontage     float64
Alley            object
MasVnrType       object
MasVnrArea      float64
BsmtQual         object
BsmtCond         object
BsmtExposure     object
BsmtFinType1     object
BsmtFinType2     object
Electrical       object
FireplaceQu      object
GarageType       object
GarageYrBlt     float64
GarageFinish     object
GarageQual       object
GarageCond       object
PoolQC           object
Fence            object
MiscFeature      object
dtype: object

In [9]:
f = (train[missing_value_columns].dtypes == float)
missing_float_types = f[f].index.tolist()
missing_float_types

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [10]:
imputed_train = train.copy()
imputed_test = test.copy()
imputed_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [11]:
imputed_train[missing_float_types]

Unnamed: 0,LotFrontage,MasVnrArea,GarageYrBlt
0,65.0,196.0,2003.0
1,80.0,0.0,1976.0
2,68.0,162.0,2001.0
3,60.0,0.0,1998.0
4,84.0,350.0,2000.0
...,...,...,...
1455,62.0,0.0,1999.0
1456,85.0,119.0,1978.0
1457,66.0,0.0,1941.0
1458,68.0,0.0,1950.0


In [12]:
from sklearn.impute import SimpleImputer
import numpy as np

mean_imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')

imputed_train['LotFrontage'] = mean_imputer.fit_transform(pd.DataFrame(imputed_train['LotFrontage']))
imputed_test['LotFrontage'] = mean_imputer.fit_transform(pd.DataFrame(imputed_test['LotFrontage']))

In [13]:
zero_imputer = SimpleImputer(missing_values = np.nan, fill_value = 0)

imputed_train['MasVnrArea'] = zero_imputer.fit_transform(pd.DataFrame(imputed_train['MasVnrArea']))
imputed_test['MasVnrArea'] = zero_imputer.fit_transform(pd.DataFrame(imputed_test['MasVnrArea']))

In [14]:
mean_imputer = SimpleImputer(missing_values = np.nan, strategy = 'median')

imputed_train['GarageYrBlt'] = mean_imputer.fit_transform(pd.DataFrame(imputed_train['GarageYrBlt']))
imputed_test['GarageYrBlt'] = mean_imputer.fit_transform(pd.DataFrame(imputed_test['GarageYrBlt']))

In [15]:
o = (train[missing_value_columns].dtypes == 'object')
missing_object_types = o[o].index.tolist()
missing_object_types

['Alley',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [16]:
missing_imputer = SimpleImputer(missing_values = np.nan, strategy = 'constant', fill_value = 'unknown')

imputed_train[missing_object_types] = missing_imputer.fit_transform(pd.DataFrame(imputed_train[missing_object_types]))
imputed_test[missing_object_types] = missing_imputer.fit_transform(pd.DataFrame(imputed_test[missing_object_types]))

In [17]:
imputed_train.isnull().any().sum()

0

In [18]:
object_cols = (train.dtypes == 'object')
object_cols = object_cols[object_cols].index.tolist()
object_cols

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [19]:
from sklearn.preprocessing import OrdinalEncoder

labeled_train = imputed_train.copy()
labeled_test = imputed_test.copy()

In [20]:
ordinal_encoder = OrdinalEncoder()

labeled_train[object_cols] = ordinal_encoder.fit_transform(imputed_train[object_cols])
labeled_test[object_cols] = ordinal_encoder.fit_transform(imputed_test[object_cols])

In [21]:
labeled_train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,3.0,65.0,8450,1.0,2.0,3.0,3.0,0.0,...,0,3.0,4.0,4.0,0,2,2008,8.0,4.0,208500
1,2,20,3.0,80.0,9600,1.0,2.0,3.0,3.0,0.0,...,0,3.0,4.0,4.0,0,5,2007,8.0,4.0,181500
2,3,60,3.0,68.0,11250,1.0,2.0,0.0,3.0,0.0,...,0,3.0,4.0,4.0,0,9,2008,8.0,4.0,223500
3,4,70,3.0,60.0,9550,1.0,2.0,0.0,3.0,0.0,...,0,3.0,4.0,4.0,0,2,2006,8.0,0.0,140000
4,5,60,3.0,84.0,14260,1.0,2.0,0.0,3.0,0.0,...,0,3.0,4.0,4.0,0,12,2008,8.0,4.0,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,3.0,62.0,7917,1.0,2.0,3.0,3.0,0.0,...,0,3.0,4.0,4.0,0,8,2007,8.0,4.0,175000
1456,1457,20,3.0,85.0,13175,1.0,2.0,3.0,3.0,0.0,...,0,3.0,2.0,4.0,0,2,2010,8.0,4.0,210000
1457,1458,70,3.0,66.0,9042,1.0,2.0,3.0,3.0,0.0,...,0,3.0,0.0,2.0,2500,5,2010,8.0,4.0,266500
1458,1459,20,3.0,68.0,9717,1.0,2.0,3.0,3.0,0.0,...,0,3.0,4.0,4.0,0,4,2010,8.0,4.0,142125


In [22]:
labeled_test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,2.0,80.0,11622,1.0,2.0,3.0,3.0,0.0,...,120,0,2.0,2.0,3.0,0,6,2010,8.0,4.0
1,1462,20,3.0,81.0,14267,1.0,2.0,0.0,3.0,0.0,...,0,0,2.0,4.0,0.0,12500,6,2010,8.0,4.0
2,1463,60,3.0,74.0,13830,1.0,2.0,0.0,3.0,0.0,...,0,0,2.0,2.0,3.0,0,3,2010,8.0,4.0
3,1464,60,3.0,78.0,9978,1.0,2.0,0.0,3.0,0.0,...,0,0,2.0,4.0,3.0,0,6,2010,8.0,4.0
4,1465,120,3.0,43.0,5005,1.0,2.0,0.0,1.0,0.0,...,144,0,2.0,4.0,3.0,0,1,2010,8.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,4.0,21.0,1936,1.0,2.0,3.0,3.0,0.0,...,0,0,2.0,4.0,3.0,0,6,2006,8.0,4.0
1455,2916,160,4.0,21.0,1894,1.0,2.0,3.0,3.0,0.0,...,0,0,2.0,4.0,3.0,0,4,2006,8.0,0.0
1456,2917,20,3.0,160.0,20000,1.0,2.0,3.0,3.0,0.0,...,0,0,2.0,4.0,3.0,0,9,2006,8.0,0.0
1457,2918,85,3.0,62.0,10441,1.0,2.0,3.0,3.0,0.0,...,0,0,2.0,2.0,2.0,700,7,2006,8.0,4.0


In [24]:
from sklearn.ensemble import RandomForestClassifier

y_train = labeled_train['SalePrice']
x_train  = labeled_train.drop('SalePrice', axis = 1)

model = RandomForestClassifier()
model.fit(x_train, y_train)

In [25]:
predictions = model.predict(labeled_test)

predictions

array([120500, 147500, 181000, ..., 160000,  93500, 240000], dtype=int64)

In [32]:
df_predictions = pd.DataFrame(predictions, columns=['saleprice'])
df_predictions['id'] = np.arange(1461, 1461 + len(predictions))
df_predictions = df_predictions[['id', 'saleprice']]
df_predictions

Unnamed: 0,id,saleprice
0,1461,120500
1,1462,147500
2,1463,181000
3,1464,181000
4,1465,189000
...,...,...
1454,2915,75000
1455,2916,75000
1456,2917,160000
1457,2918,93500


In [33]:
df_predictions.to_csv('pred.csv', index=False)