In [70]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
import pickle
import re

In [78]:
house_df = pd.read_csv(r'Craiglist_HouseData.csv')
house_df.head()
pd.set_option('display.max_rows', house_df.shape[0]+1)

In [10]:
house_df.shape

(2400, 12)

In [13]:
house_df.describe()

Unnamed: 0.1,Unnamed: 0,Price,Area
count,2400.0,2217.0,1679.0
mean,1200.5,1175405.0,4754.055
std,692.964646,8869287.0,98792.2
min,1.0,1.0,0.0
25%,600.75,479900.0,460.0
50%,1200.5,736000.0,930.0
75%,1800.25,1199000.0,1665.0
max,2400.0,414900900.0,3990096.0


In [15]:
house_df[house_df['Bedroom'] == 'NaN'] = np.nan

In [18]:
house_df['Bedroom'].unique()

array(['8', nan, '4', '5', '1', '6', '2', '3', '9', '7', 'r', '0'],
      dtype=object)

In [19]:
house_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2400 entries, 0 to 2399
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    2400 non-null   float64
 1   Price         2217 non-null   float64
 2   Bedroom       2112 non-null   object 
 3   Area          1679 non-null   float64
 4   Title         2400 non-null   object 
 5   Place         2309 non-null   object 
 6   Post Timing   2400 non-null   object 
 7   Finishing     2400 non-null   object 
 8   House Type    2338 non-null   object 
 9   Parking Type  987 non-null    object 
 10  Bathroom      2396 non-null   object 
 11  Link          2400 non-null   object 
dtypes: float64(3), object(9)
memory usage: 225.1+ KB


In [80]:
house_df[house_df['Bedroom'] == '0']

Unnamed: 0.1,Unnamed: 0,Price,Bedroom,Area,Title,Place,Post Timing,Finishing,House Type,Parking Type,Bathroom,Link
414,415,,0,13000.0,development site for sale,surrey bc,2020-07-08T02:30:40-0700,unfurnished,house,,2,https://vancouver.craigslist.org/rds/reo/d/sur...
471,472,839900.0,0,1479.0,TOWNHOUSE FOR SALE: 9 230 SALTER STREET,New Westminster,2020-07-07T13:45:16-0700,unfurnished,townhouse,,3,https://vancouver.craigslist.org/bnc/reb/d/new...
767,768,,0,3000.0,Store on 3000 sq on Granville Entertainment Di...,1132 Granville St. Next Chateau Granville Best...,2020-07-04T21:01:06-0700,unfurnished,apartment,,3,https://vancouver.craigslist.org/van/reo/d/sto...
1095,1096,1049000.0,0,0.0,House for Sale - 6056 Boundary Dr East,"Boundary Park, Surrey",2020-06-29T12:16:53-0700,unfurnished,house,,2,https://vancouver.craigslist.org/rds/reb/d/sur...
1820,1821,,0,850.0,Pitt Lake Cabin for Sale,Pitt Lake,2020-06-15T09:31:51-0700,unfurnished,,,2,https://vancouver.craigslist.org/pml/reo/d/pit...
2265,2266,1950000.0,0,0.0,4.1 Acres - Fraser Highway - Mt. Lehman,Abbotsford,2020-06-03T16:07:33-0700,unfurnished,house,,2,https://vancouver.craigslist.org/rds/reo/d/abb...


In [47]:
print('Bedroom unique values -', house_df['Bedroom'].unique(),'\n')
print('Bathroom unique values -', house_df['Bathroom'].unique(),'\n')
print('House Type -', house_df['House Type'].unique(),'\n')

Bedroom unique values - ['8' nan '4' '5' '1' '6' '2' '3' '9' '7' 'r' '0'] 

Bathroom unique values - ['7' '3' '4' '1' '2' '2.5' '5' '1.5' '3.5' '6' '5.5' '0' '4.5' '8' '6.5'
 'shared' '9+' nan '7.5' 'split' '8.5'] 

House Type - ['house' 'condo' 'apartment' 'townhouse' 'land' 'duplex' nan] 



In [77]:
house_df[house_df['Bathroom']=='7']

Unnamed: 0.1,Unnamed: 0,Price,Bedroom,Area,Title,Place,Post Timing,Finishing,House Type,Parking Type,Bathroom,Link
0,1,2278000.0,8.0,6.0,,Tsawwassen,2020-07-10T15:42:19-0700,unfurnished,house,attached garage,7,https://vancouver.craigslist.org/van/reb/d/del...
1,2,684900.0,,,,Burquitlam,2020-07-03T15:44:48-0700,unfurnished,condo,,7,https://vancouver.craigslist.org/bnc/reb/d/coq...
23,24,2985000.0,5.0,,,,2020-07-10T12:02:55-0700,unfurnished,land,,7,https://vancouver.craigslist.org/rds/reb/d/lan...
24,25,,,,,"Surrey, BC",2020-06-20T08:52:14-0700,unfurnished,house,,7,https://vancouver.craigslist.org/van/reb/d/sur...
25,26,2100000.0,,,,Bridgeview (Surrey),2020-07-10T11:54:09-0700,unfurnished,land,,7,https://vancouver.craigslist.org/rds/reo/d/sur...
26,27,,,,,CAMPBELL HEIGHTS,2020-06-25T06:51:21-0700,unfurnished,apartment,,7,https://vancouver.craigslist.org/van/reo/d/sur...
65,66,1524900.0,6.0,4458.0,,"South Surrey, BC",2020-06-26T08:36:15-0700,unfurnished,house,attached garage,7,https://vancouver.craigslist.org/rds/reb/d/sur...
170,171,5888000.0,5.0,5996.0,,West Vancouver,2020-07-09T11:56:37-0700,unfurnished,house,,7,https://vancouver.craigslist.org/nvn/reb/d/wes...
822,823,1998888.0,6.0,4886.0,,Tsawwassen,2020-06-11T22:10:00-0700,unfurnished,duplex,attached garage,7,https://vancouver.craigslist.org/rds/reb/d/del...
1020,1021,4288000.0,6.0,5308.0,,West Vancouver,2020-06-30T11:19:07-0700,unfurnished,house,,7,https://vancouver.craigslist.org/nvn/reb/d/wes...


In [51]:
house_df.iloc[2140]

Unnamed: 0                                                   2141
Price                                                   1.199e+06
Bedroom                                                         7
Area                                                         3781
Title                          Start Your HGTV Cooking Show HERE!
Place                                                     Langley
Post Timing                              2020-05-25T09:01:55-0700
Finishing                                             unfurnished
House Type                                                  house
Parking Type                                      attached garage
Bathroom                                                        7
Link            https://vancouver.craigslist.org/rds/reb/d/lan...
Name: 2140, dtype: object

In [71]:
house_df['Title'] = house_df['Title'].astype('str')

In [81]:
house_df.dtypes

Unnamed: 0        int64
Price           float64
Bedroom          object
Area            float64
Title            object
Place            object
Post Timing      object
Finishing        object
House Type       object
Parking Type     object
Bathroom         object
Link             object
dtype: object

In [79]:
house_df['Title'] = house_df['Title'].astype('str')
house_df['Title'] = house_df['Title'].str.replace(',', '')


In [94]:
house_df[((house_df['Title'].str.contains("acre")) | house_df['Title'].str.contains("Acre")) & (house_df['Area'].isnull())]

Unnamed: 0.1,Unnamed: 0,Price,Bedroom,Area,Title,Place,Post Timing,Finishing,House Type,Parking Type,Bathroom,Link
2,3,1098000.0,4.0,,Murrayville Basement Entry Home on Private 1/3...,(google map)\n,2020-07-10T15:16:32-0700,unfurnished,house,,3.0,https://vancouver.craigslist.org/rds/reb/d/lan...
9,10,859000.0,,,2.5 Acres near Parksville Vancouver Island. Ex...,Errington 12 min from Parksville,2020-06-13T12:13:12-0700,unfurnished,house,,2.0,https://vancouver.craigslist.org/van/reo/d/err...
25,26,2100000.0,,,1/2 Acre Industrial lot for sale,Bridgeview (Surrey),2020-07-10T11:54:09-0700,unfurnished,land,,7.0,https://vancouver.craigslist.org/rds/reo/d/sur...
77,78,1950000.0,2.0,,4.1 Acres - Fraser Highway - Mt. Lehman,Abbotsford,2020-07-10T08:06:24-0700,unfurnished,house,,2.0,https://vancouver.craigslist.org/rds/reo/d/abb...
176,177,1700000.0,,,equestrian 5+ acre estate FOR SALE within Sil...,Mission,2020-06-18T16:54:31-0700,unfurnished,apartment,,2.0,https://vancouver.craigslist.org/pml/reo/d/mis...
229,230,2999000.0,4.0,,7.314 Acres 2 Houses W/large Wood Working Shop,Abbotsford,2020-07-07T08:44:01-0700,unfurnished,house,,3.0,https://vancouver.craigslist.org/van/reo/d/abb...
230,231,2999000.0,4.0,,7.314 Acres 2 Houses W/large Wood Working Shop,Abbotsford,2020-07-07T08:47:33-0700,unfurnished,house,,3.0,https://vancouver.craigslist.org/rch/reo/d/abb...
231,232,2999000.0,4.0,,7.314 Acres 2 Houses W/large Wood Working Shop,Abbotsford,2020-07-07T08:51:04-0700,unfurnished,house,,3.0,https://vancouver.craigslist.org/bnc/reo/d/abb...
303,304,1349000.0,,,1 acre of land in burnaby big Bend area,,2020-07-08T15:30:43-0700,unfurnished,apartment,,1.0,https://vancouver.craigslist.org/rch/reo/d/ric...
493,494,,,,860 acre Ranch/Farm Land/ 2 water wells/Mexico,"Zacatecas, Mexico",2020-06-27T23:43:49-0700,unfurnished,land,,2.5,https://vancouver.craigslist.org/van/reo/d/bur...


In [90]:
house_df['Title'].isnull()

0       False
1       False
2       False
3       False
4       False
5       False
6       False
7       False
8       False
9       False
10      False
11      False
12      False
13      False
14      False
15      False
16      False
17      False
18      False
19      False
20      False
21      False
22      False
23      False
24      False
25      False
26      False
27      False
28      False
29      False
30      False
31      False
32      False
33      False
34      False
35      False
36      False
37      False
38      False
39      False
40      False
41      False
42      False
43      False
44      False
45      False
46      False
47      False
48      False
49      False
50      False
51      False
52      False
53      False
54      False
55      False
56      False
57      False
58      False
59      False
60      False
61      False
62      False
63      False
64      False
65      False
66      False
67      False
68      False
69      False
70      False
71    