### This Section preprocesses the raw rental property data scraped from domain 

In [48]:
import pandas as pd
import re

In [49]:
house_df = pd.read_csv('../../data/raw/BS_Raw_PropertiesInfo.csv')
house_df.head()

Unnamed: 0,bedrooms,bathrooms,carspaces,building_area,street_address,latitude,longitude,price
0,2.0,,,,2504/36 La Trobe Street Melbourne VIC 3000,-37.808164,144.967594,$540/week
1,2.0,,,,2606/157 A'beckett street Melbourne VIC 3000,-37.809916,144.959185,$690/w
2,2.0,,,,818/139 Lonsdale Street Melbourne VIC 3000,-37.810795,144.968304,$380 pw
3,1.0,,,,304/58 La Trobe Street Melbourne VIC 3000,-37.808429,144.967054,$380 per week
4,2.0,,1.0,,1716/135 A'Beckett Street Melbourne VIC 3000,-37.809788,144.959647,$630 per week


In [50]:
price_list = house_df['price']

In [51]:
house_df.head()

Unnamed: 0,bedrooms,bathrooms,carspaces,building_area,street_address,latitude,longitude,price
0,2.0,,,,2504/36 La Trobe Street Melbourne VIC 3000,-37.808164,144.967594,$540/week
1,2.0,,,,2606/157 A'beckett street Melbourne VIC 3000,-37.809916,144.959185,$690/w
2,2.0,,,,818/139 Lonsdale Street Melbourne VIC 3000,-37.810795,144.968304,$380 pw
3,1.0,,,,304/58 La Trobe Street Melbourne VIC 3000,-37.808429,144.967054,$380 per week
4,2.0,,1.0,,1716/135 A'Beckett Street Melbourne VIC 3000,-37.809788,144.959647,$630 per week


#### In addition, convert baths, beds, and parking string types into numeric types. Discard rows where no bedrooms and bathrooms were provided

In [52]:
house_df["bedrooms"] = pd.to_numeric(house_df["bedrooms"])
house_df["bathrooms"] = pd.to_numeric(house_df["bathrooms"])
house_df["carspaces"] = pd.to_numeric(house_df["carspaces"])


In [53]:
house_df["building_area"] = house_df["building_area"].str.extract('(\d+[,.]?\d+)')
house_df["building_area"] = house_df["building_area"].str.replace(',', '')
#house_df.loc[house_df["building_area"].str.contains(',', na=False)]
house_df["building_area"] = pd.to_numeric(house_df["building_area"])

# Remove area < 10 since this is most likely recorded in hactares
house_df = house_df.drop(house_df[house_df["building_area"] < 10].index)

# Remove rows where bedrooms and bathrooms info were missed
house_df2 = house_df.dropna(subset=['bedrooms', 'bathrooms'])
house_df2.reset_index(drop = True, inplace = True)
print(f'{len(house_df) - len(house_df2)} instances were dropped as no bedroom or bathroom numbers were demonstrated')

7205 instances were dropped as no bedroom or bathroom numbers were demonstrated


In [54]:
house_df2

Unnamed: 0,bedrooms,bathrooms,carspaces,building_area,street_address,latitude,longitude,price
0,2.0,2.0,1.0,,1407/601 Little Lonsdale Street Melbourne VIC ...,-37.813756,144.953714,$750 per week
1,2.0,2.0,,,4507/318 Russell Street Melbourne VIC 3000,-37.809171,144.966801,$650 per week
2,2.0,2.0,,,816/101 Therry St Melbourne VIC 3000,-37.807415,144.959123,New apartment No bond needed Pets welcome
3,2.0,2.0,,,5101/464 Collins Street Melbourne VIC 3000,-37.817560,144.958592,$650
4,2.0,2.0,1.0,,300/668 Bourke Street Melbourne VIC 3000,-37.816615,144.954279,$550.00
...,...,...,...,...,...,...,...,...
5329,4.0,2.0,2.0,820.91,25 Mary Street Wonthaggi VIC 3995,-38.594222,145.603470,$520
5330,4.0,2.0,2.0,603.15,2 Seam Street Wonthaggi VIC 3995,-38.606574,145.580814,$500
5331,4.0,2.0,,,44 Nelson Street Wonthaggi VIC 3995,-38.590248,145.594281,$400
5332,3.0,2.0,1.0,,67 Campbell Street Wonthaggi VIC 3995,-38.610943,145.576774,$480 per week (including garden maintenance)


#### Now, do some descriptive data analytics for finding data problems and solve those problems. For instance, utilize scatter plot for checking outliers within data or utilize histogram to watch data distribution etc.

### Add distance to city and postcode attributes.

In [55]:
import math
cleaned_house_df = house_df2
# radius of earth is 6378
r = 6378
dis_to_city = []
for i in range(len(cleaned_house_df)):
    
    lat1_n = math.radians(-37.818078)
    lat2 = math.radians(float(cleaned_house_df['latitude'][i]))
    
    lon1_n = math.radians(144.96681)
    lon2 = math.radians(float(cleaned_house_df['longitude'][i]))
    
    lon_diff_n = lon2 - lon1_n
    lat_diff_n = lat2 - lat1_n
    
    a_n = math.sin(lat_diff_n / 2)**2 + math.cos(lat1_n) * math.cos(lat2) * math.sin(lon_diff_n / 2)**2
    c_n = 2 * math.atan2(math.sqrt(a_n), math.sqrt(1 - a_n))
    
    dis_to_city.append(round(r*c_n, 4))
    
cleaned_house_df['distance_to_city'] = dis_to_city
cleaned_house_df['postcode'] = cleaned_house_df['street_address'].str[-4:]

cleaned_house_df.head()
#cleaned_house_df.to_csv(f'BS_price_unclean_price.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_house_df['distance_to_city'] = dis_to_city
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_house_df['postcode'] = cleaned_house_df['street_address'].str[-4:]


Unnamed: 0,bedrooms,bathrooms,carspaces,building_area,street_address,latitude,longitude,price,distance_to_city,postcode
0,2.0,2.0,1.0,,1407/601 Little Lonsdale Street Melbourne VIC ...,-37.813756,144.953714,$750 per week,1.2481,3000
1,2.0,2.0,,,4507/318 Russell Street Melbourne VIC 3000,-37.809171,144.966801,$650 per week,0.9915,3000
2,2.0,2.0,,,816/101 Therry St Melbourne VIC 3000,-37.807415,144.959123,New apartment No bond needed Pets welcome,1.366,3000
3,2.0,2.0,,,5101/464 Collins Street Melbourne VIC 3000,-37.81756,144.958592,$650,0.725,3000
4,2.0,2.0,1.0,,300/668 Bourke Street Melbourne VIC 3000,-37.816615,144.954279,$550.00,1.1139,3000


### Preprocess rental price

In [58]:
df = cleaned_house_df
df['price'] = df["price"].str.replace(",","")
df['price'] = df["price"].str.replace(" ","_") # replacing blank space with underline will simplify code in regex

regex_str1 = r'([$]?\d+[.]?\d+\w+[.]*\w*[/]*\w*[wW])' # $650.00 per week
regex_str2 = r'([$]?\d+[.]?\d+$)' # $320
regex_str3 = r'([$]?\d+[.]?\d+\w+[-]?\w+[d]$)' # $490 Fully Furnished
regex_str4 = r'([$]?\d+[.]?\d+\w*[*]+\w*[*]+$)' # $750_**SPACIOUS_APARTMENT**

df['price1']=df['price'].str.extract(regex_str1)
df['price2']=df['price'].str.extract(regex_str2)
df['price3']=df['price'].str.extract(regex_str3)
df['price4']=df['price'].str.extract(regex_str4)
df['weekly_rent'] = df['price1'].where(df['price1'].notnull(), df['price2'])
df['weekly_rent'] = df['weekly_rent'].where(df['weekly_rent'].notnull(), df['price3'])
df['weekly_rent'] = df['weekly_rent'].where(df['weekly_rent'].notnull(), df['price4'])
df = df.drop(['price1','price2','price3', 'price4'],axis=1)
#dff = df[['price', 'weekly_rent']]
#dff.to_csv("../../data/curated/BS_re.csv")
#dff1 = dff[dff['weekly_rent'].isna()]
#dff1.to_csv('../../data/curated/BE_re_null.csv')

df2 = df.dropna(subset = ['weekly_rent'])
df2['weekly_rent'] = df2['weekly_rent'].str.extract('(\d+)').astype(int)
df3 = df2[df2['weekly_rent'] > 10000] # drop the yearly rent ones.

result =  df2[df2['weekly_rent'] < 10000]
result.to_csv("../../data/curated/BS_re_clean.csv")


print(f'{len(df) - len(df2)} instances were dropped as no weekly rent was demonstrated')
print(f'{len(df3)} instances were dropped as it was annual rent')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price'] = df["price"].str.replace(",","")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price'] = df["price"].str.replace(" ","_") # replacing blank space with underline will simplify code in regex
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price1']=df['price'].str.extract(regex_st

129 instances were dropped as no weekly rent was demonstrated
1 instances were dropped as it was annual rent


#### The last step is exporting Dataframe to some other tabular formats file including a CSV or an excel file.

In [57]:
# exporting to csv file
#cleaned_house_df.to_csv('BS_Clean_PropertiesInfo.csv', index=False)