In [52]:
import pandas as pd
import numpy as np

df = pd.read_json('../data/raw/example2.json')

In [53]:
# take a look at the size of the data
df.shape

(12, 11403)

In [54]:
# there are 12 rows in the data, but we have 12 features,
# so data needs to have 12 columns instead -- need to transpose this data
df = df.transpose()
df.head()

Unnamed: 0,name,cost_text,coordinates,Bed,Bath,Park,property_type,desc_head,desc,additional features,land_area_sqkm,internal_area_sqkm
https://www.domain.com.au/6-ironbark-lane-maidstone-vic-3012-16062815,6 Ironbark Lane Maidstone VIC 3012,$600 per week,Not Listed,4,2,2,House,A Superb Sense Of Style,"class=""css-dxogle"">* Unverified feature<svg a...",[],[],
https://www.domain.com.au/7-9-11-york-st-fitzroy-north-vic-3068-16071577,7/9 -11 York St Fitzroy North VIC 3068,$340.00,Not Listed,1,1,1,Apartment / Unit / Flat,The Perfect Spot!,Excellent one bedroom apartment in a small qui...,"[Secure Parking, Built in wardrobes, Floorboards]",[],[]
https://www.domain.com.au/16-tribe-street-south-melbourne-vic-3205-16032282,16 Tribe Street South Melbourne VIC 3205,1180.00,Not Listed,3,1,0,House,CLASSIC VICTORIAN GEM,"class=""css-dxogle"">* Unverified feature<svg a...","[Air conditioning, Intercom, Ensuite, Courtyar...",[],
https://www.domain.com.au/14-14-16-wardale-road-springvale-south-vic-3172-16042510,14/14-16 Wardale Road Springvale South VIC 3172,$420 p.w.,Not Listed,2,2,1,Apartment / Unit / Flat,Recently updated spacious Unit!!!,This recently renovated light filled family un...,[],[],[]
https://www.domain.com.au/3-106-burke-road-ferntree-gully-vic-3156-16051371,3/106 Burke Road Ferntree Gully VIC 3156,$475,Not Listed,4,1,2,Townhouse,Stylish Townhouse Close to Shops and Schools,"class=""css-dxogle"">* Unverified feature<svg a...",[],[],


### 1: Cost

In [55]:
# the cost of a real estate property is in dollars per week.
# I will transform all the values in the cost_text column into integers.

df2 = df.copy()
import re

for i in range(df2.shape[0]):
    cost = re.findall(r"[\d,]+", df2.iat[i, 1])

    if len(cost) == 0:
        df2.iat[i, 1] = np.nan

    else:
        # find the element with length 3, since it's the most likely one
        cost2 = [i for i in cost if len(i) == 3]

        if len(cost2) == 1:
            # if there's only one, then it's very likely that it's the correct price we want
            df2.iat[i, 1] = int(cost2[0])

        else:
            # otherwise, this is where things get wacky
            # might have to clean the data manually
            
            # some possibilities: really cheap price ($60 per week),
            # really expensive price ($1000+ per week),
            # only the monthly price is listed, while the weekly price is unknown,
            # only the annual price is listed, while the weekly price is unknown,
            # some other gibberish text that contains numbers that isn't the weekly price.
            print(df2.iat[i, 1])
            print(cost)
            print("")
            # TODO: after the real estate data is finalised, we can either clean this part manually, or simply ditch it

1180.00
['1180', '00']

$1,000.00 per week
['1,000', '00']

$1,450.00
['1,450', '00']

$1150 per week
['1150']

$1,500 weekly
['1,500']

$1,750.00 Per Week
['1,750', '00']

$1,250 per week
['1,250']

$1,150.00
['1,150', '00']

$2,000.00
['2,000', '00']

$1100 per week
['1100']

$1200 per week
['1200']

$1,250 Weekly
['1,250']

$2,500.00
['2,500', '00']

$1,295.00
['1,295', '00']

$1,199
['1,199']

$1,300 per week
['1,300']

$1450 per week
['1450']

$1,050.00
['1,050', '00']

$1,400.00
['1,400', '00']

$1199 per week
['1199']

$1,650.00
['1,650', '00']

$1800 per week
['1800']

$1,075.00
['1,075', '00']

$1,350.00
['1,350', '00']

$1015pw - Stay 1 mth+
['1015', '1']

$325 Per Week Until April 2022. Increase to $335 Per Week a
['325', '2022', '335']

$1,500.00
['1,500', '00']

$2,000.00
['2,000', '00']

$1,000
['1,000']

$2,500 per week
['2,500']

$1,250.00
['1,250', '00']

1,000 pw
['1,000']

$1,450.00
['1,450', '00']

$1,250 per week
['1,250']

$1000 per week
['1000']

$800 to $880
['8

### 2: Description

In [56]:
# might need to install these libraries first
import enchant   
import nltk
from nltk.stem import PorterStemmer

def preprocess_text(text):
    # preprocesses a string of texts :)
    
    # step 1: remove non-alphabetic characters
    text = re.sub(r'[^a-z\sA-Z]', ' ', text)
    # step 2: convert all spacing characters such as tabs and newlines to 
    # whitespace and ensure that only one whitespace character exists between each word
    text = re.sub(r'\s+', ' ', text)
    # step 3: Change all uppercase characters to lower case
    text = text.lower()
    # step 4: tokenise into words
    words = text.split()
    # step 5: remove invalid English words and single-letter words, and then stemming
    d = enchant.Dict('en')
    ps = PorterStemmer()
    words = [ps.stem(i) for i in words if(d.check(i) == True) and (len(i) > 1)]
    return words

In [57]:
df3 = df2.copy()


for i in range(df3.shape[0]):
    # there are many instances with invalid descriptions, 
    # and fortunately, they seem relatively easy to find.
    invalid = re.findall(r'class="', df3.iat[i, 8])
    if invalid:
        df3.iat[i, 8] = np.nan

    # text preprocessing
    # I'm not sure how useful this "desc" attribute is going to be,
    # so if you wanna use this attribute, just uncomment this part of the code
    # just note that it will take a while to run
    #else:
    #    df3.iat[i, 8] = preprocess_text(df3.iat[i, 8])

df3.head()

Unnamed: 0,name,cost_text,coordinates,Bed,Bath,Park,property_type,desc_head,desc,additional features,land_area_sqkm,internal_area_sqkm
https://www.domain.com.au/6-ironbark-lane-maidstone-vic-3012-16062815,6 Ironbark Lane Maidstone VIC 3012,600.0,Not Listed,4,2,2,House,A Superb Sense Of Style,,[],[],
https://www.domain.com.au/7-9-11-york-st-fitzroy-north-vic-3068-16071577,7/9 -11 York St Fitzroy North VIC 3068,340.0,Not Listed,1,1,1,Apartment / Unit / Flat,The Perfect Spot!,Excellent one bedroom apartment in a small qui...,"[Secure Parking, Built in wardrobes, Floorboards]",[],[]
https://www.domain.com.au/16-tribe-street-south-melbourne-vic-3205-16032282,16 Tribe Street South Melbourne VIC 3205,1180.0,Not Listed,3,1,0,House,CLASSIC VICTORIAN GEM,,"[Air conditioning, Intercom, Ensuite, Courtyar...",[],
https://www.domain.com.au/14-14-16-wardale-road-springvale-south-vic-3172-16042510,14/14-16 Wardale Road Springvale South VIC 3172,420.0,Not Listed,2,2,1,Apartment / Unit / Flat,Recently updated spacious Unit!!!,This recently renovated light filled family un...,[],[],[]
https://www.domain.com.au/3-106-burke-road-ferntree-gully-vic-3156-16051371,3/106 Burke Road Ferntree Gully VIC 3156,475.0,Not Listed,4,1,2,Townhouse,Stylish Townhouse Close to Shops and Schools,,[],[],


### Final 3 features

In [58]:
# only need to replace empty list with NaN values
for i in range(df3.shape[0]):
    try:
        # there is definitely a better way to do this than try except, but this works so
        if len(df3.iat[i, -1]) == 0:
            df3.iat[i, -1] = np.nan
    except:
        pass
    if len(df3.iat[i, -2]) == 0:
        df3.iat[i, -2] = np.nan
    if len(df3.iat[i, -3]) == 0:
        df3.iat[i, -3] = np.nan

df3.head()

Unnamed: 0,name,cost_text,coordinates,Bed,Bath,Park,property_type,desc_head,desc,additional features,land_area_sqkm,internal_area_sqkm
https://www.domain.com.au/6-ironbark-lane-maidstone-vic-3012-16062815,6 Ironbark Lane Maidstone VIC 3012,600.0,Not Listed,4,2,2,House,A Superb Sense Of Style,,,,
https://www.domain.com.au/7-9-11-york-st-fitzroy-north-vic-3068-16071577,7/9 -11 York St Fitzroy North VIC 3068,340.0,Not Listed,1,1,1,Apartment / Unit / Flat,The Perfect Spot!,Excellent one bedroom apartment in a small qui...,"[Secure Parking, Built in wardrobes, Floorboards]",,
https://www.domain.com.au/16-tribe-street-south-melbourne-vic-3205-16032282,16 Tribe Street South Melbourne VIC 3205,1180.0,Not Listed,3,1,0,House,CLASSIC VICTORIAN GEM,,"[Air conditioning, Intercom, Ensuite, Courtyar...",,
https://www.domain.com.au/14-14-16-wardale-road-springvale-south-vic-3172-16042510,14/14-16 Wardale Road Springvale South VIC 3172,420.0,Not Listed,2,2,1,Apartment / Unit / Flat,Recently updated spacious Unit!!!,This recently renovated light filled family un...,,,
https://www.domain.com.au/3-106-burke-road-ferntree-gully-vic-3156-16051371,3/106 Burke Road Ferntree Gully VIC 3156,475.0,Not Listed,4,1,2,Townhouse,Stylish Townhouse Close to Shops and Schools,,,,


Before saving the dataframe to csv, additional preprocessing might be required

In [39]:
# save to csv
filename = ...
df3.to_csv(filename)

ValueError: Invalid file path or buffer object type: <class 'ellipsis'>