In [None]:
# Labels in the rental data that are not suburbs; rows carrying them are
# dropped.
_NON_SUBURB_LABELS = ('Yarra Ranges', 'St Kilda Rd', 'CBD')

# Rental-data spellings that do not exist in the lookup data, mapped to the
# lookup spelling. The pairs came from a fuzzy string match (that code is not
# part of this file).
# NOTE(review): 'East Brunswick' -> 'Brunswick' and 'East St Kilda' ->
# 'St Kilda' drop the direction, while 'East Hawthorn', 'West Brunswick' and
# 'West St Kilda' move it to the end. Confirm against the lookup data.
_RENTAL_SUBURB_FIXES = {
    'East St Kilda': 'St Kilda',
    'East Hawthorn': 'Hawthorn East',
    'East Brunswick': 'Brunswick',
    'Ballarat': 'Ballarat East',
    'Wanagaratta': 'Wangaratta',
    'Bendigo East': 'Bendigo',
    'West St Kilda': 'St Kilda West',
    'Mt Eliza': 'Mount Eliza',
    'Newcombe': 'Newcomb',
    'West Brunswick': 'Brunswick West',
    'Mt Martha': 'Mount Martha',
}


def _expand_multi_suburb_rows(rental_data: "pd.DataFrame") -> "pd.DataFrame":
    """Split every row whose 'Suburb' names several suburbs into one row each.

    A name containing '-' is treated as a list of suburbs, unless it ends in
    '- North', '- South', '- East' or '- West': those describe *part* of a
    single suburb and are left untouched.

    Split rows keep the index label of the row they came from (so the result
    can contain duplicate index labels) and are appended after the untouched
    rows.
    """
    suburb = rental_data['Suburb']

    # na=False: a missing suburb name is simply "not a multi-suburb row"
    # instead of putting NaN into the boolean mask.
    covers_several_suburbs = (
        suburb.str.contains('-', na=False)
        & ~suburb.str.contains(r'- (?:North|South|East|West)$', na=False)
    )

    expanded = rental_data[covers_several_suburbs].copy()
    expanded['Suburb'] = expanded['Suburb'].str.split(r'\s*-\s*')

    # explode() emits one row per list element. Reversing keeps the row order
    # of the previous loop-based implementation, which prepended each new row.
    expanded = expanded.explode('Suburb').iloc[::-1]

    return pd.concat(
        [rental_data[~covers_several_suburbs], expanded],
        axis=0,
    )


def fix_suburb_names_in_historical_rental_data(
    csv_path: str = '../data/curated/Monthly_Median_Suburb_Rents.csv',
) -> "pd.DataFrame":
    """Clean the suburb names of the historical rental data.

    The work is done in three steps:

    I.   Rows whose 'Suburb' lists several suburbs separated by '-' are split
         into one row per suburb ('SUBURB - DIRECTION' names are kept whole).
    II.  Only rows with ``Quarter == 4`` are retained.
    III. Labels that are not suburbs are dropped, known misspellings and
         unusual namings are mapped to their corrected form, and every suburb
         name is lower-cased.

    Parameters
    ----------
    csv_path
        Location of the rental CSV. It must contain the columns
        'Unnamed: 0' (dropped), 'Suburb' and 'Quarter'. Defaults to the
        curated monthly median rents file.

    Returns
    -------
    pd.DataFrame
        The cleaned rows with the columns of the CSV minus 'Unnamed: 0'.
        Rows produced by step I share the index label of their source row.
    """
    historical_rental_data = pd.read_csv(csv_path).drop(columns=['Unnamed: 0'])

    # I. one row per suburb
    rental_data_all_suburb = _expand_multi_suburb_rows(historical_rental_data)

    # II. retain observations in the 4th quarter
    rental_data_all_suburb = rental_data_all_suburb[
        rental_data_all_suburb['Quarter'] == 4
    ]

    # III. drop obvious non-suburbs; copy so the column assignment below
    # writes to an independent frame rather than to a filtered slice.
    rental_data_all_suburb = rental_data_all_suburb[
        ~rental_data_all_suburb['Suburb'].isin(_NON_SUBURB_LABELS)
    ].copy()

    # Names without an entry in the fix table stay as they are; finally
    # everything is lower-cased.
    rental_data_all_suburb['Suburb'] = (
        rental_data_all_suburb['Suburb']
        .map(lambda name: _RENTAL_SUBURB_FIXES.get(name, name))
        .str.lower()
    )

    return rental_data_all_suburb