In [391]:
import pandas as pd

# Raw table as read from disk; it is not modified below.
melb_data = pd.read_csv('data/melb_data_ps.csv', sep=',')

# Working frame: a separate DataFrame without the 'index' and 'Coordinates' columns.
melb_df = melb_data.drop(columns=['index', 'Coordinates'])

print(melb_data.columns)

# Check on the raw table: prints "None" when the 'Suburb' column is absent.
if 'Suburb' not in melb_data.columns:
    print(None)


Index(['index', 'Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method',
       'SellerG', 'Date', 'Distance', 'Postcode', 'Bedroom', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount', 'Coordinates'],
      dtype='object')


In [392]:
# Mean building area per room, where the room total is Rooms + Bedroom + Bathroom.
rooms, bedrooms, bathrooms = (melb_df[col] for col in ('Rooms', 'Bedroom', 'Bathroom'))
total_rooms = rooms + bedrooms + bathrooms
melb_df['MeanRoomsArea'] = melb_df['BuildingArea'] / total_rooms

# Normalised difference between building area and land size:
# (BuildingArea - Landsize) / (BuildingArea + Landsize).
building_area = melb_df['BuildingArea']
land_size = melb_df['Landsize']
diff_area = building_area - land_size
sum_area = building_area + land_size
melb_df['AreaRatio'] = diff_area / sum_area
display(melb_df['AreaRatio'])

0       -0.231707
1       -0.327660
2        0.056338
3        0.145455
4        0.083969
           ...   
13575   -0.676093
13576   -0.429185
13577   -0.551601
13578   -0.693060
13579   -0.527426
Name: AreaRatio, Length: 13580, dtype: float64

In [393]:
# Convert the 'Date' column to datetime; dayfirst=True tells pandas to read
# ambiguous dates as day/month rather than month/day.
parsed_dates = pd.to_datetime(melb_df['Date'], dayfirst=True)
melb_df['Date'] = parsed_dates

def get_street_type(adress):
    """Return the street-type token (e.g. 'St', 'Rd') of an address string.

    The street type is taken to be the last whitespace-separated token of
    the address. When that token is a bare compass suffix ('N', 'S', 'E' or
    'W') and another token precedes it, the preceding token is returned.

    Args:
        adress: Address string, e.g. '2/119 Railway St N'.

    Returns:
        The street-type token. If the address consists of a single token,
        that token is returned as is; if it contains no tokens at all, an
        empty string is returned.
    """
    compass_suffixes = ('S', 'W', 'N', 'E')

    tokens = adress.split()
    if not tokens:
        # Blank address: there is nothing to classify.
        return ''

    street_type = tokens[-1]

    # Compare the whole token, not just its last character, so a token that
    # merely ends in an upper-case S/W/N/E is not mistaken for a direction.
    # The length guard keeps a one-token address from indexing out of range.
    if street_type in compass_suffixes and len(tokens) > 1:
        street_type = tokens[-2]

    return street_type
        
# Street-type token extracted from every address.
street_types = melb_df['Address'].apply(get_street_type)

# The ten most frequent street types keep their own label.
stype_counts = street_types.value_counts()
popular_stypes = stype_counts.nlargest(10).index

# Every street type outside the popular set is collapsed into "other".
is_popular_stype = street_types.isin(popular_stypes)
melb_df['StreetType'] = street_types.where(is_popular_stype, "other")

# The raw address column is removed once the feature has been derived.
# NOTE(review): this makes the cell non-re-runnable on the same melb_df,
# because 'Address' no longer exists after the first run.
melb_df = melb_df.drop(columns='Address')

melb_df['StreetType'].unique()

array(['St', 'other', 'Rd', 'Gr', 'Ct', 'Dr', 'Pde', 'Pl', 'Cl', 'Cr',
       'Av'], dtype=object)

In [394]:
# Day-of-week number of each sale, taken from the datetime 'Date' column.
sale_dates = melb_df['Date']
melb_df['WeekdaySale'] = sale_dates.dt.weekday

def get_weekend(weekday):
    """Flag weekend days.

    Args:
        weekday: Day-of-week number; the values 5 and 6 count as weekend.

    Returns:
        1 if ``weekday`` is 5 or 6, otherwise 0.
    """
    return 1 if weekday in (5, 6) else 0

# Binary weekend indicator derived from the weekday number.
weekday_numbers = melb_df['WeekdaySale']
melb_df['Weekend'] = weekday_numbers.apply(get_weekend)

# Mean price over the rows flagged as weekend sales.
weekend_mask = melb_df['Weekend'] == 1
melb_df.loc[weekend_mask, 'Price'].mean()

1081198.6406956792

In [395]:
# The 49 most frequent values of 'SellerG'.
# NOTE(review): this cell later overwrites 'SellerG', so re-running it on the
# already-modified frame would count the 'other' label as a seller.
seller_counts = melb_df['SellerG'].value_counts()
popular_sellers = seller_counts.nlargest(49).index

def isSelletPopular(seller):
    """Map a seller name to itself if it is popular, otherwise to 'other'.

    Despite the ``is...`` name, this does not return a boolean.

    Args:
        seller: Seller name to look up in the module-level ``popular_sellers``.

    Returns:
        ``seller`` unchanged when it is contained in ``popular_sellers``,
        otherwise the string 'other'.
    """
    return seller if seller in popular_sellers else 'other'

# Relabel every seller outside the popular set as 'other'.
melb_df['SellerG'] = melb_df['SellerG'].apply(isSelletPopular)

# Ratio of the minimum price among 'Nelson' sales to the minimum among 'other' sales.
seller_labels = melb_df['SellerG']
nelson_prices = melb_df.loc[seller_labels == 'Nelson', 'Price']
other_prices = melb_df.loc[seller_labels == 'other', 'Price']
nelson_prices.min() / other_prices.min()

1.297709923664122