In [157]:
import pandas as pd

# Read the raw table from disk; `melb_data` itself is not modified below.
melb_data = pd.read_csv('data/melb_data_ps.csv', sep=',')

# Working copy with the 'index' and 'Coordinates' columns removed.
melb_df = melb_data.copy().drop(columns=['index', 'Coordinates'])

print(melb_data.columns)

# Sanity check: prints None when the 'Suburb' column is absent.
if 'Suburb' not in melb_data.columns:
    print(None)


Index(['index', 'Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method',
       'SellerG', 'Date', 'Distance', 'Postcode', 'Bedroom', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount', 'Coordinates'],
      dtype='object')


In [158]:
# Per-row sum of the 'Rooms', 'Bedroom' and 'Bathroom' columns.
total_rooms = melb_df['Rooms'].add(melb_df['Bedroom']).add(melb_df['Bathroom'])

# Building area divided by that combined room count.
melb_df['MeanRoomsArea'] = melb_df['BuildingArea'].div(total_rooms)


# AreaRatio = (BuildingArea - Landsize) / (BuildingArea + Landsize)
building_area, land_size = melb_df['BuildingArea'], melb_df['Landsize']
diff_area = building_area.sub(land_size)
sum_area = building_area.add(land_size)
melb_df['AreaRatio'] = diff_area.div(sum_area)
display(melb_df['AreaRatio'])

0       -0.231707
1       -0.327660
2        0.056338
3        0.145455
4        0.083969
           ...   
13575   -0.676093
13576   -0.429185
13577   -0.551601
13578   -0.693060
13579   -0.527426
Name: AreaRatio, Length: 13580, dtype: float64

In [159]:
# Convert the 'Date' column to datetimes, reading ambiguous dates day-first.
parsed_dates = pd.to_datetime(melb_df['Date'], dayfirst=True)
melb_df['Date'] = parsed_dates

def get_street_type(adress):
    """Extract the street-type token (e.g. 'St', 'Rd') from an address string.

    The address is split on whitespace and the last token is taken as the
    street type. If that token ends with one of the letters 'S', 'W', 'N'
    or 'E', it is treated as a direction suffix and the token before it is
    returned instead (when such a token exists).

    Args:
        adress: Address string. The parameter name is kept as-is so that
            keyword callers keep working.

    Returns:
        The street-type token, or an empty string when the address contains
        no tokens at all.
    """
    direction_suffixes = ('S', 'W', 'N', 'E')

    tokens = adress.split()

    # Empty or whitespace-only address: there is no last token to inspect.
    if not tokens:
        return ''

    street_type = tokens[-1]

    # Step back one token only if there actually is a preceding token;
    # a lone token is returned unchanged instead of raising IndexError.
    if street_type[-1] in direction_suffixes and len(tokens) > 1:
        street_type = tokens[-2]

    return street_type
        
street_types = melb_df['Address'].apply(get_street_type)

# Keep the ten most frequent street types; every other value becomes "other".
popular_stypes = street_types.value_counts().nlargest(10).index
is_popular = street_types.isin(popular_stypes)
melb_df['StreetType'] = street_types.where(is_popular, "other")

# Drop the 'Address' column now that 'StreetType' has been derived from it.
melb_df = melb_df.drop(columns='Address')

melb_df['StreetType'].unique()

array(['St', 'other', 'Rd', 'Gr', 'Ct', 'Dr', 'Pde', 'Pl', 'Cl', 'Cr',
       'Av'], dtype=object)

In [160]:
# Day-of-week number of each sale date (pandas convention: Monday=0 ... Sunday=6).
sale_dates = melb_df['Date']
melb_df['WeekdaySale'] = sale_dates.dt.dayofweek

def get_weekend(weekday):
    """Return 1 if ``weekday`` is 5 or 6, otherwise 0.

    With pandas ``.dt.weekday`` numbering (Monday=0), 5 and 6 correspond to
    Saturday and Sunday.
    """
    return int(weekday in (5, 6))

melb_df['Weekend'] = melb_df['WeekdaySale'].map(get_weekend)

# Mean price over the rows flagged as weekend sales.
weekend_sales = melb_df['Weekend'] == 1
melb_df.loc[weekend_sales, 'Price'].mean()

1081198.6406956792

In [161]:
# Index holding the 49 most frequent values of the 'SellerG' column.
seller_counts = melb_df['SellerG'].value_counts()
popular_sellers = seller_counts.nlargest(49).index

def isSelletPopular(seller):
    """Return ``seller`` unchanged if it is in ``popular_sellers``, else ``'other'``.

    Reads the module-level ``popular_sellers`` collection at call time.
    """
    return seller if seller in popular_sellers else 'other'

# Replace every seller outside `popular_sellers` with the label 'other'.
bucketed_sellers = melb_df['SellerG'].map(isSelletPopular)
melb_df['SellerG'] = bucketed_sellers

melb_df

Unnamed: 0,Suburb,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom,...,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount,MeanRoomsArea,AreaRatio,StreetType,WeekdaySale,Weekend
0,Abbotsford,2,h,1480000.0,S,Biggin,2016-12-03,2.5,3067,2,...,Yarra,-37.79960,144.99840,Northern Metropolitan,4019,25.200000,-0.231707,St,5,1
1,Abbotsford,2,h,1035000.0,S,Biggin,2016-02-04,2.5,3067,2,...,Yarra,-37.80790,144.99340,Northern Metropolitan,4019,15.800000,-0.327660,St,3,0
2,Abbotsford,3,h,1465000.0,SP,Biggin,2017-03-04,2.5,3067,3,...,Yarra,-37.80930,144.99440,Northern Metropolitan,4019,18.750000,0.056338,St,5,1
3,Abbotsford,3,h,850000.0,PI,Biggin,2017-03-04,2.5,3067,3,...,Yarra,-37.79690,144.99690,Northern Metropolitan,4019,15.750000,0.145455,other,5,1
4,Abbotsford,4,h,1600000.0,VB,Nelson,2016-06-04,2.5,3067,3,...,Yarra,-37.80720,144.99410,Northern Metropolitan,4019,17.750000,0.083969,St,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13575,Wheelers Hill,4,h,1245000.0,S,Barry,2017-08-26,16.7,3150,4,...,,-37.90562,145.16761,South-Eastern Metropolitan,7392,12.600000,-0.676093,Cr,5,1
13576,Williamstown,3,h,1031000.0,SP,Williams,2017-08-26,6.8,3016,3,...,,-37.85927,144.87904,Western Metropolitan,6380,16.625000,-0.429185,Dr,5,1
13577,Williamstown,3,h,1170000.0,S,Raine,2017-08-26,6.8,3016,3,...,,-37.85274,144.88738,Western Metropolitan,6380,15.750000,-0.551601,St,5,1
13578,Williamstown,4,h,2500000.0,PI,Sweeney,2017-08-26,6.8,3016,4,...,,-37.85908,144.89299,Western Metropolitan,6380,17.444444,-0.693060,St,5,1
