In [80]:
import pandas as pd

In [81]:
df = pd.read_csv("./Property Sales of Melbourne City.csv",index_col=0)

- Suburb: Name of the suburb where the property is located
- Address: Street address of the property
- Rooms: Number of rooms in the property (excluding bathrooms and other non-living spaces)
- Price: Sale price of the property in Australian dollars (AUD)
- Method: Method of sale (e.g., S = property sold, SP = property sold prior, PI = property passed in, PN = sold prior not disclosed, SN = sold not disclosed, VB = vendor bid, W = withdrawn prior to auction, SA = sold after auction)
- Type: Type of property (e.g., h = house, t = townhouse, u = unit/apartment)
- SellerG: Real estate agency or agent handling the sale
- Date: Date of the sale
- Distance: Distance from the property to Melbourne central business district (CBD) in kilometers
- Regionname: Name of the region where the property is located (e.g., Eastern Metropolitan, Northern Metropolitan, Southern Metropolitan, Western Metropolitan)
- Propertycount: Number of properties that exist in the suburb
- Bedroom2: Number of bedrooms in the property (including any non-living spaces that could be used as bedrooms)
- Bathroom: Number of bathrooms in the property
- Car: Number of car spaces associated with the property
- Landsize: Land size of the property in square meters
- BuildingArea: Total building area of the property in square meters
- CouncilArea: Name of the local government area where the property is located

In [82]:
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
1,Abbotsford,85 Turner St,2,h,1480000,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
5,Abbotsford,40 Federation La,3,h,850000,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
6,Abbotsford,55a Park St,4,h,1600000,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [83]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18396 entries, 1 to 23546
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         18396 non-null  object 
 1   Address        18396 non-null  object 
 2   Rooms          18396 non-null  int64  
 3   Type           18396 non-null  object 
 4   Price          18396 non-null  int64  
 5   Method         18396 non-null  object 
 6   SellerG        18396 non-null  object 
 7   Date           18396 non-null  object 
 8   Distance       18395 non-null  float64
 9   Postcode       18395 non-null  float64
 10  Bedroom2       14927 non-null  float64
 11  Bathroom       14925 non-null  float64
 12  Car            14820 non-null  float64
 13  Landsize       13603 non-null  float64
 14  BuildingArea   7762 non-null   float64
 15  YearBuilt      8958 non-null   float64
 16  CouncilArea    12233 non-null  object 
 17  Lattitude      15064 non-null  float64
 18  Longtitude 

In [84]:
df.isna().sum()

Suburb               0
Address              0
Rooms                0
Type                 0
Price                0
Method               0
SellerG              0
Date                 0
Distance             1
Postcode             1
Bedroom2          3469
Bathroom          3471
Car               3576
Landsize          4793
BuildingArea     10634
YearBuilt         9438
CouncilArea       6163
Lattitude         3332
Longtitude        3332
Regionname           1
Propertycount        1
dtype: int64

In [85]:
col_to_use = ['Suburb','Rooms','Type','Method',"SellerG","Regionname","Propertycount","Distance","CouncilArea","Bedroom2","Bathroom","Car","Landsize","BuildingArea","Price"]

dataset = df[col_to_use]
dataset['CouncilArea'].unique()

array(['Yarra', nan, 'Moonee Valley', 'Port Phillip', 'Darebin',
       'Hobsons Bay', 'Stonnington', 'Boroondara', 'Monash', 'Glen Eira',
       'Whitehorse', 'Maribyrnong', 'Bayside', 'Moreland', 'Manningham',
       'Banyule', 'Melbourne', 'Kingston', 'Brimbank', 'Hume', 'Knox',
       'Maroondah', 'Casey', 'Melton', 'Greater Dandenong', 'Nillumbik',
       'Whittlesea', 'Frankston', 'Macedon Ranges', 'Yarra Ranges',
       'Wyndham', 'Cardinia', 'Unavailable', 'Moorabool'], dtype=object)

In [86]:
dataset['Landsize'] = dataset['Landsize'].fillna(dataset['Landsize'].mean())
dataset['BuildingArea'] = dataset['BuildingArea'].fillna(dataset['BuildingArea'].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['Landsize'] = dataset['Landsize'].fillna(dataset['Landsize'].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['BuildingArea'] = dataset['BuildingArea'].fillna(dataset['BuildingArea'].mean())


In [87]:
col_to_zero = ['Propertycount','Distance','Bedroom2','Bathroom','Car']
dataset[col_to_zero] = dataset[col_to_zero].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset[col_to_zero] = dataset[col_to_zero].fillna(0)


In [88]:
dataset.dropna(subset=['Regionname'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset.dropna(subset=['Regionname'],inplace=True)


In [89]:
dataset['CouncilArea'].ffill(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['CouncilArea'].ffill(inplace=True)


In [90]:
dataset.isna().sum()

Suburb           0
Rooms            0
Type             0
Method           0
SellerG          0
Regionname       0
Propertycount    0
Distance         0
CouncilArea      0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
BuildingArea     0
Price            0
dtype: int64

In [91]:
dataset = pd.get_dummies(dataset,drop_first=True)

In [92]:
dataset.head()

Unnamed: 0,Rooms,Propertycount,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price,Suburb_Aberfeldie,...,CouncilArea_Moreland,CouncilArea_Nillumbik,CouncilArea_Port Phillip,CouncilArea_Stonnington,CouncilArea_Unavailable,CouncilArea_Whitehorse,CouncilArea_Whittlesea,CouncilArea_Wyndham,CouncilArea_Yarra,CouncilArea_Yarra Ranges
1,2,4019.0,2.5,2.0,1.0,1.0,202.0,151.220219,1480000,False,...,False,False,False,False,False,False,False,False,True,False
2,2,4019.0,2.5,2.0,1.0,0.0,156.0,79.0,1035000,False,...,False,False,False,False,False,False,False,False,True,False
4,3,4019.0,2.5,3.0,2.0,0.0,134.0,150.0,1465000,False,...,False,False,False,False,False,False,False,False,True,False
5,3,4019.0,2.5,3.0,2.0,1.0,94.0,151.220219,850000,False,...,False,False,False,False,False,False,False,False,True,False
6,4,4019.0,2.5,3.0,1.0,2.0,120.0,142.0,1600000,False,...,False,False,False,False,False,False,False,False,True,False


In [93]:
from sklearn.model_selection import train_test_split

X = dataset.drop(columns=['Price'])
Y = dataset['Price']

X_Train,X_test, y_train, y_test = train_test_split(X,Y,test_size=0.2,random_state=42)

In [94]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

In [95]:
model.fit(X_Train,y_train)

In [96]:
model.score(X_test,y_test)

0.6627938304117118

### Regularization

In [97]:
from sklearn.linear_model import Lasso

In [98]:
lasso_reg = Lasso(alpha=50,max_iter=100,tol=0.1)

lasso_reg.fit(X_Train,y_train)

  model = cd_fast.enet_coordinate_descent(


In [100]:
lasso_reg.score(X_Train,y_train)

0.6843687814875088

In [99]:
lasso_reg.score(X_test,y_test)

0.668178826878115