# Análisis de precios de casas
- ¿Qué factores afectan al precio de las casas?
    - ¿Qué factores incrementan el precio y en qué medida?
    - ¿Qué factores reducen el precio y en qué medida?
- ¿Podemos predecir el precio de una casa?

# Data Understanding


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import numpy as np

In [2]:
# Load the housing dataset from a CSV file located next to the notebook.
house_data = pd.read_csv('melb_data.csv')

In [3]:
# Size of the loaded frame as (rows, columns).
house_data.shape

(13580, 21)

In [4]:
# Column labels of the frame, shown as a NumPy array.
house_data.columns.values

array(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea',
       'Lattitude', 'Longtitude', 'Regionname', 'Propertycount'],
      dtype=object)

In [5]:
# Preview the first rows to get a feel for the values in each column.
house_data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
house_data.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


In [7]:
# Ages of the newest and oldest homes, measured against the hard-coded
# reference year 2022.
# NOTE(review): describe() reports a minimum YearBuilt of 1196, which is what
# produces the very large "oldest" age — probably a data-entry error; confirm.
built_years = house_data['YearBuilt']

# The latest construction year gives the smallest age; the earliest gives the largest.
newest_home_age = 2022 - built_years.max()
oldest_home_age = 2022 - built_years.min()

for template, age in (
    ("Newest home age: {0}", newest_home_age),
    ("Old home age: {0}", oldest_home_age),
):
    print(template.format(age))

Newest home age: 4.0
Old home age: 826.0


In [8]:
# Storage dtype of every column; used to tell numeric from text features.
house_data.dtypes

Suburb            object
Address           object
Rooms              int64
Type              object
Price            float64
Method            object
SellerG           object
Date              object
Distance         float64
Postcode         float64
Bedroom2         float64
Bathroom         float64
Car              float64
Landsize         float64
BuildingArea     float64
YearBuilt        float64
CouncilArea       object
Lattitude        float64
Longtitude       float64
Regionname        object
Propertycount    float64
dtype: object

In [9]:
# Number of missing values in each column.
house_data.isnull().sum()

Suburb              0
Address             0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

In [10]:
# Share of missing values per column, expressed as a percentage of all rows.
row_count = len(house_data)
missing = house_data.isnull().sum() * 100 / row_count
missing

Suburb            0.000000
Address           0.000000
Rooms             0.000000
Type              0.000000
Price             0.000000
Method            0.000000
SellerG           0.000000
Date              0.000000
Distance          0.000000
Postcode          0.000000
Bedroom2          0.000000
Bathroom          0.000000
Car               0.456554
Landsize          0.000000
BuildingArea     47.496318
YearBuilt        39.580265
CouncilArea      10.081001
Lattitude         0.000000
Longtitude        0.000000
Regionname        0.000000
Propertycount     0.000000
dtype: float64

In [11]:
# Many values are missing in 'BuildingArea', 'YearBuilt' and 'CouncilArea', and a few in 'Car'.
# Show just those four columns to inspect the gaps.
house_data[['Car','BuildingArea','YearBuilt','CouncilArea']]

Unnamed: 0,Car,BuildingArea,YearBuilt,CouncilArea
0,1.0,,,Yarra
1,0.0,79.0,1900.0,Yarra
2,0.0,150.0,1900.0,Yarra
3,1.0,,,Yarra
4,2.0,142.0,2014.0,Yarra
...,...,...,...,...
13575,2.0,,1981.0,
13576,2.0,133.0,1995.0,
13577,4.0,,1997.0,
13578,5.0,157.0,1920.0,


In [12]:
# Drop the rows that have no council area, since they do not make up a large
# share of the data. Note that this rebinds house_data to the filtered frame.
house_data = house_data.dropna(subset=['CouncilArea'])

In [13]:
# Inspect the column after the drop; the reported length reflects the remaining rows.
house_data['CouncilArea']

0              Yarra
1              Yarra
2              Yarra
3              Yarra
4              Yarra
            ...     
12208    Hobsons Bay
12209    Stonnington
12210     Whittlesea
12211    Maribyrnong
12212    Maribyrnong
Name: CouncilArea, Length: 12211, dtype: object

In [14]:
# Split the prediction target ('Price') from the feature columns.
#
# The guard makes the cell safe to re-run: once 'Price' has been removed from
# house_data, an unguarded `house_data.Price` raises AttributeError on the
# second execution. On a re-run the previously extracted `y` is kept as is.
if 'Price' in house_data.columns:
    y = house_data['Price'].copy()
    # Rebind to a new frame instead of mutating in place with inplace=True.
    house_data = house_data.drop(columns='Price')
elif 'y' not in globals():
    # Neither the column nor a previously extracted target exists: fail loudly
    # here rather than with a NameError in a later cell.
    raise KeyError("'Price' column not found in house_data")

In [15]:
# Names of the numeric feature columns (int64 or float64), in frame order.
numerical_cols = [
    col_name
    for col_name, col_dtype in house_data.dtypes.items()
    if col_dtype in ('int64', 'float64')
]

In [16]:
# Names of the text (object-dtype) feature columns, in frame order.
# NOTE(review): this picks up every object column, including 'Address' and
# 'Date'. 'Address' looks close to unique per row in the preview, so one-hot
# encoding it may add many near-empty columns — confirm this is intended.
categorical_cols = [
    col_name
    for col_name, col_dtype in house_data.dtypes.items()
    if col_dtype in ('object',)
]

In [17]:
# Feature matrix: categorical columns first, then numeric ones. The copy keeps
# X independent of later changes to house_data.
my_cols = [*categorical_cols, *numerical_cols]
X = house_data.loc[:, my_cols].copy()

In [18]:
# Numeric features: fill missing entries with a constant. No fill_value is
# passed, so the library default applies (0 for numeric data per the
# scikit-learn docs — confirm for the installed version).
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical features: fill missing entries with the most frequent value, then
# one-hot encode. handle_unknown='ignore' keeps categories that were not seen
# during fit from raising an error at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns through its own transformer.
column_transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [19]:
# Chain preprocessing and the regressor so both are fitted inside every
# cross-validation fold.
rf_model = RandomForestRegressor(n_estimators=100, random_state=0)
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', rf_model),
])

# The scorer returns negated MAE values, so flip the sign to get positive
# errors. error_score='raise' surfaces any fitting failure instead of hiding it.
neg_mae_per_fold = cross_val_score(
    my_pipeline,
    X,
    y,
    cv=5,
    scoring='neg_mean_absolute_error',
    error_score='raise',
)
scores = -neg_mae_per_fold


In [20]:
# Mean of the per-fold mean absolute errors, in the same units as the target.
print("Average score:\n", scores.mean())

Average score:
 178769.48714086748
