# Análisis de precios de casas
- ¿Qué factores afectan al precio de las casas?
    - ¿Qué factores incrementan el precio y en qué medida?
    - ¿Qué factores decrementan el precio y en qué medida?
- ¿Podemos predecir el precio de una casa?

# Data Understanding


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import numpy as np

In [None]:
# Read the housing dataset from a CSV file in the working directory.
csv_path = 'melb_data.csv'
house_data = pd.read_csv(csv_path)

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple.
house_data.shape

In [None]:
# Names of all columns, returned as a NumPy array.
house_data.columns.values

In [None]:
# Preview the first rows of the dataset (head() shows 5 rows by default).
house_data.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the dataset.
house_data.describe()

In [None]:
# Reference year used to turn construction years into ages; named once so
# both computations below stay in sync if it is ever updated.
REFERENCE_YEAR = 2022

# Age of the most recently built home (largest 'YearBuilt').
# max()/min() skip missing values by default.
newest_home_age = REFERENCE_YEAR - house_data['YearBuilt'].max()
# Age of the earliest built home (smallest 'YearBuilt').
oldest_home_age = REFERENCE_YEAR - house_data['YearBuilt'].min()

print("Newest home age: {0}".format(newest_home_age))
# Label matches the variable it reports (was "Old home age").
print("Oldest home age: {0}".format(oldest_home_age))

In [None]:
# Data type of each column.
house_data.dtypes

In [None]:
# Number of missing values in each column.
house_data.isnull().sum()

In [None]:
# Share of missing values per column, as a percentage of the total row count.
null_counts = house_data.isnull().sum()
row_count = house_data.shape[0]
missing = null_counts * 100 / row_count
missing

In [None]:
# Many values are missing in 'BuildingArea', 'YearBuilt' and 'CouncilArea',
# and a few in 'Car'; display those columns side by side.
house_data[['Car','BuildingArea','YearBuilt','CouncilArea']]

In [None]:
# Drop the rows that have no 'CouncilArea', since they do not represent a
# high percentage of the data.
house_data = house_data.dropna(subset=['CouncilArea'])

In [None]:
# Inspect the 'CouncilArea' column.
house_data['CouncilArea']

In [None]:
# Separate the target ('Price') from the features.
# The guard keeps this cell re-runnable: once 'Price' has been removed,
# executing it again leaves y and house_data untouched instead of failing
# on the missing column.
if 'Price' in house_data.columns:
    y = house_data['Price'].copy()
    # Rebind instead of dropping in place, so the frame is never mutated
    # through a reference that may still be shared.
    house_data = house_data.drop(columns='Price')
elif 'y' not in globals():
    # Neither the column nor a previously extracted target is available.
    raise KeyError("'Price' column not found in house_data")

In [None]:
# Columns stored as int64 or float64 are treated as numerical features;
# the original column order is preserved.
numeric_dtypes = ('int64', 'float64')
numerical_cols = [
    column
    for column, dtype in house_data.dtypes.items()
    if dtype in numeric_dtypes
]

In [None]:
# Columns stored with the generic 'object' dtype are treated as categorical
# features; the original column order is preserved.
# NOTE(review): every object column is selected regardless of how many
# distinct values it holds - confirm that is intended before encoding them.
object_dtypes = ('object',)
categorical_cols = [
    column
    for column, dtype in house_data.dtypes.items()
    if dtype in object_dtypes
]

In [None]:
# Feature matrix: categorical columns first, then numerical ones. The copy
# keeps later changes to X from touching house_data.
my_cols = [*categorical_cols, *numerical_cols]
X = house_data.loc[:, my_cols].copy()

In [None]:
# Numerical features: replace missing entries with a constant. No fill_value
# is passed, so scikit-learn's default for numeric data (0) is used.
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical features: fill gaps with the column's most frequent value, then
# one-hot encode; categories not seen during fit are ignored at transform.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns through its own transformer.
column_transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [None]:
# Chain the preprocessing with a random-forest regressor so the whole thing
# is fitted as one estimator inside each cross-validation fold.
forest = RandomForestRegressor(n_estimators=100, random_state=0)
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', forest),
])

# The scorer returns negated MAE, so multiply by -1 to get positive errors.
# error_score='raise' surfaces a failing fold instead of recording NaN.
neg_mae_per_fold = cross_val_score(
    my_pipeline,
    X,
    y,
    cv=5,
    scoring='neg_mean_absolute_error',
    error_score='raise',
)
scores = -1 * neg_mae_per_fold


In [None]:
# Report the mean of the per-fold scores.
average_score = scores.mean()
print("Average score:\n", average_score)