In [1]:
# import dependencies
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load data CSV
file_path = Path('Resources/reporteTecno.csv')
prices_df = pd.read_csv(file_path, encoding="ISO-8859-1")
prices_df.head()

Unnamed: 0,id,entidad,colonia,municipio,codigo_postal,tipo,m2Terreno,m2Construccion,recamaras,banos,medio_bano,Baños,estacionamientos,pisos,antiguedad,cuotaMantto,precioSalida,precioVenta
0,10293,CIUDAD DE MEXICO,Del Valle Norte,Benito JuÃ¡rez,3103,TERRENO,421.0,,,,,0.0,,,,,20000000,17000000
1,11382,CIUDAD DE MEXICO,Mixcoac,Benito JuÃ¡rez,3910,TERRENO,385.12,,,,,0.0,,,Mas de 30,,18500000,17000000
2,13132,CIUDAD DE MEXICO,Portales Sur,Benito JuÃ¡rez,3300,CASA,207.0,230.0,4.0,2.0,,2.0,3.0,1.0,Mas de 30,,6800000,6000000
3,17107,CIUDAD DE MEXICO,Jardines de CoyoacÃ¡n,CoyoacÃ¡n,4890,CASA,482.0,637.0,4.0,5.0,1.0,5.5,6.0,3.0,ENTRE 21 - 30 AÃOS,,15000000,14200000
4,17799,CIUDAD DE MEXICO,Paulino Navarro,CuauhtÃ©moc,6870,EDIFICIO,542.0,542.0,,,,0.0,,3.0,Mas de 30,,62330000,62330000


In [3]:
prices_df.dtypes

id                    int64
entidad              object
colonia              object
municipio            object
codigo_postal         int64
tipo                 object
m2Terreno            object
m2Construccion      float64
recamaras           float64
banos               float64
medio_bano          float64
Baños               float64
estacionamientos    float64
pisos               float64
antiguedad           object
cuotaMantto          object
precioSalida          int64
precioVenta           int64
dtype: object

In [4]:
prices_df = prices_df.drop(["id", "entidad", "colonia", "municipio", "estacionamientos", "antiguedad", "cuotaMantto", "banos", "medio_bano", "precioSalida"], axis=1)
prices_df.head()

Unnamed: 0,codigo_postal,tipo,m2Terreno,m2Construccion,recamaras,Baños,pisos,precioVenta
0,3103,TERRENO,421.0,,,0.0,,17000000
1,3910,TERRENO,385.12,,,0.0,,17000000
2,3300,CASA,207.0,230.0,4.0,2.0,1.0,6000000
3,4890,CASA,482.0,637.0,4.0,5.5,3.0,14200000
4,6870,EDIFICIO,542.0,542.0,,0.0,3.0,62330000


In [5]:
# Find null values
for column in prices_df.columns:
    print(f'Column {column} has {prices_df[column].isnull().sum()} null values')

Column codigo_postal has 0 null values
Column tipo has 0 null values
Column m2Terreno has 691 null values
Column m2Construccion has 90 null values
Column recamaras has 171 null values
Column Baños has 0 null values
Column pisos has 325 null values
Column precioVenta has 0 null values


In [6]:
tipo_counts = prices_df.tipo.value_counts()
tipo_counts

DEPARTAMENTO          660
CASA                  494
TERRENO                86
CASA EN CONDOMINIO     45
LOCAL COMERCIAL        28
EDIFICIO               23
OFICINA                16
PENT HOUSE             14
CASA DUPLEX            10
BODEGA                  8
Name: tipo, dtype: int64

In [7]:
# Combine similar terms
prices_df = prices_df.replace({"CASA DUPLEX": "CASA", "CASA EN CONDOMINIO": "CASA", "PENT HOUSE": "DEPARTAMENTO"})

In [8]:
tipo_counts = prices_df.tipo.value_counts()
tipo_counts

DEPARTAMENTO       674
CASA               549
TERRENO             86
LOCAL COMERCIAL     28
EDIFICIO            23
OFICINA             16
BODEGA               8
Name: tipo, dtype: int64

In [9]:
# Bin data
replace_tipo = list(tipo_counts[tipo_counts <50].index)
for tipo in replace_tipo:
    prices_df.tipo = prices_df.tipo.replace(tipo, "OTRO")
    
prices_df.tipo.value_counts()

DEPARTAMENTO    674
CASA            549
TERRENO          86
OTRO             75
Name: tipo, dtype: int64

In [10]:
prices_df.head()

Unnamed: 0,codigo_postal,tipo,m2Terreno,m2Construccion,recamaras,Baños,pisos,precioVenta
0,3103,TERRENO,421.0,,,0.0,,17000000
1,3910,TERRENO,385.12,,,0.0,,17000000
2,3300,CASA,207.0,230.0,4.0,2.0,1.0,6000000
3,4890,CASA,482.0,637.0,4.0,5.5,3.0,14200000
4,6870,OTRO,542.0,542.0,,0.0,3.0,62330000


In [11]:
# Replace NaN with 0 recámaras
prices_df["recamaras"] = prices_df["recamaras"].fillna(0)
prices_df.head()

Unnamed: 0,codigo_postal,tipo,m2Terreno,m2Construccion,recamaras,Baños,pisos,precioVenta
0,3103,TERRENO,421.0,,0.0,0.0,,17000000
1,3910,TERRENO,385.12,,0.0,0.0,,17000000
2,3300,CASA,207.0,230.0,4.0,2.0,1.0,6000000
3,4890,CASA,482.0,637.0,4.0,5.5,3.0,14200000
4,6870,OTRO,542.0,542.0,0.0,0.0,3.0,62330000


In [12]:
# Replace NaN with 1 pisos
prices_df["pisos"] = prices_df["pisos"].fillna(1)
prices_df.head()

Unnamed: 0,codigo_postal,tipo,m2Terreno,m2Construccion,recamaras,Baños,pisos,precioVenta
0,3103,TERRENO,421.0,,0.0,0.0,1.0,17000000
1,3910,TERRENO,385.12,,0.0,0.0,1.0,17000000
2,3300,CASA,207.0,230.0,4.0,2.0,1.0,6000000
3,4890,CASA,482.0,637.0,4.0,5.5,3.0,14200000
4,6870,OTRO,542.0,542.0,0.0,0.0,3.0,62330000


In [13]:
# Find null values
for column in prices_df.columns:
    print(f'Column {column} has {prices_df[column].isnull().sum()} null values')

Column codigo_postal has 0 null values
Column tipo has 0 null values
Column m2Terreno has 691 null values
Column m2Construccion has 90 null values
Column recamaras has 0 null values
Column Baños has 0 null values
Column pisos has 0 null values
Column precioVenta has 0 null values


In [14]:
# replace Nan in m2construccion with m2terreno
prices_df.m2Construccion.fillna(prices_df.m2Terreno, inplace=True)
prices_df.head()

Unnamed: 0,codigo_postal,tipo,m2Terreno,m2Construccion,recamaras,Baños,pisos,precioVenta
0,3103,TERRENO,421.0,421.0,0.0,0.0,1.0,17000000
1,3910,TERRENO,385.12,385.12,0.0,0.0,1.0,17000000
2,3300,CASA,207.0,230.0,4.0,2.0,1.0,6000000
3,4890,CASA,482.0,637.0,4.0,5.5,3.0,14200000
4,6870,OTRO,542.0,542.0,0.0,0.0,3.0,62330000


In [15]:
# Find null values
for column in prices_df.columns:
    print(f'Column {column} has {prices_df[column].isnull().sum()} null values')

Column codigo_postal has 0 null values
Column tipo has 0 null values
Column m2Terreno has 691 null values
Column m2Construccion has 4 null values
Column recamaras has 0 null values
Column Baños has 0 null values
Column pisos has 0 null values
Column precioVenta has 0 null values


In [16]:
# replace Nan in m2terreno with m2construccion
prices_df.m2Terreno.fillna(prices_df.m2Construccion, inplace=True)
prices_df.head()

Unnamed: 0,codigo_postal,tipo,m2Terreno,m2Construccion,recamaras,Baños,pisos,precioVenta
0,3103,TERRENO,421.0,421.0,0.0,0.0,1.0,17000000
1,3910,TERRENO,385.12,385.12,0.0,0.0,1.0,17000000
2,3300,CASA,207.0,230.0,4.0,2.0,1.0,6000000
3,4890,CASA,482.0,637.0,4.0,5.5,3.0,14200000
4,6870,OTRO,542.0,542.0,0.0,0.0,3.0,62330000


In [17]:
# Find null values
for column in prices_df.columns:
    print(f'Column {column} has {prices_df[column].isnull().sum()} null values')

Column codigo_postal has 0 null values
Column tipo has 0 null values
Column m2Terreno has 4 null values
Column m2Construccion has 4 null values
Column recamaras has 0 null values
Column Baños has 0 null values
Column pisos has 0 null values
Column precioVenta has 0 null values


In [18]:
# Drop null rows
prices_df = prices_df.dropna()

In [19]:
# Find null values
for column in prices_df.columns:
    print(f'Column {column} has {prices_df[column].isnull().sum()} null values')

Column codigo_postal has 0 null values
Column tipo has 0 null values
Column m2Terreno has 0 null values
Column m2Construccion has 0 null values
Column recamaras has 0 null values
Column Baños has 0 null values
Column pisos has 0 null values
Column precioVenta has 0 null values


In [20]:
prices_df.m2Terreno = pd.to_numeric(prices_df.m2Terreno, 'coerce')
prices_df.m2Construccion = pd.to_numeric(prices_df.m2Construccion, 'coerce')

In [21]:
from sklearn.preprocessing import LabelEncoder
# Use labelencoder to convert tipo into integers
prices_df['tipo'] = LabelEncoder().fit_transform(prices_df['tipo'])

prices_df.head()

Unnamed: 0,codigo_postal,tipo,m2Terreno,m2Construccion,recamaras,Baños,pisos,precioVenta
0,3103,3,421.0,421.0,0.0,0.0,1.0,17000000
1,3910,3,385.12,385.12,0.0,0.0,1.0,17000000
2,3300,0,207.0,230.0,4.0,2.0,1.0,6000000
3,4890,0,482.0,637.0,4.0,5.5,3.0,14200000
4,6870,2,542.0,542.0,0.0,0.0,3.0,62330000


In [22]:
# Define features set
X = prices_df.copy()
X = X.drop('precioVenta', axis=1)

In [23]:
# Define target
y = prices_df['precioVenta'].values

In [24]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [25]:
# Create StandardScaler instance
scaler = StandardScaler()

# Fit Standard Scaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [26]:
# Creating the decision tree classifier instance
model = RandomForestClassifier(n_estimators=500, random_state=1)

# Fitting the model
model = model.fit(X_train_scaled, y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').