In [1]:
# import dependencies
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from sklearn.preprocessing import OneHotEncoder

In [27]:
# Read the property listings CSV into a DataFrame and preview the first rows.
# NOTE(review): accented text in the preview below is garbled (e.g. "JuÃ¡rez"), which
# suggests the file's values may not really be ISO-8859-1 — confirm the source encoding.
file_path = Path('Resources/reporteTecno.csv')
prices_df = pd.read_csv(filepath_or_buffer=file_path, encoding="ISO-8859-1")
prices_df.head(5)

Unnamed: 0,id,entidad,colonia,municipio,codigo_postal,tipo,m2Terreno,m2Construccion,recamaras,banos,medio_bano,Baños,estacionamientos,pisos,antiguedad,cuotaMantto,precioSalida,precioVenta
0,10293,CIUDAD DE MEXICO,Del Valle Norte,Benito JuÃ¡rez,3103,TERRENO,421.0,,,,,0.0,,,,,20000000,17000000
1,11382,CIUDAD DE MEXICO,Mixcoac,Benito JuÃ¡rez,3910,TERRENO,385.12,,,,,0.0,,,Mas de 30,,18500000,17000000
2,13132,CIUDAD DE MEXICO,Portales Sur,Benito JuÃ¡rez,3300,CASA,207.0,230.0,4.0,2.0,,2.0,3.0,1.0,Mas de 30,,6800000,6000000
3,17107,CIUDAD DE MEXICO,Jardines de CoyoacÃ¡n,CoyoacÃ¡n,4890,CASA,482.0,637.0,4.0,5.0,1.0,5.5,6.0,3.0,ENTRE 21 - 30 AÃOS,,15000000,14200000
4,17799,CIUDAD DE MEXICO,Paulino Navarro,CuauhtÃ©moc,6870,EDIFICIO,542.0,542.0,,,,0.0,,3.0,Mas de 30,,62330000,62330000


In [28]:
# Show the dtype pandas inferred for every column of the loaded frame.
prices_df.dtypes

id                    int64
entidad              object
colonia              object
municipio            object
codigo_postal         int64
tipo                 object
m2Terreno            object
m2Construccion      float64
recamaras           float64
banos               float64
medio_bano          float64
Baños               float64
estacionamientos    float64
pisos               float64
antiguedad           object
cuotaMantto          object
precioSalida          int64
precioVenta           int64
dtype: object

In [29]:
# Discard the columns that are not used further down in this notebook, then preview.
unused_columns = [
    "id",
    "entidad",
    "municipio",
    "codigo_postal",
    "estacionamientos",
    "antiguedad",
    "cuotaMantto",
    "banos",
    "medio_bano",
    "precioSalida",
]
prices_df = prices_df.drop(columns=unused_columns)
prices_df.head(5)

Unnamed: 0,colonia,tipo,m2Terreno,m2Construccion,recamaras,Baños,pisos,precioVenta
0,Del Valle Norte,TERRENO,421.0,,,0.0,,17000000
1,Mixcoac,TERRENO,385.12,,,0.0,,17000000
2,Portales Sur,CASA,207.0,230.0,4.0,2.0,1.0,6000000
3,Jardines de CoyoacÃ¡n,CASA,482.0,637.0,4.0,5.5,3.0,14200000
4,Paulino Navarro,EDIFICIO,542.0,542.0,,0.0,3.0,62330000


In [30]:
# Count the missing values per column and print one line for each column.
null_counts = prices_df.isnull().sum()
for column_name, n_null in null_counts.items():
    print(f'Column {column_name} has {n_null} null values')

Column colonia has 1 null values
Column tipo has 0 null values
Column m2Terreno has 691 null values
Column m2Construccion has 90 null values
Column recamaras has 171 null values
Column Baños has 0 null values
Column pisos has 325 null values
Column precioVenta has 0 null values


In [31]:
# Frequency of each property type (`tipo`) before any categories are merged.
tipo_counts = prices_df["tipo"].value_counts()
tipo_counts

DEPARTAMENTO          660
CASA                  494
TERRENO                86
CASA EN CONDOMINIO     45
LOCAL COMERCIAL        28
EDIFICIO               23
OFICINA                16
PENT HOUSE             14
CASA DUPLEX            10
BODEGA                  8
Name: tipo, dtype: int64

In [32]:
# Combine similar terms: fold the rarer property-type labels into their parent category.
# The mapping is applied to the `tipo` column only, so identical text appearing in any
# other column (for example a colonia name) is left untouched.
tipo_aliases = {"CASA DUPLEX": "CASA", "CASA EN CONDOMINIO": "CASA", "PENT HOUSE": "DEPARTAMENTO"}
prices_df = prices_df.assign(tipo=prices_df["tipo"].replace(tipo_aliases))

# Recount after merging so the effect of the mapping is visible.
tipo_counts = prices_df["tipo"].value_counts()
tipo_counts

DEPARTAMENTO       674
CASA               549
TERRENO             86
LOCAL COMERCIAL     28
EDIFICIO            23
OFICINA             16
BODEGA               8
Name: tipo, dtype: int64

In [25]:
# Recompute the per-type counts; the binning cell below reads `tipo_counts`.
# NOTE(review): the recorded output of this cell shows integer labels and an older
# execution count, so it looks stale relative to the surrounding cells — re-run to refresh.
tipo_counts = prices_df["tipo"].value_counts()
tipo_counts

1    671
0    548
3     85
2     75
Name: tipo, dtype: int64

In [33]:
# Bin data: collapse every property type with fewer than MIN_TIPO_COUNT rows into "OTRO".
MIN_TIPO_COUNT = 50

# Recompute the counts here instead of relying on whatever an earlier cell left in
# `tipo_counts`, so the cell gives the same result regardless of execution order.
tipo_counts = prices_df["tipo"].value_counts()
replace_tipo = list(tipo_counts[tipo_counts < MIN_TIPO_COUNT].index)

# Single vectorised pass; also a no-op when no type falls below the threshold.
prices_df["tipo"] = prices_df["tipo"].mask(prices_df["tipo"].isin(replace_tipo), "OTRO")

prices_df["tipo"].value_counts()

DEPARTAMENTO    674
CASA            549
TERRENO          86
OTRO             75
Name: tipo, dtype: int64

In [34]:
# Preview the frame after the rare property types were binned.
prices_df.head(5)

Unnamed: 0,colonia,tipo,m2Terreno,m2Construccion,recamaras,Baños,pisos,precioVenta
0,Del Valle Norte,TERRENO,421.0,,,0.0,,17000000
1,Mixcoac,TERRENO,385.12,,,0.0,,17000000
2,Portales Sur,CASA,207.0,230.0,4.0,2.0,1.0,6000000
3,Jardines de CoyoacÃ¡n,CASA,482.0,637.0,4.0,5.5,3.0,14200000
4,Paulino Navarro,OTRO,542.0,542.0,,0.0,3.0,62330000


In [36]:
# Fill missing values with fixed defaults: 0 bedrooms (recamaras) and 1 floor (pisos).
prices_df = prices_df.assign(
    recamaras=prices_df["recamaras"].fillna(0),
    pisos=prices_df["pisos"].fillna(1),
)
prices_df.head(5)

Unnamed: 0,colonia,tipo,m2Terreno,m2Construccion,recamaras,Baños,pisos,precioVenta
0,Del Valle Norte,TERRENO,421.0,,0.0,0.0,1.0,17000000
1,Mixcoac,TERRENO,385.12,,0.0,0.0,1.0,17000000
2,Portales Sur,CASA,207.0,230.0,4.0,2.0,1.0,6000000
3,Jardines de CoyoacÃ¡n,CASA,482.0,637.0,4.0,5.5,3.0,14200000
4,Paulino Navarro,OTRO,542.0,542.0,0.0,0.0,3.0,62330000


In [37]:
# Re-count missing values per column after the recamaras/pisos defaults were applied.
null_counts = prices_df.isnull().sum()
for column_name, n_null in null_counts.items():
    print(f'Column {column_name} has {n_null} null values')

Column colonia has 1 null values
Column tipo has 0 null values
Column m2Terreno has 691 null values
Column m2Construccion has 90 null values
Column recamaras has 0 null values
Column Baños has 0 null values
Column pisos has 0 null values
Column precioVenta has 0 null values


In [38]:
# Fill each missing area from the other one: m2Construccion takes m2Terreno where it is
# missing, then m2Terreno takes the (already filled) m2Construccion where it is missing.
# Rows where both are missing stay null.
#
# The result is assigned back to the column explicitly. Calling
# `prices_df.<col>.fillna(..., inplace=True)` operates on an intermediate Series, which
# raises a chained-assignment warning in recent pandas and leaves the frame unchanged
# under Copy-on-Write.
#
# m2Terreno is an object column at this point (see the dtypes output above), so
# m2Construccion may temporarily hold non-float values until it is converted to numeric.
prices_df["m2Construccion"] = prices_df["m2Construccion"].fillna(prices_df["m2Terreno"])
prices_df["m2Terreno"] = prices_df["m2Terreno"].fillna(prices_df["m2Construccion"])
prices_df.head()

Unnamed: 0,colonia,tipo,m2Terreno,m2Construccion,recamaras,Baños,pisos,precioVenta
0,Del Valle Norte,TERRENO,421.0,421.0,0.0,0.0,1.0,17000000
1,Mixcoac,TERRENO,385.12,385.12,0.0,0.0,1.0,17000000
2,Portales Sur,CASA,207.0,230.0,4.0,2.0,1.0,6000000
3,Jardines de CoyoacÃ¡n,CASA,482.0,637.0,4.0,5.5,3.0,14200000
4,Paulino Navarro,OTRO,542.0,542.0,0.0,0.0,3.0,62330000


In [39]:
# Re-count missing values per column after the two area columns were cross-filled.
null_counts = prices_df.isnull().sum()
for column_name, n_null in null_counts.items():
    print(f'Column {column_name} has {n_null} null values')

Column colonia has 1 null values
Column tipo has 0 null values
Column m2Terreno has 4 null values
Column m2Construccion has 4 null values
Column recamaras has 0 null values
Column Baños has 0 null values
Column pisos has 0 null values
Column precioVenta has 0 null values


In [40]:
# Convert both area columns to numeric; values that cannot be parsed become NaN.
for area_column in ("m2Terreno", "m2Construccion"):
    prices_df[area_column] = pd.to_numeric(prices_df[area_column], errors="coerce")

In [42]:
# Re-count missing values per column after the numeric conversion of the area columns.
null_counts = prices_df.isnull().sum()
for column_name, n_null in null_counts.items():
    print(f'Column {column_name} has {n_null} null values')

Column colonia has 0 null values
Column tipo has 0 null values
Column m2Terreno has 0 null values
Column m2Construccion has 0 null values
Column recamaras has 0 null values
Column Baños has 0 null values
Column pisos has 0 null values
Column precioVenta has 0 null values


In [41]:
# Remove every row that still has a missing value in any column.
prices_df = prices_df.dropna(axis=0, how="any")

In [19]:
# Final check: every column should now report zero missing values.
null_counts = prices_df.isnull().sum()
for column_name, n_null in null_counts.items():
    print(f'Column {column_name} has {n_null} null values')

Column colonia has 0 null values
Column tipo has 0 null values
Column m2Terreno has 0 null values
Column m2Construccion has 0 null values
Column recamaras has 0 null values
Column Baños has 0 null values
Column pisos has 0 null values
Column precioVenta has 0 null values


In [43]:
from sklearn.preprocessing import LabelEncoder

# Convert the categorical columns (tipo, colonia) into integer codes.
# Each fitted encoder is kept in `label_encoders`, so a code can be mapped back to its
# original label with `label_encoders[column].inverse_transform(...)`.
# NOTE(review): the integer codes are arbitrary identifiers, not an ordering of the
# categories; confirm that the downstream model does not treat them as ordered quantities.
label_encoders = {}
for categorical_column in ('tipo', 'colonia'):
    encoder = LabelEncoder()
    prices_df[categorical_column] = encoder.fit_transform(prices_df[categorical_column])
    label_encoders[categorical_column] = encoder

prices_df.head()

Unnamed: 0,colonia,tipo,m2Terreno,m2Construccion,recamaras,Baños,pisos,precioVenta
0,79,3,421.0,421.0,0.0,0.0,1.0,17000000
1,207,3,385.12,385.12,0.0,0.0,1.0,17000000
2,266,0,207.0,230.0,4.0,2.0,1.0,6000000
3,151,0,482.0,637.0,4.0,5.5,3.0,14200000
4,264,2,542.0,542.0,0.0,0.0,3.0,62330000


In [46]:
# Target: the sale price column. Features: every remaining column.
y = prices_df.loc[:, 'precioVenta']
X = prices_df.drop('precioVenta', axis=1)

In [63]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [64]:
# Print the shape of each split, one per line, in the order X_train, X_test, y_train, y_test.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(1088, 7)
(273, 7)
(1088,)
(273,)


In [65]:
# Fit the scaler on the training features only, then apply that same fitted
# transformation to both the training and the test features.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(feature_part) for feature_part in (X_train, X_test)
)

In [66]:
# Build a random forest classifier (500 trees, fixed seed) and fit it on the scaled
# training features.
# NOTE(review): the target is precioVenta, an int64 price column, so each distinct price
# is treated as its own class; a regressor is probably the intended estimator — confirm.
model = RandomForestClassifier(n_estimators=500, random_state=1).fit(X_train_scaled, y_train)

In [67]:
# Predict the target for every row of the scaled test features.
predictions = model.predict(X=X_test_scaled)

In [68]:
# Fraction of test rows whose predicted value equals the true value exactly.
# NOTE(review): with a price target this exact-match score is very low in the recorded
# run; an error-based regression metric would likely be more informative — confirm intent.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)
acc_score

0.03296703296703297

In [None]:
# Print the accuracy computed above, then scikit-learn's per-class report.
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
report_text = classification_report(y_test, predictions)
print(report_text)