In [1]:
import numpy as np
import pandas as pd

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error

from sklearn.decomposition import PCA

In [2]:
properties = pd.read_csv('trainPreProcess.csv')
testing = pd.read_csv('testPreProcess.csv')

In [3]:
corr = properties.corr()
corr.sort_values(["price"], ascending = False, inplace = True)
print(corr.price)

price                       1.000000e+00
rooms                       9.802429e-02
surface_total_in_m2         9.626074e-02
floor                       3.890244e-02
lat                         3.526365e-02
Unnamed: 0                  2.109986e-02
place_name_encoded          1.402869e-02
surface_covered_in_m2       3.587126e-03
distancia_minima_publico    1.648668e-14
distancia_minima_privado    2.767686e-15
distancia_obelisco         -2.193523e-04
distancia_minima_subte     -1.530052e-03
state_name_encoded         -9.959947e-03
lon                        -1.226738e-02
property_type_encoded      -3.582329e-02
Name: price, dtype: float64


In [4]:
#tomo pocos datos porque se hace muy lento el algoritmo si no (por ahora)
trainorig = properties.sample(2000)
testorig = testing

In [5]:
#quito unnamed:0 y created_on, y dejo price al final, id al principio
#train = train.loc[:, ['id', 'property_type_encoded', 'place_name_encoded', 'state_name_encoded', 'lat', 'lon', 'surface_covered_in_m2', 'surface_total_in_m2', 'floor', 'rooms', 'distancia_obelisco', 'distancia_minima_subte', 'distancia_minima_privado', 'distancia_minima_publico', 'price']]

#OPTIMIZACION: quito distancias, lon, state_name_encoded y property_type_encoded
#train = train.loc[:, ['id', 'place_name_encoded', 'lat', 'surface_covered_in_m2', 'surface_total_in_m2', 'floor', 'rooms', 'price']]
#test = test.loc[:, ['id', 'place_name_encoded', 'lat', 'surface_covered_in_m2', 'surface_total_in_m2', 'floor', 'rooms']]

In [6]:
train = trainorig.loc[:, ['property_type_encoded', 'place_name_encoded', 'state_name_encoded', 'lat', 'lon', 'surface_covered_in_m2', 'surface_total_in_m2', 'floor', 'rooms', 'distancia_obelisco', 'distancia_minima_subte', 'distancia_minima_privado', 'distancia_minima_publico']]

pca = PCA(n_components=5)
pca.fit(train)

PCA(copy=True, n_components=5, whiten=False)
train_5d = pca.transform(train)
train_5d = pd.DataFrame(train_5d)
train_5d.index = train.index
train_5d.columns = ['dato1','dato2','dato3','dato4','dato5']
train_5d['price'] = trainorig['price']
train_5d.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 96280 to 100285
Data columns (total 6 columns):
dato1    2000 non-null float64
dato2    2000 non-null float64
dato3    2000 non-null float64
dato4    2000 non-null float64
dato5    2000 non-null float64
price    2000 non-null float64
dtypes: float64(6)
memory usage: 109.4 KB


In [7]:
test = testorig.loc[:, ['property_type_encoded', 'place_name_encoded', 'state_name_encoded', 'lat', 'lon', 'surface_covered_in_m2', 'surface_total_in_m2', 'floor', 'rooms', 'distancia_obelisco', 'distancia_minima_subte', 'distancia_minima_privado', 'distancia_minima_publico']]

pca = PCA(n_components=5)
pca.fit(test)

PCA(copy=True, n_components=5, whiten=False)
test_5d = pca.transform(test)
test_5d = pd.DataFrame(test_5d)
test_5d.index = test.index
test_5d.columns = ['dato1','dato2','dato3','dato4','dato5']
test_5d['id'] = testorig['id']
test_5d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14166 entries, 0 to 14165
Data columns (total 6 columns):
dato1    14166 non-null float64
dato2    14166 non-null float64
dato3    14166 non-null float64
dato4    14166 non-null float64
dato5    14166 non-null float64
id       14166 non-null int64
dtypes: float64(5), int64(1)
memory usage: 664.1 KB


In [8]:
#XGBoost funciona con matrices, no con Df
#train = train.as_matrix()
#test = test.as_matrix()

train = train_5d.as_matrix()
test = test_5d.as_matrix()

In [9]:
#OPTIMIZACION: quite n columnas
#X_train = train[:,0:14-n]
#Y_train = train[:,14-n+1]
#X_test = test[:,0:14-n]

X_train = train[:,0:4]
Y_train = train[:,5]
X_test = test[:,0:4]

In [10]:
#tomo los ids del test aparte para despues
#test_ids = X_test[:,0]
test_ids = test[:,5]

In [11]:
#quito los ids de los sets, XGBoost solo trabaja con floats e ints
#X_test = X_test[:,1:14]
#X_train = X_train[:,1:14]

#OPTIMIZACION: quite 3 columnas
#X_test = X_test[:,1:11]
#X_train = X_train[:,1:11]

In [12]:
#armo el modelo (va a tomarse su tiempo con mas de 1000)
model = XGBClassifier(n_estimators=100, max_depth=3)
model.fit(X_train, Y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [18]:
predictions = model.predict(X_test)

In [19]:
#score = mean_squared_error(Y_test, predictions)
#score

In [20]:
#median_absolute_error(Y_test, predictions)

In [21]:
#armo un csv con id, precio real, y precio predicho por XGBoost
predictionsflat = predictions.flatten()
idsflat = test_ids.flatten()
submissionDf = pd.DataFrame()
submissionDf['id'] = pd.Series(test_ids.astype('int32'))
submissionDf['price_usd'] = pd.Series(predictionsflat)
submissionDf.set_index(['id'], inplace=True)
submissionDf.to_csv('predicciones_XGB.csv')

In [22]:
submissionDf.head(10)

Unnamed: 0_level_0,price_usd
id,Unnamed: 1_level_1
3632,66000.0
3633,95000.0
2263404,66000.0
2263405,55000.0
2263406,62000.0
2263407,55000.0
2263408,55000.0
2263409,55000.0
2263410,132000.0
2263411,85000.0
