In [3]:
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np

In [12]:
#---------------------------------------------------------
# DELETE TRASH COLUMNS
#---------------------------------------------------------
def deleteTrashColumns(df):
    columns = list(df.columns.values)
    for column in columns:
        if "unnamed" in column.lower():
            df.drop(column, axis = 1, inplace = True) 
    return df
#---------------------------------------------------------
# PROCESS TRAIN SET
#---------------------------------------------------------
def ProcessTrainSet(df, withPrice):
    df.dropna(axis=0, how='any', subset=list(df.columns.values), inplace=True)
    
    target = df['price_aprox_usd'].tolist()
    if withPrice:
        df.drop('price_aprox_usd', axis = 1, inplace = True)
    
    columns = list(df.columns.values)
    data = list(df.values)
    return data, target
#---------------------------------------------------------
# GET DATA AS LIST
#---------------------------------------------------------
def GetDataAsList(df):
    columns = list(df.columns.values)
    data = list(df.values)
    return data
#---------------------------------------------------------
# PRINT SCORE
#---------------------------------------------------------
def printScore(prediction, target):
    err = mean_squared_error(prediction, target)
    print "mean_squared_error: ", err
    digitsQuantity = len(str(err).split('.')[0])
    print "digitsQuantity: ", digitsQuantity
#---------------------------------------------------------
# SAVE FINAL DF
#---------------------------------------------------------
def saveFinalDF(predictions, ids):
    aData = {'price_usd': predictions}
    final = pd.DataFrame(data = aData, index=ids)
    final.to_csv("properati_dataset_sample_submision.csv", \
    index = True, header = True, sep = ',', encoding = 'utf-8-sig')
    return final
#---------------------------------------------------------
# FIT AND GET INFO
#---------------------------------------------------------
def fitAndGetInfo(model, xTrain, yTrain, xTest, yTest):
    model.fit(xTrain, yTrain)
    prediction = model.predict(xTest)
    print "precision test: ", model.score(xTest, yTest)
    print "best params: ", model.best_params_
    print "best score: ", model.best_score_ 
    print "best estimator: ", model.best_estimator_
    printScore(prediction, yTest)

# Set de entrenamiento

In [5]:
df = pd.read_csv("dataTrain.csv", low_memory = False)
df = deleteTrashColumns(df)
dataTrain, targetTrain = ProcessTrainSet(df, True)

In [6]:
columns = list(df.columns.values)
print "data size: ", len(dataTrain)
print "target size: ", len(targetTrain)
print "columns size: ", len(columns)

data size:  248616
target size:  248616
columns size:  8


# Set de test

In [7]:
predictDf = pd.read_csv("dataTest.csv", low_memory = False)
ids = predictDf['id']
predictDf.drop('id', axis = 1, inplace = True)
predictDf = deleteTrashColumns(predictDf)
data = GetDataAsList(predictDf)

In [8]:
columns = list(predictDf.columns.values)
print "data size: ", len(data)
print "df size: ", len(predictDf.index)
print "columns size: ", len(columns)
#row should be 14166

data size:  14166
df size:  14166
columns size:  8


# Split

In [9]:
xTrain, xTest, yTrain, yTest = train_test_split(dataTrain, targetTrain, test_size=0.5, random_state=0)

# Prediccion 

# Linear Regression

In [18]:
from sklearn.linear_model import LinearRegression

In [34]:
params = {
    'normalize': [True, False],
    'fit_intercept': [True, False],
    'copy_X': [True, False],
    'n_jobs': [i for i in range(1, 10)]
}
LRGS = GridSearchCV(LinearRegression(), params, cv=3, n_jobs=5, verbose=1, scoring=None, refit=True)

In [35]:
fitAndGetInfo(LRGS, xTrain, yTrain, xTest, yTest)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:  4.3min
[Parallel(n_jobs=5)]: Done 190 tasks      | elapsed: 20.3min
[Parallel(n_jobs=5)]: Done 216 out of 216 | elapsed: 23.1min finished


precision test:  0.315472319773
best params:  {'copy_X': True, 'normalize': True, 'n_jobs': 1, 'fit_intercept': True}
best score:  -0.434406332092
best estimator:  LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)
mean_squared_error:  106800867896.0
digitsQuantity:  12


In [36]:
lrPrediciton = lr.predict(data)
final = saveFinalDF(lrPrediciton, ids)

In [37]:
final.head()

Unnamed: 0_level_0,price_usd
id,Unnamed: 1_level_1
3632,-284336.8
3633,-4285213.0
2263404,-563177.6
2263405,-769942.7
2263406,-693813.2


# Transformacion no lineal a lineal

#  (Este metodo da precios demasiado altos y es peor que el lineal hecho en el paso anterior)

Si tomamos como ejemplo una función f que toma la forma :  f(x) = a + bx + cx²

La función f es no lineal en función de x pero si es lineal en función de los parámetros desconocidos a, b, y c. O visto de otra manera: podemos sustituir nuestras variables x por un array z tal que: z = [1, x, x²]. Con el que podríamos reescribir nuestra función f como f(z) = a z0 + bz1 + c*z2

Scikit-learn tiene un objeto PolynomialFeatures que nos va a servir para convertir nuestra variable x en un array z del tipo z = [1, x, x2, …, n^n], que es lo que nos interesa.

El resultado de esa transformación se la pasamos a nuestro modelo Ridge. Para facilitar la tarea en este tipo de casos —donde se realizan varios pasos que van desde el pre-tratamiento de los datos hasta un posible post-tratamiento pasando por el entrenamiento—, podemos hacer uso de las Pipeline que nos permiten encadenar multiples estimadores en uno. Esto es especialmente útil cuando hay secuencia de pasos predefinidos en el procesado de datos con, por ejemplo, selección de atributos, normalización y clasificación.

In [186]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

In [None]:
params = {
    'normalize': [True, False],
    'fit_intercept': [True, False],
    'copy_X': [True, False],
    'n_jobs': [i for i in range(1, 10)]
}
PFGS = GridSearchCV(LinearRegression(), params, cv=3, n_jobs=5, verbose=1, scoring=None, refit=True)

In [None]:
poly = PolynomialFeatures(degree = 2)
zTrain = poly.fit_transform(xTrain)
zTest = poly.fit_transform(xTest)

In [None]:
fitAndGetInfo(PFGS, zTrain, yTrain, zTest, yTest)

In [189]:
DataTransformed = poly.fit_transform(data)
predictions = PFGS.predict(DataTransformed)
final = saveFinalDF(predictions, ids)

In [190]:
final.head()

Unnamed: 0_level_0,price_usd
id,Unnamed: 1_level_1
3632,-1.30915e+22
3633,-2.594522e+23
2263404,-1.63558e+22
2263405,-1.460626e+22
2263406,-1.452099e+22


# Decicion Tree

In [13]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

In [14]:
columns = list(predictDf.columns.values)
maxFeatures = len(columns)
params = {
    'max_depth': [5, 15, 25, 35, 45, 55, 67, 76, 85, 95, 100],
    #'criterion': ["mse", "friedman_mse", "mae"]
    'max_features': [i for i in range(1, maxFeatures)]
}
DTGS = GridSearchCV(DecisionTreeRegressor(), params, cv=3, n_jobs=1, verbose=1, scoring=None, refit=True)

In [15]:
fitAndGetInfo(DTGS, xTrain, yTrain, xTest, yTest)

Fitting 3 folds for each of 77 candidates, totalling 231 fits


[Parallel(n_jobs=1)]: Done 231 out of 231 | elapsed:  2.0min finished


precision test:  0.816261338573
best params:  {'max_features': 6, 'max_depth': 95}
best score:  0.903296117411
best estimator:  DecisionTreeRegressor(criterion='mse', max_depth=95, max_features=6,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')
mean_squared_error:  28667136586.9
digitsQuantity:  11


In [26]:
predictionDTR = DTGS.predict(data)
final = saveFinalDF(predictionDTR, ids)

In [27]:
final
#3632: $410000

Unnamed: 0_level_0,price_usd
id,Unnamed: 1_level_1
3632,1300000.0
3633,3600000.0
2263404,2600000.0
2263405,2600000.0
2263406,2600000.0
2263407,2600000.0
2263408,2600000.0
2263409,2600000.0
2263410,2600000.0
2263411,475000.0


# Random Forest Regressor

In [21]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [22]:
columns = list(predictDf.columns.values)
maxFeatures = len(columns)
params = {
    'max_depth': [5, 15, 25, 35, 45, 55, 67, 76, 85, 95, 100],
    #'min_samples_leaf': [3, 5, 9],
    'max_features': [i for i in range(1, maxFeatures)]
}
RFGS = GridSearchCV(RandomForestRegressor(), params, cv=3, n_jobs=1, verbose=1, scoring=None, refit=True)

In [23]:
fitAndGetInfo(RFGS, xTrain, yTrain, xTest, yTest)

Fitting 3 folds for each of 77 candidates, totalling 231 fits


[Parallel(n_jobs=1)]: Done 231 out of 231 | elapsed:  8.9min finished


precision test:  0.818439838578
best params:  {'max_features': 2, 'max_depth': 76}
best score:  0.919450234064
best estimator:  RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=76,
           max_features=2, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)
mean_squared_error:  28327244281.6
digitsQuantity:  11


In [24]:
RFPrediction = RFGS.predict(data)
final = saveFinalDF(RFPrediction, ids)

In [25]:
final

Unnamed: 0_level_0,price_usd
id,Unnamed: 1_level_1
3632,2104800.0
3633,999000.0
2263404,1777500.0
2263405,1910000.0
2263406,1910000.0
2263407,1910000.0
2263408,1910000.0
2263409,1910000.0
2263410,1565000.0
2263411,670900.0
