In [179]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error

In [180]:
#---------------------------------------------------------
# DELETE TRASH COLUMNS
#---------------------------------------------------------
def deleteTrashColumns(df):
    columns = list(df.columns.values)
    for column in columns:
        if "unnamed" in column.lower():
            df.drop(column, axis = 1, inplace = True) 
    return df
#---------------------------------------------------------
# PROCESS TRAIN SET
#---------------------------------------------------------
def ProcessTrainSet(df, withPrice):
    df.dropna(axis=0, how='any', subset=list(df.columns.values), inplace=True)
    
    target = df['price_aprox_usd'].tolist()
    if withPrice:
        df.drop('price_aprox_usd', axis = 1, inplace = True)
    
    columns = list(df.columns.values)
    data = list(df.values)
    return data, target
#---------------------------------------------------------
# GET DATA AS LIST
#---------------------------------------------------------
def GetDataAsList(df):
    columns = list(df.columns.values)
    data = list(df.values)
    return data
#---------------------------------------------------------
# PRINT SCORE
#---------------------------------------------------------
def printScore(prediction, target):
    err = mean_squared_error(prediction, target)
    print "mean_squared_error: ", err
    digitsQuantity = len(str(err).split('.')[0])
    print "digitsQuantity: ", digitsQuantity
#---------------------------------------------------------
# SAVE FINAL DF
#---------------------------------------------------------
def saveFinalDF(predictions, ids):
    aData = {'price_usd': predictions}
    final = pd.DataFrame(data = aData, index=ids)
    final.to_csv("properati_dataset_sample_submision.csv", \
    index = True, header = True, sep = ',', encoding = 'utf-8-sig')
    return final

# Set de entrenamiento

In [181]:
df = pd.read_csv("dataTrain.csv", low_memory = False)
df = deleteTrashColumns(df)
dataTrain, targetTrain = ProcessTrainSet(df, True)

In [182]:
columns = list(df.columns.values)
print "data size: ", len(dataTrain)
print "target size: ", len(targetTrain)
print "columns size: ", len(columns)

data size:  248616
target size:  248616
columns size:  22


# Set de test

In [183]:
predictDf = pd.read_csv("dataTest.csv", low_memory = False)
ids = predictDf['id']
predictDf.drop('id', axis = 1, inplace = True)
predictDf = deleteTrashColumns(predictDf)
data = GetDataAsList(predictDf)

In [184]:
columns = list(predictDf.columns.values)
print "data size: ", len(data)
print "df size: ", len(predictDf.index)
print "columns size: ", len(columns)
#row should be 14166

data size:  14166
df size:  14166
columns size:  22


# Split

In [185]:
xTrain, xTest, yTrain, yTest = train_test_split(dataTrain, targetTrain, test_size=0.5, random_state=0)

# Prediccion 

# Linear Regression

In [50]:
from sklearn.linear_model import LinearRegression

In [13]:
lr = LinearRegression(normalize = True)
lr.fit(xTrain, yTrain)
print "train score: ", lr.score(xTrain, yTrain)
yPredictionTest = lr.predict(xTest)

train score:  0.21847958081


In [14]:
printScore(yPredictionTest, yTest)

mean_squared_error:  107624412685.0
digitsQuantity:  12


In [15]:
lrPrediciton = lr.predict(data)
final = saveFinalDF(lrPrediciton, ids)

In [16]:
final.head()

Unnamed: 0_level_0,price_usd
id,Unnamed: 1_level_1
3632,88724590.0
3633,1000604000.0
2263404,63869230.0
2263405,118026000.0
2263406,118003900.0


# Transformacion no lineal a lineal

#  (Este metodo da precios demasiado altos y es peor que el lineal hecho en el paso anterior)

Si tomamos como ejemplo una función f que toma la forma :  f(x) = a + bx + cx²

La función f es no lineal en función de x pero si es lineal en función de los parámetros desconocidos a, b, y c. O visto de otra manera: podemos sustituir nuestras variables x por un array z tal que: z = [1, x, x²]. Con el que podríamos reescribir nuestra función f como f(z) = a z0 + bz1 + c*z2

Scikit-learn tiene un objeto PolynomialFeatures que nos va a servir para convertir nuestra variable x en un array z del tipo z = [1, x, x2, …, n^n], que es lo que nos interesa.

El resultado de esa transformación se la pasamos a nuestro modelo Ridge. Para facilitar la tarea en este tipo de casos —donde se realizan varios pasos que van desde el pre-tratamiento de los datos hasta un posible post-tratamiento pasando por el entrenamiento—, podemos hacer uso de las Pipeline que nos permiten encadenar multiples estimadores en uno. Esto es especialmente útil cuando hay secuencia de pasos predefinidos en el procesado de datos con, por ejemplo, selección de atributos, normalización y clasificación.

In [186]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

In [187]:
poly = PolynomialFeatures(degree = 2)
z = poly.fit_transform(xTrain)
lr = LinearRegression(normalize=True)
lr.fit(z, yTrain)
lr.score(z, yTrain)

0.47584163502095844

In [188]:
zTest = poly.fit_transform(xTest)
predictionsSamplePlynomial = lr.predict(zTest)
print "precision sample: ", lr.score(zTest, yTest)

precision sample:  -5283.84172093


In [189]:
DataTransformed = poly.fit_transform(data)
predictions = lr.predict(DataTransformed)
final = saveFinalDF(predictions, ids)

In [190]:
final.head()

Unnamed: 0_level_0,price_usd
id,Unnamed: 1_level_1
3632,-1.30915e+22
3633,-2.594522e+23
2263404,-1.63558e+22
2263405,-1.460626e+22
2263406,-1.452099e+22


# Decicion Tree

In [191]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

In [202]:
columns = list(predictDf.columns.values)
maxFeatures = len(columns)
params = {
    'max_depth': [i for i in range(5, 100)],
    #'criterion': ["mse", "friedman_mse", "mae"]
    'max_features': [i for i in range(1, maxFeatures)]
}
gs = GridSearchCV(DecisionTreeRegressor(), params, cv=3, n_jobs=1, verbose=1, scoring=None, refit=True)

In [None]:
gs.fit(xTrain, yTrain)
yTestPrediction = gs.predict(xTest)
print "precision test: ", gs.score(xTest, yTest)
print "best params: ", gs.best_params_
print "best score: ", gs.best_score_ 
print "best estimator: ", gs.best_estimator_

Fitting 3 folds for each of 1995 candidates, totalling 5985 fits


In [199]:
printScore(yTestPrediction, yTest)

mean_squared_error:  29804354126.8
digitsQuantity:  11


In [200]:
predictionDTR = gs.predict(data)
final = saveFinalDF(predictionDTR, ids)

In [201]:
final
#3632: $410000

Unnamed: 0_level_0,price_usd
id,Unnamed: 1_level_1
3632,76000.0
3633,76000.0
2263404,76000.0
2263405,76000.0
2263406,76000.0
2263407,76000.0
2263408,76000.0
2263409,76000.0
2263410,76000.0
2263411,76000.0


# Random Forest Regressor

In [78]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV

In [79]:
params = {
    'max_depth': [4,6],
    'min_samples_leaf': [3, 5, 9],
    'max_features': [1.0, 0.3, 0.1]
}
gs = GridSearchCV(RandomForestRegressor(), params, cv=3, n_jobs=1, verbose=1, scoring=None, refit=True)

In [80]:
gs.fit(xTrain, yTrain)
print "score train: ", gs.score(xTrain, yTrain)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed:   57.6s finished


score train:  0.730790922777


In [81]:
yTestPredictionRF = gs.predict(xTest)
print "score test: ", gs.score(xTest, yTest)

score test:  0.634321865073


In [82]:
printScore(yTestPredictionRF, yTest)

mean_squared_error:  57053561615.0
digitsQuantity:  11


In [83]:
RFPrediction = gs.predict(data)
final = saveFinalDF(RFPrediction, ids)

In [84]:
final.head()

Unnamed: 0_level_0,price_usd
id,Unnamed: 1_level_1
3632,2748631.0
3633,2326602.0
2263404,1693606.0
2263405,2748631.0
2263406,2748631.0
