In [68]:
from os import listdir

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

%matplotlib inline

In [69]:
#---------------------------------------------------------
# DELETE TRASH COLUMNS
#---------------------------------------------------------
def deleteTrashColumns(df):
    """Drop, in place, every column whose label contains "unnamed".

    The match is case-insensitive. Labels that are not strings (for
    example integer column labels) are left untouched instead of raising
    AttributeError on ``.lower()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, to allow ``df = deleteTrashColumns(df)``.
    """
    trash = [
        column for column in df.columns
        # Only string-like labels can carry the "Unnamed: N" marker.
        if hasattr(column, "lower") and "unnamed" in column.lower()
    ]
    if trash:
        # One drop call for all matches instead of one per column.
        df.drop(trash, axis=1, inplace=True)
    return df
#---------------------------------------------------------
# PROCESS TRAIN SET
#---------------------------------------------------------
def ProcessTrainSet(df, withPrice):
    """Split a training frame into feature rows and price targets.

    Side effects: ``df`` is modified in place. Rows containing any missing
    value are removed, and when ``withPrice`` is true the
    ``price_aprox_usd`` column is removed as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data. Must contain a ``price_aprox_usd`` column; it is
        read regardless of ``withPrice``.
    withPrice : bool
        When true, the price column is dropped from ``df`` so it does not
        appear among the returned feature rows. When false, the price
        stays in the feature rows.

    Returns
    -------
    (list, list)
        ``data``: one numpy array per remaining row of ``df``.
        ``target``: the ``price_aprox_usd`` values of those same rows.
    """
    # With no `subset`, dropna already considers every column.
    df.dropna(axis=0, how='any', inplace=True)

    # Read the target after dropping rows so it stays aligned with `data`.
    target = df['price_aprox_usd'].tolist()
    if withPrice:
        df.drop('price_aprox_usd', axis=1, inplace=True)

    data = list(df.values)
    return data, target
#---------------------------------------------------------
# GET DATA AS LIST
#---------------------------------------------------------
def GetDataAsList(df):
    """Return the rows of ``df`` as a list of numpy arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is not modified.

    Returns
    -------
    list
        One array per row, in row order; empty when ``df`` has no rows.
    """
    return list(df.values)
#---------------------------------------------------------
# SAVE FINAL DF
#---------------------------------------------------------
def saveFinalDF(predictions, ids, path="properati_dataset_sample_submision.csv"):
    """Build the submission table and write it to a CSV file.

    Parameters
    ----------
    predictions : sequence
        Predicted prices; stored in the ``price_usd`` column.
    ids : sequence or pandas.Series
        Row identifiers; used as the frame's index and written as the
        first CSV column.
    path : str, optional
        Destination file. Defaults to the file name that used to be
        hard-coded, so existing two-argument calls write the same file.
        Pass a different name to keep one model's submission from
        overwriting another's.

    Returns
    -------
    pandas.DataFrame
        The frame that was written (index = ``ids``, one ``price_usd`` column).
    """
    final = pd.DataFrame(data={'price_usd': predictions}, index=ids)
    final.to_csv(path, index=True, header=True, sep=',', encoding='utf-8-sig')
    return final

# Set de entrenamiento

In [70]:
# Load the training CSV and strip the "Unnamed" columns in one step.
df = deleteTrashColumns(pd.read_csv("dataTrain.csv", low_memory=False))
# ProcessTrainSet mutates `df`: rows with missing values and the price
# column are removed, so `df` afterwards holds only the feature columns.
dataTrain, targetTrain = ProcessTrainSet(df, True)

In [71]:
# Report how many feature rows and targets were produced, and how many
# columns are left in `df` after ProcessTrainSet modified it in place.
columns = list(df.columns.values)
print "data size: ", len(dataTrain)
print "target size: ", len(targetTrain)
print "columns size: ", len(columns)

data size:  770550
target size:  770550
columns size:  12


# Set de test

In [72]:
# Load the rows to predict. The id column is not a feature: pop() removes
# it from the frame and hands it back so it can index the submission.
predictDf = pd.read_csv("dataTest.csv", low_memory=False)
ids = predictDf.pop('id')
predictDf = deleteTrashColumns(predictDf)
data = GetDataAsList(predictDf)

In [73]:
# Sanity-check the prediction set: row count taken from the list and from
# the frame index, plus the number of feature columns.
columns = list(predictDf.columns.values)
print "data size: ", len(data)
print "df size: ", len(predictDf.index)
print "columns size: ", len(columns)
# Expected row count: 14166

data size:  14166
df size:  14166
columns size:  12


# Split

In [74]:
# Hold out half of the labelled rows (test_size=0.5) for evaluation;
# random_state=0 seeds the split.
xTrain, xTest, yTrain, yTest = train_test_split(dataTrain, targetTrain, test_size=0.5, random_state=0)

# Predicción

# Linear Regression

In [None]:
lr = LinearRegression(normalize = True)
lr.fit(xTrain, yTrain)
print "train score: ", lr.score(xTrain, yTrain)
yPredictionTest = lr.prediction(xTest)

In [None]:
err = mean_squared_error(yPredictionTest, yTest)
print "mean_squared_error: ", err
digitsQuantity = len(str(err).split('.')[0])
print "digitsQuantity: ", digitsQuantity

In [None]:
# Predict prices for the prediction set with the linear model and write
# them out. saveFinalDF writes to a fixed CSV file name, so this file is
# overwritten by every later call.
lrPrediciton = lr.predict(data)
final = saveFinalDF(lrPrediciton, ids)

# Transformación no lineal a lineal

#  (Este método da precios demasiado altos y es peor que el lineal hecho en el paso anterior)

Si tomamos como ejemplo una función $f$ de la forma: $f(x) = a + bx + cx^2$

La función $f$ no es lineal en $x$, pero sí es lineal en los parámetros desconocidos $a$, $b$ y $c$. Visto de otra manera: podemos sustituir nuestra variable $x$ por un array $z = [1, x, x^2]$, con el que podemos reescribir nuestra función como $f(z) = a z_0 + b z_1 + c z_2$.

Scikit-learn tiene un objeto PolynomialFeatures que nos va a servir para convertir nuestra variable $x$ en un array $z$ del tipo $z = [1, x, x^2, \dots, x^n]$, que es lo que nos interesa.

El resultado de esa transformación se lo pasamos a nuestro modelo Ridge. Para facilitar la tarea en este tipo de casos —donde se realizan varios pasos que van desde el pre-tratamiento de los datos hasta un posible post-tratamiento, pasando por el entrenamiento—, podemos hacer uso de las Pipeline, que nos permiten encadenar múltiples estimadores en uno. Esto es especialmente útil cuando hay una secuencia de pasos predefinidos en el procesado de datos con, por ejemplo, selección de atributos, normalización y clasificación. (Nota: las celdas siguientes usan LinearRegression en lugar de Ridge y no construyen una Pipeline.)

In [1]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

In [None]:
# Expand the training features into degree-2 polynomial terms and fit a
# linear model on the expanded features.
poly = PolynomialFeatures(degree = 2)
z = poly.fit_transform(xTrain)
# NOTE: this rebinds `lr`, replacing the plain linear model fitted earlier.
lr = LinearRegression(normalize=True)
lr.fit(z, yTrain)
# Last expression of the cell: the model's score (R^2) on the training data.
lr.score(z, yTrain)

In [None]:
zTest = poly.fit_transform(xTest)
predictionsSamplePlynomial = lr.predict(zTest)
print "precision sample: ", lr.score(zTest, yTest)

In [None]:
# Apply the polynomial expansion learned on the training data to the
# prediction set (transform only; the transformer is not re-fitted).
DataTransformed = poly.transform(data)
predictions = lr.predict(DataTransformed)
# Writes the submission CSV and keeps the written frame for inspection.
final = saveFinalDF(predictions, ids)

In [None]:
# Preview the first rows of the submission frame returned by saveFinalDF.
final.head()

# Decision Tree

In [75]:
from sklearn.model_selection import cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor

In [84]:
# Hyper-parameter grid for the tree: every max_depth from 50 to 99.
params = {
    'max_depth': list(range(50, 100)),
    # 'criterion' could also be searched: ["mse", "friedman_mse", "mae"]
}
# random_state is fixed so repeated runs build the same trees; without it
# the estimator's internal feature permutation can change the result
# between runs. The value 0 matches the seed used for train_test_split.
gs = GridSearchCV(DecisionTreeRegressor(random_state=0), params, cv=3, n_jobs=1, verbose=1, scoring=None, refit=True)

In [77]:
# Run the grid search on the training half. The search was created with
# refit=True, so predict/score below use the refitted best estimator.
gs.fit(xTrain, yTrain)
yTestPrediction = gs.predict(xTest)
# NOTE(review): with scoring=None this is presumably the estimator's own
# score (R^2 for a regressor), not a precision metric — confirm.
print "precision test: ", gs.score(xTest, yTest)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed: 10.6min finished


precision test:  0.763251522169


In [78]:
err = mean_squared_error(yTestPrediction, yTest)
print "mean_squared_error: ", err
digitsQuantity = len(str(err).split('.')[0])
print "digitsQuantity: ", digitsQuantity

mean_squared_error:  31339148103.2
digitsQuantity:  11


In [79]:
# Predict prices for the prediction set with the tuned tree and write
# them out. saveFinalDF writes to a fixed CSV file name, so this replaces
# any submission written by an earlier model.
predictionDTR = gs.predict(data)
final = saveFinalDF(predictionDTR, ids)

In [80]:
# Display the submission frame produced by the decision tree.
final
# NOTE(review): "3632: $410000" is presumably the known/expected price for
# id 3632, kept here to compare against the predicted value — confirm.

Unnamed: 0_level_0,price_usd
id,Unnamed: 1_level_1
3632,1780000.0
3633,3200000.0
2263404,1150000.0
2263405,1780000.0
2263406,1780000.0
2263407,1780000.0
2263408,1780000.0
2263409,1780000.0
2263410,1780000.0
2263411,1780000.0
