In [1]:
import pandas as pd
import numpy as np
from os import listdir
from sklearn import datasets
from sklearn.linear_model import LinearRegression
%matplotlib inline
import matplotlib.pyplot as plt

# Preproceso del set de entrenamiento

# Funciones del uso de informacion del campo de descripcion

In [2]:
def isFloat(x):
    """Tell whether ``x`` can be converted to a float.

    Args:
        x: Any value; typically a word taken from a description.

    Returns:
        True when ``float(x)`` succeeds, False otherwise.
    """
    try:
        float(x)
        return True
    except (TypeError, ValueError):
        # ValueError: text that is not a number ("abc").
        # TypeError: objects float() cannot handle at all (None, lists, ...).
        return False

In [3]:
# Integer stand-ins for booleans. `true` is the value stored for a detected
# feature in the dictionaries built by crear_diccionario_descripcion; `false`
# is not referenced by the code visible in this part of the notebook.
false = 0
true = 1
# ------------------------------------------------------------------------------
# INICIALIZAR DICCIONARIO
# ------------------------------------------------------------------------------
# pre: Recibe una lista de claves
# pos: devuelve un diccionario de esas claves recibidas inicializadas en cero

def inicializar_diccionario(keys):
    """Build a dictionary that maps every element of ``keys`` to 0.

    Args:
        keys: Iterable of hashable keys.

    Returns:
        A new dict with one zero-valued entry per key.
    """
    return {key: 0 for key in keys}

#------------------------------------------------------
# GET SURFACE
#------------------------------------------------------
def getSurface(df):
    """Return the surface column, filling gaps from the description text.

    For every row the description is split into words and scanned with
    findSurface, which accumulates the surface it detects. The accumulated
    value is then passed through validateSurface, which only uses it when the
    stored surface is NaN.

    Args:
        df: DataFrame with a 'surface_total_in_m2' column and, optionally, a
            'description' column.

    Returns:
        A list with one surface value per row of ``df``.
    """
    surfaces = df['surface_total_in_m2'].tolist()
    if 'description' not in df:
        return surfaces
    descriptions = df['description'].tolist()
    for i, description in enumerate(descriptions):
        if type(description) != type(""):
            # No usable text: treat it like a description in which nothing was
            # found. The result has to be stored; the bare call did nothing.
            surfaces[i] = validateSurface(surfaces[i], 0)
            continue
        words = description.split()
        surfaceCalculated = 0
        pos = 0
        # A while loop is required so that the skip reported by findSurface is
        # honoured; reassigning the variable of a `for ... in range` loop has
        # no effect on the next iteration.
        while pos < len(words):
            surfaceCalculated, offset, iFound = findSurface(words, pos, surfaceCalculated)
            if iFound:
                pos += offset
            pos += 1
        surfaces[i] = validateSurface(surfaces[i], surfaceCalculated)
    return surfaces

# ------------------------------------------------------------------------------
# VALIDATE SURFACE
# ------------------------------------------------------------------------------
def validateSurface(surface, surfaceCalculated):
    """Choose between the stored surface and the one derived from the text.

    Args:
        surface: Surface stored in the data set (may be NaN).
        surfaceCalculated: Surface derived from the description.

    Returns:
        ``surfaceCalculated`` when ``surface`` is NaN, otherwise ``surface``.
    """
    return surfaceCalculated if np.isnan(surface) else surface
    
# ------------------------------------------------------------------------------
# ENCONTRAR SUPERFICIE
# ------------------------------------------------------------------------------
def findSurface(words, i, surface):
    """Detect an "<a> x <b>" measurement starting at ``words[i]``.

    Args:
        words: List of words of a description.
        i: Position in ``words`` to inspect.
        surface: Surface accumulated so far.

    Returns:
        A triple ``(surface, offset, iFound)``. When ``words[i]``,
        ``words[i+1]`` and ``words[i+2]`` read "<number> x <number>" the
        product is added to ``surface``, ``offset`` is 2 (the extra words the
        pattern consumed) and ``iFound`` is True. Otherwise ``surface`` is
        returned unchanged with ``offset`` 0 and ``iFound`` False.
    """
    size = len(words)
    matches = (
        (i + 2 < size)
        and isFloat(words[i])
        and words[i + 1].lower() == "x"
        and isFloat(words[i + 2])
    )
    if matches:
        # Report the match; previously offset/iFound were never updated, so a
        # caller could not tell a hit from a miss.
        return surface + float(words[i]) * float(words[i + 2]), 2, True
    return surface, 0, False

# ------------------------------------------------------------------------------
# ENCONTRAR CANTIDAD DE AMBIENTES
# ------------------------------------------------------------------------------
def FindsNumberOfRooms(words, i, numberOfRooms):
    """Update the room count with the information found at ``words[i]``.

    Args:
        words: List of words of a description.
        i: Position in ``words`` to inspect.
        numberOfRooms: Rooms counted so far.

    Returns:
        A pair ``(numberOfRooms, end)``. ``end`` is True only when an explicit
        "<n> ambientes" was found: that figure replaces the running count and
        the caller should stop scanning. In every other case ``end`` is False
        so the scan continues with the next word.
    """
    rooms = [
        "living",
        "comedor",
        "cocina",
        "lavadero",
        "dormitorio",
        "vestidor",
        "baño"
    ]
    word = words[i]
    following = words[i + 1].lower() if i + 1 < len(words) else None
    if word.isdigit():
        if following == "ambientes":
            # Explicit total: authoritative, stop scanning.
            return int(word), True
        if following == "dormitorios":
            return numberOfRooms + int(word), False
    # `.lower()` must be called; comparing the bound method itself against the
    # list never matched anything.
    if word.lower() in rooms:
        return numberOfRooms + 1, False
    return numberOfRooms, False

#------------------------------------------------------
# VALIDATE ROOM
#------------------------------------------------------
def validateRoom(numberOfRooms, numberOfRoomsCalculated):
    """Choose between the stored room count and the one derived from the text.

    Args:
        numberOfRooms: Room count stored in the data set (may be NaN).
        numberOfRoomsCalculated: Room count derived from the description.

    Returns:
        ``numberOfRoomsCalculated`` when ``numberOfRooms`` is NaN, otherwise
        ``numberOfRooms``.
    """
    return numberOfRoomsCalculated if np.isnan(numberOfRooms) else numberOfRooms
    
#------------------------------------------------------
# GET ROOMS
#------------------------------------------------------
def getRooms(df):
    """Return the rooms column, filling gaps from the description text.

    Each description is scanned word by word with FindsNumberOfRooms until it
    signals the end of the scan; the resulting count goes through
    validateRoom, which only uses it when the stored value is NaN.

    Args:
        df: DataFrame with a 'rooms' column and, optionally, a 'description'
            column.

    Returns:
        A list with one room count per row of ``df``.
    """
    rooms = df['rooms'].tolist()
    if 'description' not in df:
        return rooms
    descriptions = df['description'].tolist()
    for i, description in enumerate(descriptions):
        if type(description) != type(""):
            # No usable text: same outcome as a description in which nothing
            # was found. The result must be stored; the bare call did nothing.
            rooms[i] = validateRoom(rooms[i], 0)
            continue
        numberOfRooms = 0
        words = description.split()
        for pos in range(0, len(words)):
            numberOfRooms, end = FindsNumberOfRooms(words, pos, numberOfRooms)
            if end:
                break
        rooms[i] = validateRoom(rooms[i], numberOfRooms)
    return rooms
# ------------------------------------------------------------------------------
# ENCONTRAR FRASE
# ------------------------------------------------------------------------------
# pre: Recibe un vector de palabras, la posición en ese vector que se
# está leyendo, y un vector de frases
# pos: Devuelve una tripla con: un booleano que indica si alguna de las frases
# del vector "phrases" aparece a partir de esa posición, el offset que debe
# desplazarse la posición de lectura del vector "words" y el índice en el que
# se encuentra la frase hallada dentro del vector "phrases"

def encontrar_frase(words, i, phrases):
    """Check whether one of ``phrases`` starts at ``words[i]``.

    The comparison is done word by word and ignores case. Every phrase is
    tried in order; the first one that matches completely wins.

    Args:
        words: List of words of a description.
        i: Position in ``words`` where the phrase should start.
        phrases: List of phrases (words separated by whitespace).

    Returns:
        A triple ``(found, offset, index)``: ``found`` tells whether a phrase
        matched, ``offset`` is the number of words of the matched phrase and
        ``index`` its position in ``phrases``. Without a match the triple is
        ``(False, 0, 0)``.
    """
    for index, phrase in enumerate(phrases):
        phrase_split = phrase.lower().split()
        size = len(phrase_split)
        if size == 0:
            continue
        # Slicing never raises; a window shorter than the phrase (description
        # ends too early) simply compares unequal.
        window = [word.lower() for word in words[i:i + size]]
        if window == phrase_split:
            return True, size, index
    return False, 0, 0


# ------------------------------------------------------------------------------
# CREAR DICCIONARIO DESCRIPCION
# ------------------------------------------------------------------------------
# pre: Recibe un dataframe
# pos: Devuelve una lista de diccionarios

def crear_diccionario_descripcion(df):
    """Build one feature dictionary per row from the description text.

    Every dictionary has one key per entry of ``characteristics`` and
    ``phrases``, initialised to 0 and set to ``true`` when the word or phrase
    shows up in the row's description.

    Args:
        df: DataFrame; the 'description' column is optional.

    Returns:
        A list of dictionaries, one per row of ``df``. Rows without a string
        description (or a frame without the column) get all-zero dictionaries.
    """
    characteristics = [
        "living",
        "cochera",
        "comedor",
        "pileta",
        "piscina"
    ]
    phrases = [
        "cancha de tenis",
        "club house",
        "sector de juegos infantiles",
        "futbol 5",
        "seguridad las 24 hs"
    ]
    size = len(df.index)
    has_description = 'description' in df
    # Materialise the column once instead of once per row.
    descriptions = df['description'].tolist() if has_description else []
    dicc_list = []
    for i in range(0, size):
        dicc = inicializar_diccionario(characteristics + phrases)
        if not has_description or type(descriptions[i]) != type(""):
            dicc_list.append(dicc)
            continue
        words = descriptions[i].split()
        for j, word in enumerate(words):
            (wordBelongs, _offset, index) = encontrar_frase(words, j, phrases)
            if wordBelongs:
                # Record the phrase right away; bailing out first lost phrases
                # that end on the last word of the description.
                dicc[phrases[index]] = true
            # Every word is checked on its own position, so no skipping is
            # needed (reassigning a `for` variable would not skip anyway).
            lowered = word.lower()
            if lowered in characteristics:
                dicc[lowered] = true
        dicc_list.append(dicc)
    return dicc_list

# ------------------------------------------------------------------------------
#
# ------------------------------------------------------------------------------

# Funciones de filtrado del data set

In [4]:
#----------------------------------------------------------------------
# Filtro de propiedades con precio calculable. Devuelve 1 si es válido. 
# De lo contrario, nan (Not A Number)
#----------------------------------------------------------------------
def filterPercentage(array):
    """Keep rows whose price agrees with price-per-m2 times surface.

    Args:
        array: Triple ``(price in USD, USD per m2, total surface)``.

    Returns:
        1 when the price is positive and differs from ``usdM2 * surfaceTotal``
        by at most 10 percent of the price; NaN otherwise.
    """
    priceUSD, usdM2, surfaceTotal = array
    if priceUSD <= 0:
        return np.nan
    deviation = abs(usdM2 * surfaceTotal - priceUSD)
    within_tolerance = ((deviation / priceUSD) * 100) <= 10
    return 1 if within_tolerance else np.nan
#----------------------------------------------------------------------
# Filtro de propiedades recuperables. Devuelve 1 si al menos dos de los tres
# valores (precio, precio por m2, superficie) no son nan. De lo contrario, nan
#----------------------------------------------------------------------
def filterImposibles(array):
    """Keep rows from which the missing value can still be derived.

    The accepted combinations are: price and price-per-m2 both present (any
    surface), or the surface present together with exactly one of the other
    two. That is the same as requiring at least two non-NaN values.

    Args:
        array: Triple ``(price in USD, USD per m2, total surface)``.

    Returns:
        1 when at least two of the three values are not NaN; NaN otherwise.
    """
    priceUSD, usdM2, surfaceTotal = array
    known = 0
    for value in (priceUSD, usdM2, surfaceTotal):
        if not np.isnan(value):
            known += 1
    return 1 if known >= 2 else np.nan
#----------------------------------------------------------------------
# Cálculo del precio aproximado de venta
#----------------------------------------------------------------------
def fillPrice(array):
    """Return the sale price, deriving it when it is missing.

    Args:
        array: Triple ``(price in USD, USD per m2, total surface)``.

    Returns:
        ``usdM2 * surfaceTotal`` when the price is NaN and the price per m2 is
        not; the original price in every other case.
    """
    priceUSD, usdM2, surfaceTotal = array
    if not np.isnan(priceUSD) or np.isnan(usdM2):
        return priceUSD
    return usdM2 * surfaceTotal
#----------------------------------------------------------------------
# Cálculo del precio del metro cuadrado
#----------------------------------------------------------------------
def fillM2(array):
    """Return the price per square metre, deriving it when it is missing.

    Args:
        array: Triple ``(price in USD, USD per m2, total surface)``.

    Returns:
        NaN when the surface is zero or negative; ``priceUSD / surfaceTotal``
        when the price is known and the price per m2 is NaN; the original
        price per m2 otherwise.
    """
    priceUSD, usdM2, surfaceTotal = array
    if surfaceTotal <= 0:
        return np.nan
    derivable = np.isnan(usdM2) and not np.isnan(priceUSD)
    return (priceUSD / surfaceTotal) if derivable else usdM2
#----------------------------------------------------------------------
# Cálculo de la superficie
#----------------------------------------------------------------------
def fillSurface(array):
    """Return the total surface, deriving it when it is missing.

    Args:
        array: Triple ``(price in USD, USD per m2, total surface)``.

    Returns:
        ``priceUSD / usdM2`` when the surface is missing (NaN, zero or
        negative) and both the price and a positive price per m2 are known;
        the original surface otherwise.
    """
    priceUSD, usdM2, surfaceTotal = array
    surface_missing = np.isnan(surfaceTotal) or surfaceTotal <= 0
    # The division is only meaningful with a known, positive price per m2.
    # The previous condition divided exactly when usdM2 was NaN, which turned
    # known surfaces into NaN.
    can_derive = (
        not np.isnan(priceUSD)
        and not np.isnan(usdM2)
        and usdM2 > 0
    )
    if surface_missing and can_derive:
        return (priceUSD / usdM2)
    return surfaceTotal
#----------------------------------------------------------------------
# Obtenemos el año y mes del nombre de archivo
#----------------------------------------------------------------------
def addDate(date, df):
    """Add a 'date' column holding the year and month taken from a file name.

    Args:
        date: File name made of '-' separated fields whose third field is the
            year and fourth field the month, e.g.
            'properati-AR-2017-05-01-properties-sell.csv'.
        df: DataFrame to annotate; it is modified in place.

    Returns:
        The same DataFrame with a datetime 'date' column (NaT when the
        extracted text is not a valid date).
    """
    # Use the parameter; the previous code read a global named `archive`.
    date_splitted = date.split('-')
    year = date_splitted[2]
    month = date_splitted[3]
    # A scalar is broadcast to every row whatever the index looks like. A
    # Series indexed 0..n-1 would be aligned by label and leave NaN on frames
    # whose index was filtered beforehand.
    df['date'] = pd.to_datetime(year + '-' + month, errors = 'coerce')
    return df
#----------------------------------------------------------------------
# Durante la carga de datos, se eliminan ciertas columnas que nos 
#resultan irrelevantes para el trabajo
#----------------------------------------------------------------------
def filterUnnecesaryColumns(df, isId):
    """Drop, in place, the columns that are not used by the analysis.

    Args:
        df: DataFrame to clean; it is modified in place.
        isId: When falsy the 'id' column is dropped as well.

    Returns:
        The same DataFrame without the unwanted columns. 'price' is only
        removed when 'price_aprox_usd' is also present.
    """
    always_dropped = [
        'surface_covered_in_m2',
        'country_name',
        'price_aprox_local_currency',
        'expenses',
        'properati_url',
        'extra',
        'geonames_id',
        'image_thumbnail',
        'operation',
        'created_on',
        'lat-lon',
        'currency',
        'title'
    ]
    doomed = [name for name in always_dropped if name in df]
    if not isId and 'id' in df:
        doomed.append('id')
    if 'price_aprox_usd' in df and 'price' in df:
        doomed.append('price')
    df.drop(doomed, axis = 1, inplace = True)
    return df
#----------------------------------------------------------------------
# ADD DESCRIPTIONS COLUMNS
#----------------------------------------------------------------------
def addDescriptionColumns(df, columDict):
    """Expand a column of dictionaries into one column per dictionary key.

    The keys are taken from the dictionary of the first row; every row is
    expected to provide all of them.

    Args:
        df: DataFrame to extend; it is modified in place.
        columDict: Name of the column that holds the dictionaries.

    Returns:
        The same DataFrame with the new columns. An empty frame is returned
        untouched.
    """
    description = list(df[columDict])
    if not description:
        # Nothing to expand; indexing the first row would raise IndexError.
        return df
    for key in description[0].keys():
        df[key] = [entry[key] for entry in description]
    return df
#----------------------------------------------------------------------
# CHANGE PLACE WITH PARENT NAMES COLUMN TO NUMBER
#----------------------------------------------------------------------
def changePlaceWithParentsNamesColumn(df):
    """Replace 'place_with_parent_names' by its numeric code, in place.

    Args:
        df: DataFrame with a 'place_with_parent_names' column.

    Returns:
        The same DataFrame with every value mapped through PlaceToNumber.
    """
    originalPlaces = df['place_with_parent_names'].tolist()
    df['place_with_parent_names'] = [PlaceToNumber(place) for place in originalPlaces]
    return df
#----------------------------------------------------------------------
# PLACE TO NUMBER
#----------------------------------------------------------------------
def PlaceToNumber(x):
    """Encode a place description as a number.

    Args:
        x: Any value; it is converted with ``str`` before being inspected.

    Returns:
        0 when the text contains 'Capital Federal', 1 otherwise.
    """
    CF, GBA = 0, 1
    return CF if 'Capital Federal' in str(x) else GBA
#----------------------------------------------------------------------
# CHANGE PROPERTY TYPE COLUMN TO NUMBER
#----------------------------------------------------------------------
def changePropertyTypeColumn(df):
    """Replace 'property_type' by its numeric code, in place.

    Args:
        df: DataFrame with a 'property_type' column.

    Returns:
        The same DataFrame with every value mapped through
        propertyTypeToNumber.
    """
    originalTypes = df['property_type'].tolist()
    df['property_type'] = [propertyTypeToNumber(kind) for kind in originalTypes]
    return df
#----------------------------------------------------------------------
# PROPERTY TYPE TO NUMBER
#----------------------------------------------------------------------
def propertyTypeToNumber(x):
    """Encode a property type name as a number.

    Args:
        x: Property type; matched case-insensitively.

    Returns:
        0 for 'ph', 1 for 'apartment'/'departamento', 2 for 'house'/'casa',
        3 for 'store'. None for any other text and for values that are not
        text at all (for instance NaN from a missing cell).
    """
    codes = {
        "ph": 0,
        "apartment": 1,
        "departamento": 1,
        "house": 2,
        "casa": 2,
        "store": 3
    }
    if not hasattr(x, 'lower'):
        # Missing cells arrive as float NaN, which has no .lower(); treat them
        # like an unknown type instead of raising AttributeError.
        return None
    return codes.get(x.lower())
#----------------------------------------------------------------------
# Devuelve cero si floor es nan; en caso contrario devuelve floor sin cambios
#----------------------------------------------------------------------
def aggFloor(floor):
    """Replace a missing floor number by 0.

    Args:
        floor: Floor number, possibly NaN.

    Returns:
        0 when ``floor`` is NaN, otherwise ``floor`` unchanged.
    """
    return 0 if np.isnan(floor) else floor

In [5]:
# Accumulators for the main filtering loop (the loop cell is currently
# disabled). `properties` collects the processed DataFrames that are
# concatenated afterwards.
properties = []
# Names of the files already handled; the loop checks this list to skip them.
archivesProceced = []

# Loop principal del filtrado de cada dataframe del set de entrenamiento

In [35]:
# NOTE(review): everything below is wrapped in one triple-quoted string, so the
# cell only evaluates a string literal and none of this code runs.
# Points to check before re-enabling it:
#   - `print archive` is Python 2 only syntax.
#   - `df.reindex(newIndex)` discards its result, so the index is not changed;
#     `indexAcum` is also advanced before `newIndex` is built.
#   - `except value:` names an undefined exception class.
#   - `addDate(archive, df)` passes the file name as the first argument.
"""# Ruta de la carpeta con los archivos de datos modificados
root = "./properties/"
indexAcum = 0
for archive in listdir(root):
    if ".csv" not in archive:
        continue
    if archive in archivesProceced:
        continue
    df = pd.read_csv(root + archive, low_memory = False)
    
    df = df.loc[df.place_with_parent_names.str.contains('Capital Federal') \
        | df.place_with_parent_names.str.contains('Bs.As. G.B.A.'), :]
    
    # Durante la carga de datos, se eliminan ciertas columnas que nos 
    # resultan irrelevantes para el trabajo.
    df = filterUnnecesaryColumns(df, False)

    # En algunos casos, es necesario renombrar algunas columnas
    if 'price_aprox_usd' not in df:
        df.rename(columns = {'price': 'price_aprox_usd'}, inplace = True)
    if 'surface_total_in_m2' not in df:
        df.rename(columns = {'surface_in_m2': 'surface_total_in_m2'}, \
            inplace = True)

    # Aquí reconvertimos algunas columnas a punto flotante
    df.loc[:, 'price_aprox_usd'] = df.loc[:, ['price_aprox_usd']]\
        .apply(lambda x: float(x), axis = 1)
    df.loc[:, 'price_usd_per_m2'] = df.loc[:, ['price_usd_per_m2']]\
        .apply(lambda x: float(x), axis = 1)

    # Obtenemos el año y mes del nombre de archivo
    df = addDate(archive, df)

    # Antes de filtrar me fijo si puedo recuperar la superficie del campo descripcion
    df['surface_total_in_m2'] = getSurface(df)
    
    # Aquí aplicamos el filtro antes declarado
    df['filter1'] = df.loc[:, ['price_aprox_usd', 'price_usd_per_m2', \
            'surface_total_in_m2']].apply(lambda x: filterImposibles(x), axis = 1)
    df = df[df['filter1'] == 1]
    df.drop('filter1', axis = 1, inplace = True)
    
    size = len(df.index)
    if size == 0:
        continue
    
    df.loc[:, ['price_aprox_usd']] = df.loc[:, ['price_aprox_usd', \
    'price_usd_per_m2', 'surface_total_in_m2']].apply(lambda x: fillPrice(x), axis = 1)
    
    df.loc[:, ['price_usd_per_m2']] = df.loc[:, ['price_aprox_usd', \
    'price_usd_per_m2', 'surface_total_in_m2']].apply(lambda x: fillM2(x), axis = 1)
    
    df.loc[:, ['surface_total_in_m2']] = df.loc[:, ['price_aprox_usd', \
    'price_usd_per_m2', 'surface_total_in_m2']].apply(lambda x: fillSurface(x), axis = 1)
    
    df['filter2'] = df.loc[:, ['price_aprox_usd', 'price_usd_per_m2', \
    'surface_total_in_m2']].apply(lambda x: filterPercentage(x), axis = 1)
    df = df[df['filter2'] == 1]
    df.drop('filter2', axis = 1, inplace = True)
    
    #-------------------------------------------------------------
    df.loc[:, ['place_with_parent_names']] = df.loc[:, ['place_with_parent_names']]\
    .apply(lambda x: PlaceToNumber(x), axis = 1)
    df = changePropertyTypeColumn(df)
    #df.loc[:, ['floor']] = df.loc[:, ['floor']].apply(lambda x: aggFloor(x), axis = 1)
    #-------------------------------------------------------------
    
    # Si el filtrado es tal que me quedo sin dataframe, 
    # entonces salto a la siguiente iteracion
    size = len(df.index)
    if size == 0:
        continue
    
    #Obtengo los campos de descripcion
    df['rooms'] = getRooms(df)
    df['description'] = crear_diccionario_descripcion(df)
    df = addDescriptionColumns(df, 'description')
    df.drop('description', axis = 1, inplace = True)
    
    # Finalmente, guardamos los archivos modificados.
    indexAcum += size
    newIndex = [i for i in range(indexAcum, indexAcum+size)]
    df.reindex(newIndex)
    print archive
    properties.append(df)
    archivesProceced.append(archive)
#-------------------------------------------------------------------
#-------------------------------------------------------------------
#Genero un nuevo csv con la concatenacion de todos ellos en uno solo
general = pd.concat(properties)

#Borro las columnas vacias
for column in general.columns.values:
    if 'unnamed' not in column.lower():
        continue
    general.drop(column, axis = 1, inplace = True)

general.loc[:, ['date']] = pd.to_datetime(general['date'], errors = 'coerce')

#Grabo la concatenacion en un unico csv
try:
    general.to_csv("propertiesConCat.csv", index = True, header = True, \
        sep = ',', encoding = 'utf-8-sig')
    print('Done')
except value:
    print('Error')"""

properati-AR-2017-05-01-properties-sell.csv
properati-AR-2014-01-01-properties-sell.csv
properati-AR-2013-08-01-properties-sell.csv
properati-AR-2015-04-01-properties-sell.csv
properati-AR-2014-02-01-properties-sell.csv
properati-AR-2013-09-01-properties-sell.csv
properati-AR-2016-09-01-properties-sell.csv
properati-AR-2017-01-01-properties-sell.csv
properati-AR-2013-10-01-properties-sell.csv
properati-AR-2015-05-01-properties-sell.csv
properati-AR-2013-12-01-properties-sell.csv
properati-AR-2014-05-01-properties-sell.csv
properati-AR-2014-11-01-properties-sell.csv
properati-AR-2014-09-01-properties-sell.csv
properati-AR-2015-02-01-properties-sell.csv
properati-AR-2016-08-01-properties-sell.csv
properati-AR-2016-01-01-properties-sell.csv
properati-AR-2016-11-01-properties-sell.csv
properati-AR-2015-08-01-properties-sell.csv
properati-AR-2017-03-01-properties-sell.csv
properati-AR-2014-12-01-properties-sell.csv
properati-AR-2015-12-01-properties-sell.csv
properati-AR-2016-06-01-properti

In [36]:
# Number of files handled so far. The call form of print produces the same
# output under Python 2 and Python 3; the statement form is a syntax error on 3.
print(len(archivesProceced))

49


# Agregado de datos de extra

In [7]:
#---------------------------------------------------------
# CHANGE STRING COMMA FOR POINT
#---------------------------------------------------------
def changeStringCommaForPoint(string):
    """Replace every comma of ``string`` by a point.

    Args:
        string: Text such as a number written with a decimal comma.

    Returns:
        A new string in which each ',' has become '.'.
    """
    return string.replace(',', '.')
#---------------------------------------------------------
# CHANGE STRING LIST COMMA FOR POINT
#---------------------------------------------------------
def changeStringListCommaForPoint(stringList):
    """Apply changeStringCommaForPoint to every element of a list.

    Args:
        stringList: Iterable of strings.

    Returns:
        A new list with the converted strings, in the same order.
    """
    return [changeStringCommaForPoint(text) for text in stringList]
#---------------------------------------------------------
# MANHATTAN DISTANCE
#---------------------------------------------------------
def ManhattanDistance(lat1, lon1, lat2, lon2):
    """Approximate distance in metres between two coordinates.

    Despite the name, the result is the straight-line (Euclidean) distance
    over the two scaled axis differences, not the sum of them.

    NOTE(review): the longitude scale is the one valid at the equator; it is
    not corrected by the cosine of the latitude.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.

    Returns:
        The distance as a float, in metres.
    """
    # Float literals keep the scale factors exact under Python 2 as well,
    # where 10000/90 and 40000/360 would both truncate to 111.
    km_per_degree_lat = 10000.0 / 90    # 90 degrees are 10000 km
    km_per_degree_lon = 40000.0 / 360   # 360 degrees are 40000 km
    dlat = abs(lat1 - lat2) * km_per_degree_lat
    dlon = abs(lon1 - lon2) * km_per_degree_lon
    distKM = ((dlat ** 2) + (dlon ** 2)) ** (0.5)
    return float(distKM * 1000)
#---------------------------------------------------------
# DISTANCE ANALYSIS
#---------------------------------------------------------
def distanceAnalysis(df, extraDf, lat, lon, distanceName):
    """Add the distance from each property to its nearest reference point.

    Args:
        df: Properties, with 'lat' and 'lon' columns.
        extraDf: Reference points (stations, hospitals, ...).
        lat: Name of the latitude column in ``extraDf``.
        lon: Name of the longitude column in ``extraDf``.
        distanceName: Name of the column to create.

    Returns:
        A copy of the rows of ``df`` that have both coordinates, with the new
        ``distanceName`` column holding the smallest ManhattanDistance to any
        reference point (NaN when there are no usable reference points).
    """
    # Work on copies so neither caller frame is modified through a slice.
    extraDf = extraDf.copy()
    extraDf[lon] = extraDf[lon].astype(float)
    extraDf[lat] = extraDf[lat].astype(float)
    # Both coordinates are required; with `|` a point missing one of them was
    # kept and produced NaN distances.
    extraDf = extraDf[extraDf[lon].notnull() & extraDf[lat].notnull()]
    df = df[df['lon'].notnull() & df['lat'].notnull()].copy()

    referencePoints = list(zip(extraDf[lat].tolist(), extraDf[lon].tolist()))

    distances = []
    for propLat, propLon in zip(df['lat'].tolist(), df['lon'].tolist()):
        if not referencePoints:
            distances.append(np.nan)
            continue
        distances.append(min(
            ManhattanDistance(refLat, refLon, propLat, propLon)
            for refLat, refLon in referencePoints
        ))

    df[distanceName] = distances
    return df
#---------------------------------------------------------
# AGG SUBWAYS
#---------------------------------------------------------
def aggSubways(df):
    """Add the 'distanceSubway' column using the subway stations file.

    Args:
        df: Properties, as expected by distanceAnalysis.

    Returns:
        The DataFrame returned by distanceAnalysis.
    """
    stations = pd.read_csv("./extra/estaciones-de-subte.csv", low_memory = False)
    return distanceAnalysis(df, stations, 'Y', 'X', 'distanceSubway')
#---------------------------------------------------------
# AGG UNIVERSITIES
#---------------------------------------------------------
def aggUniversities(df):
    """Add the 'distanceUniversities' column using the universities file.

    Args:
        df: Properties, as expected by distanceAnalysis.

    Returns:
        The DataFrame returned by distanceAnalysis.
    """
    campuses = pd.read_csv("./extra/universidades.csv", low_memory = False, sep=';')
    return distanceAnalysis(df, campuses, 'LAT', 'LNG', 'distanceUniversities')
#---------------------------------------------------------
# AGG HOSPITALES
#---------------------------------------------------------
def aggHospitales(df):
    """Add the 'distanceHospitales' column using the hospitals file.

    Args:
        df: Properties, as expected by distanceAnalysis.

    Returns:
        The DataFrame returned by distanceAnalysis.
    """
    clinics = pd.read_csv("./extra/hospitales.csv", low_memory = False, sep=';')
    return distanceAnalysis(df, clinics, 'LAT', 'LNG', 'distanceHospitales')
#---------------------------------------------------------
# AGG BUS STOPS
#---------------------------------------------------------
def aggBusStops(df):
    """Add the 'distanceBusStops' column using the bus stops file.

    The 'X' and 'Y' columns of the file are passed through
    changeStringListCommaForPoint before the distances are computed.

    Args:
        df: Properties, as expected by distanceAnalysis.

    Returns:
        The DataFrame returned by distanceAnalysis.
    """
    stops = pd.read_csv("./extra/paradas-de-colectivo.csv", low_memory = False, sep=';')
    for axis in ('X', 'Y'):
        stops[axis] = changeStringListCommaForPoint(stops[axis].tolist())
    return distanceAnalysis(df, stops, 'Y', 'X', 'distanceBusStops')

# Proceso de filtrado y acomodamiento de los datos para entrenamiento

In [458]:
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# FUNCTIONS OF PROCCES OF TRAIN DATA
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++

#---------------------------------------------------------
# HASH PLACES
#---------------------------------------------------------
def hashPlaces(df, predictDf):
    """Assign a number to every place name of the train and prediction sets.

    Names are numbered in a deterministic order: the sorted names of ``df``
    first, then the sorted names that only appear in ``predictDf``. Missing
    names are ignored. The extra key "no place" always receives the number
    equal to the count of distinct names.

    Args:
        df: Training DataFrame with a 'place_name' column.
        predictDf: Prediction DataFrame with a 'place_name' column.

    Returns:
        A dict mapping place name to integer id.
    """
    placesHash = {}
    for frame in (df, predictDf):
        for place in sorted(frame['place_name'].dropna().unique()):
            if place not in placesHash:
                placesHash[place] = len(placesHash)
    # Evaluated before the assignment, so this equals the number of distinct
    # names even when "no place" is already one of them.
    placesHash["no place"] = len(placesHash)
    return placesHash
#---------------------------------------------------------
# PROCCES NO PLACES
#---------------------------------------------------------
def processNoPlaces(df):
    """Replace every non-string place name by "no place", in place.

    Args:
        df: DataFrame with a 'place_name' column.

    Returns:
        The same DataFrame with the cleaned column.
    """
    df['place_name'] = [
        name if type(name) == type("") else "no place"
        for name in df['place_name'].tolist()
    ]
    return df
#---------------------------------------------------------
# CNVERT PLACES TO HASH NUMBER
#---------------------------------------------------------
def convertPlacesToHashNumber(df, placesHash):
    """Replace every place name by its number from ``placesHash``, in place.

    Args:
        df: DataFrame with a 'place_name' column.
        placesHash: Dict mapping place name to number.

    Returns:
        The same DataFrame with the numeric column.

    Raises:
        KeyError: When a place name is not a key of ``placesHash``.
    """
    df['place_name'] = [placesHash[name] for name in df['place_name'].tolist()]
    return df
#---------------------------------------------------------
# DELETE EXTRA COLUMNS
#---------------------------------------------------------
def deleteExtraColumns(df):
    """Drop, in place, the columns that are not used for training.

    Removes the description-derived feature columns, a fixed list of
    auxiliary columns and every column whose name contains "unnamed"
    (case-insensitive). Columns that are not present are ignored.

    Args:
        df: DataFrame to clean; it is modified in place.

    Returns:
        The same DataFrame without those columns.
    """
    extra = [
        'sector de juegos infantiles',
        'seguridad las 24 hs',
        'cancha de tenis',
        'club house',
        'cochera',
        'comedor',
        'futbol 5',
        'living',
        'pileta',
        'piscina',
        'rooms'
    ]
    columns = [
        'price_aprox_usd.1',
        'Unnamed: 0',
        'price_per_m2',
        'date',
        'floor',
        'state_name',
        'price_usd_per_m2'
    ]
    unwanted = set(columns + extra)
    # Only names that exist are dropped; passing an absent label to drop()
    # raises KeyError and aborted the whole clean-up.
    doomed = [
        name for name in df.columns.values
        if name in unwanted or "unnamed" in str(name).lower()
    ]
    df.drop(doomed, axis = 1, inplace = True)
    return df
#---------------------------------------------------------
# DELETE TRASH COLUMNS
#---------------------------------------------------------
def deleteTrashColumns(df):
    """Drop, in place, every column whose name contains "unnamed".

    The match ignores case.

    Args:
        df: DataFrame to clean; it is modified in place.

    Returns:
        The same DataFrame without those columns.
    """
    for name in [label for label in df.columns.values if "unnamed" in label.lower()]:
        df.drop(name, axis = 1, inplace = True)
    return df
#---------------------------------------------------------
# PROCESS TRAIN SET
#---------------------------------------------------------
def ProcessTrainSet(df, withPrice):
    """Split a training frame into feature rows and the price target.

    Rows with a missing value in any column are removed from ``df`` first.
    The frame is modified in place.

    Args:
        df: Training DataFrame with a 'price_aprox_usd' column.
        withPrice: When truthy, 'price_aprox_usd' is removed from ``df`` (and
            therefore from the returned feature rows).

    Returns:
        A pair ``(data, target)``: ``data`` is a list with one array per
        remaining row and ``target`` the list of prices of those rows.
    """
    every_column = list(df.columns.values)
    df.dropna(axis=0, how='any', subset=every_column, inplace=True)

    if withPrice:
        # pop() removes the column from df and hands back its values.
        target = df.pop('price_aprox_usd').tolist()
    else:
        target = df['price_aprox_usd'].tolist()

    return list(df.values), target

#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# FUNCTIONS OF PROCCES OF TEST DATA
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++

#---------------------------------------------------------
# DELETE COLUMNS TEST DATA
#---------------------------------------------------------
def deleteColumnsTestData(df, columnsToEvaluate):
    """Drop, in place, every column that is not in ``columnsToEvaluate``.

    Args:
        df: DataFrame to trim; it is modified in place.
        columnsToEvaluate: Collection of column names to keep.

    Returns:
        The same DataFrame with only the requested columns that it had.
    """
    unwanted = [name for name in df.columns.values if name not in columnsToEvaluate]
    for name in unwanted:
        df.drop(name, axis = 1, inplace = True)
    return df
#---------------------------------------------------------
# PRE PROCCES TEST DATA
#---------------------------------------------------------
def preProcessTestData(df, placesHash):
    """Run the feature pipeline on the data set to predict.

    Args:
        df: Raw DataFrame to predict on.
        placesHash: Dict mapping place name to number, used to encode
            'place_name'.

    Returns:
        The processed DataFrame.
    """
    # True keeps the 'id' column while the other unused columns are dropped.
    df = filterUnnecesaryColumns(df, True)
    df['rooms'] = getRooms(df)
    #df['surface_total_in_m2'] = getSurface(df)
    # 'description' is replaced by the list of feature dictionaries ...
    df['description'] = crear_diccionario_descripcion(df)
    # Non-string place names become "no place" before the numeric encoding.
    df = processNoPlaces(df)
    df = convertPlacesToHashNumber(df, placesHash)
    # ... which is expanded here into one column per feature.
    df = addDescriptionColumns(df, 'description')
    df = changePlaceWithParentsNamesColumn(df)
    df = changePropertyTypeColumn(df)
    return df
#---------------------------------------------------------
# GET DATA AS LIST
#---------------------------------------------------------
def GetDataAsList(df):
    """Return the rows of ``df`` as a list.

    Args:
        df: Any DataFrame.

    Returns:
        A list with one array per row, taken from ``df.values``.
    """
    return [row for row in df.values]
#------------------------------------------------------
# GET AVG OF LIST
#------------------------------------------------------
def getAvgOfList(x):
    """Return the middle value of the valid surfaces of a group.

    Despite the name this is the median-like element (the upper one when the
    count is even), not the arithmetic mean.

    Args:
        x: DataFrame (a group) with a 'surface_total_in_m2' column.

    Returns:
        The element at position ``len // 2`` of the sorted surfaces that are
        not NaN and greater than zero, or NaN when there is none.
    """
    surfaces = sorted(
        value for value in x['surface_total_in_m2'].tolist()
        if (not np.isnan(value) and value > 0)
    )
    size = len(surfaces)
    if size == 0:
        return np.nan
    # Floor division keeps the index an integer on Python 3 as well.
    return surfaces[size // 2]

#------------------------------------------------------
# CREATE DICC
#------------------------------------------------------
def createDicc(keys, values):
    """Build a dictionary pairing ``keys[i]`` with ``values[i]``.

    Args:
        keys: Indexable sequence of hashable keys.
        values: Indexable sequence with at least as many elements as ``keys``.

    Returns:
        A new dict with one entry per key.

    Raises:
        IndexError: When ``values`` is shorter than ``keys``.
    """
    return {keys[position]: values[position] for position in range(len(keys))}
#------------------------------------------------------
# GET AVG SURFACE BY PLACE
#------------------------------------------------------
def getAvgSurfaceByPlace(df):
    """Fill missing surfaces with the representative surface of their place.

    Args:
        df: DataFrame with 'surface_total_in_m2' and 'place_name' columns; it
            is modified in place.

    Returns:
        The same DataFrame, where every surface that is NaN or not positive
        was replaced by the value getAvgOfList computed for its place.
    """
    # One value per place_name, computed by getAvgOfList over the group.
    aux = df.loc[:, ['surface_total_in_m2', 'place_name']]\
    .groupby('place_name').apply(lambda x: getAvgOfList(x)).reset_index()
    
    placesName = aux['place_name'].tolist()
    # The per-group results are read from the column labelled 0.
    surfaces = aux[0].tolist()
    dicc = createDicc(placesName, surfaces)
    
    placesNameDF = df['place_name'].tolist()
    surfacesDF = df['surface_total_in_m2'].tolist()
    size = len(surfacesDF)
    for i in range(0, size):
        # Valid surfaces are kept as they are.
        if not np.isnan(surfacesDF[i]) and surfacesDF[i] > 0:
            continue
        # NOTE(review): raises KeyError if the row's place_name is not a key
        # of dicc (presumably the case for missing names) - confirm that
        # place names are cleaned before this call.
        surfacesDF[i] = dicc[placesNameDF[i]]
    df['surface_total_in_m2'] = surfacesDF
    return df

#------------------------------------------------------
# GET AVG SURFACE BY STATE
#------------------------------------------------------
def getAvgSurfaceByState(df):
    """Fill missing surfaces with the representative surface of their state.

    Args:
        df: DataFrame with 'surface_total_in_m2' and 'state_name' columns; it
            is modified in place.

    Returns:
        The same DataFrame, where every surface that is NaN or not positive
        was replaced by the value getAvgOfList computed for its state.
    """
    # One value per state_name, computed by getAvgOfList over the group.
    aux = df.loc[:, ['surface_total_in_m2', 'state_name']]\
    .groupby('state_name').apply(lambda x: getAvgOfList(x)).reset_index()
    
    states = aux['state_name'].tolist()
    # The per-group results are read from the column labelled 0.
    surfaces = aux[0].tolist()
    dicc = createDicc(states, surfaces)
    
    statesDF = df['state_name'].tolist()
    surfacesDF = df['surface_total_in_m2'].tolist()
    size = len(surfacesDF)
    for i in range(0, size):
        # Valid surfaces are kept as they are.
        if not np.isnan(surfacesDF[i]) and surfacesDF[i] > 0:
            continue
        ## Special case: 'Buenos Aires Interior' has a single property and
        ## all of its values are NaN, so the mean of two G.B.A. zones is used.
        if statesDF[i] == 'Buenos Aires Interior':
            avgGBAO = dicc['Bs.As. G.B.A. Zona Oeste']
            avgGBAS = dicc['Bs.As. G.B.A. Zona Sur']
            avgBAI = (avgGBAO + avgGBAS) / 2
            surfacesDF[i] = avgBAI
        else:
            surfacesDF[i] = dicc[statesDF[i]]
    df['surface_total_in_m2'] = surfacesDF
    return df

#---------------------------------------------------------
# SAVE FINAL DF
#---------------------------------------------------------
def saveFinalDF(predictions, ids):
    """Write the predictions to the submission CSV file.

    Args:
        predictions: Predicted prices, stored in the 'price_usd' column.
        ids: Property identifiers, stored in the 'id' column.

    Returns:
        The DataFrame that was written to
        "properati_dataset_sample_submision.csv".
    """
    aData = {'id': ids, 'price_usd': predictions}
    final = pd.DataFrame(data = aData)
    # The string below is disabled code kept as a literal; it is not executed.
    """final['id'] = ids
    final['price_usd'] = predictions
    final = final.reset_index()
    final.drop('index', axis = 1, inplace = True)"""
    # index = True also writes the row index as the first column of the file.
    final.to_csv("properati_dataset_sample_submision.csv", \
    index = True, header = True, sep = ',', encoding = 'utf-8-sig')
    return final
#---------------------------------------------------------
# FILL COORDS BY STATE NAME
#---------------------------------------------------------
def fillCoordsByStateName(df, coordinatesDicc):
    """Complete per-place coordinates with state-level fallbacks.

    ``coordinatesDicc`` maps a place name to a ``[lat, lon]`` pair. Any pair
    with a NaN latitude or longitude is replaced by a fallback computed from
    the per-state means of ``lat`` / ``lon`` in ``df`` (only states with at
    least 20 rows are considered):

    * places in 'Capital Federal' get the "CF" fallback;
    * every other place gets the mean of the two G.B.A. zones
      ('Bs.As. G.B.A. Zona Oeste' and 'Bs.As. G.B.A. Zona Sur').

    NOTE(review): the "CF" fallback is the plain sum of the means of every
    state that is not one of the two G.B.A. zones, so it is only a mean when
    exactly one such state passes the size filter. Likewise the G.B.A. value
    is always divided by 2, which assumes both zones pass the filter.
    Confirm against the data.

    Parameters:
        df: DataFrame with ``state_name``, ``place_name``, ``lat`` and
            ``lon`` columns. Every ``place_name`` in it must be a key of
            ``coordinatesDicc``.
        coordinatesDicc: dict updated in place.

    Returns:
        ``coordinatesDicc`` itself, in every case (including when nothing
        needed to be filled).
    """
    def _isComplete(coord):
        # Both components must be present; the previous early-exit check
        # looked at the latitude twice and never at the longitude.
        return not np.isnan(coord[0]) and not np.isnan(coord[1])

    # Nothing to fill: hand the dict back so callers can always use the result.
    if all(_isComplete(coord) for coord in coordinatesDicc.values()):
        return coordinatesDicc

    aux = df.loc[:, ['state_name', 'lat', 'lon']]\
        .groupby('state_name').agg([np.mean, np.size]).reset_index()
    # Ignore states with fewer than 20 rows.
    aux = aux[aux[('lat', 'size')] >= 20]

    gbaStates = ("Bs.As. G.B.A. Zona Oeste", "Bs.As. G.B.A. Zona Sur")
    latGBA = 0
    lonGBA = 0
    latCF = 0
    lonCF = 0
    stateMeans = zip(aux['state_name'].tolist(),
                     aux[('lat', 'mean')].tolist(),
                     aux[('lon', 'mean')].tolist())
    for state, lat, lon in stateMeans:
        if state in gbaStates:
            latGBA += lat
            lonGBA += lon
        else:
            latCF += lat
            lonCF += lon
    latGBA = latGBA/2
    lonGBA = lonGBA/2

    states = df['state_name'].tolist()
    places = df['place_name'].tolist()
    for state, place in zip(states, places):
        if _isComplete(coordinatesDicc[place]):
            continue
        if state == 'Capital Federal':
            coordinatesDicc[place] = [latCF, lonCF]
        else:
            coordinatesDicc[place] = [latGBA, lonGBA]
    return coordinatesDicc
#---------------------------------------------------------
# CREATE DICT OF COORDS BY PLACES
#---------------------------------------------------------
def createDictOfCoordsByPlaces(df):
    """Build a ``place_name -> [mean lat, mean lon]`` dictionary.

    The means are computed per ``place_name`` group of ``df`` (the group size
    is aggregated as well but not used here). The dictionary is then passed to
    ``fillCoordsByStateName`` so that entries with missing coordinates can be
    completed.

    Parameters:
        df: DataFrame with ``place_name``, ``lat`` and ``lon`` columns, plus
            whatever ``fillCoordsByStateName`` reads from it.

    Returns:
        The coordinates dictionary; never ``None``.
    """
    aux = df.loc[:, ['place_name', 'lat', 'lon']]\
        .groupby('place_name').agg([np.mean, np.size]).reset_index()
    places = aux['place_name'].tolist()
    latList = aux[('lat', 'mean')].tolist()
    lonList = aux[('lon', 'mean')].tolist()

    coordinatesDicc = {}
    for place, lat, lon in zip(places, latList, lonList):
        coordinatesDicc[place] = [lat, lon]

    filled = fillCoordsByStateName(df, coordinatesDicc)
    # The helper updates the dict in place and may return nothing when there
    # is nothing to fill; fall back to the dict built here in that case so
    # callers never receive None.
    if filled is None:
        return coordinatesDicc
    return filled
#---------------------------------------------------------
# FILL COORDINATES DATA TEST
#---------------------------------------------------------
def fillCoordinatesDataTest(df):
    """Replace NaN ``lat`` / ``lon`` values with the coordinates of the place.

    The per-place coordinates come from ``createDictOfCoordsByPlaces(df)``.
    Latitude and longitude are filled independently: a row that only lacks
    one of them keeps the other as it is.

    Parameters:
        df: DataFrame with ``place_name``, ``lat`` and ``lon`` columns.

    Returns:
        The same DataFrame object, with ``lat`` and ``lon`` overwritten.
    """
    coordsByPlace = createDictOfCoordsByPlaces(df)

    rows = zip(df['place_name'].tolist(), df['lat'].tolist(), df['lon'].tolist())
    newLat = []
    newLon = []
    for place, lat, lon in rows:
        newLat.append(coordsByPlace[place][0] if np.isnan(lat) else lat)
        newLon.append(coordsByPlace[place][1] if np.isnan(lon) else lon)

    df['lat'] = newLat
    df['lon'] = newLon
    return df

In [459]:
# Read the two input CSVs. low_memory=False makes pandas parse each file in a
# single pass instead of in chunks.
# NOTE(review): predictDf is read again from the same file further down, in
# the test-set preprocessing section.
df = pd.read_csv("propertiesConCat.csv", low_memory = False)
predictDf = pd.read_csv("properati_dataset_testing_noprice.csv", low_memory = False)

In [460]:
"""df.loc[:,'date'] = pd.to_datetime(df['date'], errors='coerce')
df = df.loc[df['date'].dt.year >= 2016]
df = getAvgSurfaceByPlace(df)
df = getAvgSurfaceByState(df)
df = aggSubways(df)
df = aggUniversities(df)
df = aggHospitales(df)
#df = aggBusStops(df)

placesHash = hashPlaces(df, predictDf)
df = processNoPlaces(df)
df = convertPlacesToHashNumber(df, placesHash)
df = deleteExtraColumns(df)
df.to_csv("dataTrain.csv", index = True, header = True, sep = ',', 
          encoding = 'utf-8-sig')"""

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


# Process

In [461]:
df = pd.read_csv("dataTrain.csv", low_memory = False)

In [462]:
df = deleteTrashColumns(df)

In [463]:
dataTrain, targetTrain = ProcessTrainSet(df, True)

In [464]:
# Column names are taken from df (as returned by deleteTrashColumns), not from
# dataTrain; the three prints report the sizes of the train data, the target
# and that column list.
columns = list(df.columns.values)
print "data size: ", len(dataTrain)
print "target size: ", len(targetTrain)
print "columns size: ", len(columns)

data size:  269446
target size:  269446
columns size:  9


# Preproceso del set de test 

In [465]:
columnsToEvaluate = columns
columnsToEvaluate

['lat',
 'lon',
 'place_name',
 'place_with_parent_names',
 'property_type',
 'surface_total_in_m2',
 'distanceSubway',
 'distanceUniversities',
 'distanceHospitales']

In [466]:
predictDf = pd.read_csv("properati_dataset_testing_noprice.csv", low_memory = False)

In [467]:
"""predictDf = getAvgSurfaceByPlace(predictDf)
predictDf = getAvgSurfaceByState(predictDf)
predictDf = fillCoordinatesDataTest(predictDf)
predictDf = fillCoordinatesDataTest(predictDf)
predictDf = aggSubways(predictDf)
predictDf = aggUniversities(predictDf)
predictDf = aggHospitales(predictDf)
aux = predictDf
places = aux['place_name'].tolist()
predictDf = preProcessTestData(predictDf, placesHash)
ids = predictDf['id']
predictDf = deleteColumnsTestData(predictDf, columnsToEvaluate)
data = GetDataAsList(predictDf)
predictDf.to_csv("dataTest.csv", index = True, header = True, sep = ',', 
          encoding = 'utf-8-sig')"""

In [468]:
# Sanity check on the prepared test set.
# NOTE(review): in this part of the notebook `data` is only assigned inside
# the disabled (string-literal) preprocessing cell above, so this cell relies
# on state left over from an earlier run - confirm before a fresh Run All.
print "data size: ", len(data)
print "df size: ", len(predictDf.index)
print "columns size: ", len(predictDf.columns.values)
# Expected number of rows: 14166

data size:  14166
df size:  14166
columns size:  9


In [469]:
predictDf = pd.read_csv("dataTest.csv", low_memory = False)
predictDf = deleteTrashColumns(predictDf)

# Preprocess sample

In [470]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module is deprecated and the notebook already
# imports from model_selection elsewhere.
from sklearn.model_selection import train_test_split

In [471]:
xTrain, xTest, yTrain, yTest = train_test_split(dataTrain, targetTrain, test_size=0.5, random_state=0)

# Prediccion 

#  Linear Regression

Entreno el modelo con el set de entrenamiento

In [24]:
# Fit a linear regression on the whole training set and score it on that same
# data, so the number shown is a training score (R^2 for scikit-learn
# regressors), not a held-out estimate.
lr = LinearRegression(normalize = True)
lr.fit(dataTrain, targetTrain)
lr.score(dataTrain, targetTrain)

precision sample:  0.098377413181


In [25]:
predictions = lr.predict(data)

In [26]:
final = saveFinalDF(predictions, ids)

In [27]:
final.head()

Unnamed: 0,id,price_usd
0,3632,12474840.0
1,3633,5288541.0
2,2263404,8702557.0
3,2263405,4927972.0
4,2263406,6199079.0


# Transformacion no lineal a lineal

#  (Este metodo da precios demasiado altos y es peor que el lineal hecho en el paso anterior)

Si tomamos como ejemplo una función f que toma la forma: $f(x) = a + bx + cx^2$

La función f es no lineal en función de x, pero sí es lineal en función de los parámetros desconocidos a, b y c. O visto de otra manera: podemos sustituir nuestra variable x por un array z tal que $z = [1, x, x^2]$, con el que podríamos reescribir nuestra función f como $f(z) = a z_0 + b z_1 + c z_2$.

Scikit-learn tiene un objeto PolynomialFeatures que nos va a servir para convertir nuestra variable x en un array z del tipo $z = [1, x, x^2, \dots, x^n]$, que es lo que nos interesa.

El resultado de esa transformación se lo pasamos a nuestro modelo lineal (Ridge en el planteo general; en las celdas de abajo se usa LinearRegression). Para facilitar la tarea en este tipo de casos —donde se realizan varios pasos que van desde el pretratamiento de los datos hasta un posible postratamiento, pasando por el entrenamiento—, podemos hacer uso de los Pipeline, que nos permiten encadenar múltiples estimadores en uno. Esto es especialmente útil cuando hay una secuencia de pasos predefinidos en el procesado de datos con, por ejemplo, selección de atributos, normalización y clasificación.

In [24]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

In [25]:
# Expand the training split with degree-2 polynomial features, fit a linear
# regression on the expanded matrix and score it on that same training split.
poly = PolynomialFeatures(degree = 2)
z = poly.fit_transform(xTrain)
lr = LinearRegression(normalize=True)
lr.fit(z, yTrain)
lr.score(z, yTrain)

0.18679831133031699

In [27]:
zTest = poly.fit_transform(xTest)
predictionsSamplePlynomial = lr.predict(zTest)
print "precision sample: ", lr.score(zTest, yTest)

 precision sample:  0.152641099085


In [28]:
# Reuse the transformer fitted on the training split for the data to predict,
# instead of fitting it again on that data.
DataTransformed = poly.transform(data)
predictions = lr.predict(DataTransformed)

In [210]:
final = saveFinalDF(predictions, ids)

In [211]:
final.head()

Unnamed: 0,id,price_usd
0,3632,1.001339e+21
1,3633,8.158107e+21
2,2263404,5.498861e+21
3,2263405,8.590666e+20
4,2263406,8.654199e+20


# Decision Tree

In [472]:
from sklearn.tree import DecisionTreeRegressor
# GridSearchCV is taken from sklearn.model_selection, like cross_val_score;
# the old sklearn.grid_search module is deprecated.
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [473]:
# Candidate tree depths 50..99, evaluated with 3-fold cross-validation; the
# best candidate is refit on the data passed to fit().
params = {'max_depth': list(range(50, 100))}
gs = GridSearchCV(
    DecisionTreeRegressor(),
    params,
    cv=3,
    n_jobs=1,
    verbose=1,
    scoring=None,
    refit=True
)

In [474]:
# Run the grid search on the training split, then evaluate the refit best
# estimator on the held-out split. With scoring=None the estimator's own
# score method is used.
# NOTE(review): yTestPrediction is not used by any later cell in this section.
gs.fit(xTrain, yTrain)
yTestPrediction = gs.predict(xTest)
print "precision test: ", gs.score(xTest, yTest)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:  2.3min finished


precision test:  0.816363469132


In [475]:
gs.fit(dataTrain, targetTrain)
predictionDT = gs.predict(data)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:  3.7min finished


In [490]:
final = saveFinalDF(predictionDT, ids)

In [491]:
final.head()

Unnamed: 0,id,price_usd
0,3632,690000.0
1,3633,155000.0
2,2263404,71000.0
3,2263405,177000.0
4,2263406,55000.0


# Random Forest Regressor

In [482]:
from sklearn.ensemble import RandomForestRegressor
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# module is deprecated.
from sklearn.model_selection import GridSearchCV

In [483]:
# Hyper-parameter grid for the random forest: 2 x 3 x 3 = 18 candidates,
# evaluated with 3-fold cross-validation and refit on the full data given to
# fit(). Float max_features values are fractions of the feature count.
params = {
    'max_depth': [4,6],
    'min_samples_leaf': [3, 5, 9],
    'max_features': [1.0, 0.3, 0.1]
}
gs = GridSearchCV(RandomForestRegressor(), params, cv=3, n_jobs=1, verbose=1, scoring=None, refit=True)

In [484]:
gs.fit(xTrain, yTrain)
gs.score(xTrain, yTrain)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed:  1.1min finished


0.57395798407226906

In [485]:
gs.predict(xTest)
print "score test: ", gs.score(xTest, yTest)

score test:  0.502154656875


In [486]:
gs.fit(dataTrain, targetTrain)
print "score full dataTrain: ", gs.score(xTrain, yTrain)
RFPrediction = gs.predict(data)
final = saveFinalDF(RFPrediction, ids)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed:  1.9min finished


score full dataTrain:  0.552024792787


In [489]:
final.head()

Unnamed: 0,id,price_usd
0,3632,780727.577026
1,3633,311615.522128
2,2263404,344923.313484
3,2263405,113885.326676
4,2263406,113885.326676
