In [1]:
import pandas as pd
import numpy as np
from os import listdir
from sklearn import datasets
from sklearn.linear_model import LinearRegression
%matplotlib inline
import matplotlib.pyplot as plt

# Preproceso del set de entrenamiento

In [2]:
# Whitelist of raw CSV columns that filterUnnecesaryColumns keeps; every
# other column is dropped there (except 'id' when deleteId is False).
columnsToUseInitialy = [
    'lat',
    'lon',
    'place_name',
    'property_type',
    'surface_total_in_m2',
    'surface_covered_in_m2',
    'price_aprox_usd',
    'price_usd_per_m2',
    'rooms',
    'place_with_parent_names'
]

# Agregado de datos extra

In [3]:
#---------------------------------------------------------
# CHANGE STRING COMMA FOR POINT
#---------------------------------------------------------
def changeStringCommaForPoint(string):
    """Return `string` with every comma replaced by a period ("1,5" -> "1.5")."""
    return string.replace(',', '.')
#---------------------------------------------------------
# CHANGE STRING LIST COMMA FOR POINT
#---------------------------------------------------------
def changeStringListCommaForPoint(stringList):
    """Apply changeStringCommaForPoint to each element and return the results as a new list."""
    return [changeStringCommaForPoint(item) for item in stringList]
#---------------------------------------------------------
# MANHATTAN DISTANCE
#---------------------------------------------------------
def ManhattanDistance(lat1, lon1, lat2, lon2):
    """Approximate distance in metres between two (lat, lon) points given in degrees.

    Despite its name, this returns the Euclidean norm of the latitude and
    longitude offsets, not a Manhattan (L1) distance. Both offsets are
    converted with the same factor (about 111.11 km per degree); the
    longitude factor is not corrected for latitude.

    Returns:
        float: distance in metres.
    """
    # Float literals on purpose: under Python 2, `10000/90` and `40000/360`
    # are integer divisions and silently truncate the factor to 111.
    kmPerLatDegree = 10000.0 / 90     # 90 degrees of latitude are ~10000 km
    kmPerLonDegree = 40000.0 / 360    # 360 degrees of longitude are ~40000 km
    dlat = abs(lat1 - lat2) * kmPerLatDegree
    dlon = abs(lon1 - lon2) * kmPerLonDegree
    distKM = ((dlat ** 2) + (dlon ** 2)) ** 0.5
    # Kilometres -> metres.
    return float(distKM * 1000)
#---------------------------------------------------------
# DISTANCE ANALYSIS
#---------------------------------------------------------
def distanceAnalysis(df, extraDf, lat, lon, distanceName):
    """Add to `df` the distance from each row to its nearest point in `extraDf`.

    Args:
        df: frame with 'lat' and 'lon' columns; rows missing either coordinate
            are dropped from the result.
        extraDf: frame with the reference points. It is not modified.
        lat, lon: names of the latitude / longitude columns in `extraDf`;
            their values must be convertible to float.
        distanceName: name of the column to create.

    Returns:
        A copy of the rows of `df` that have both coordinates, with the new
        `distanceName` column holding the smallest ManhattanDistance (metres)
        to any reference point, or NaN when there are no usable reference
        points.
    """
    refLat = extraDf[lat].astype(float)
    refLon = extraDf[lon].astype(float)
    # A point is usable only when BOTH coordinates are present. The previous
    # `|` kept half-empty points, and a NaN first point made every minimum NaN.
    usableRef = refLat.notnull() & refLon.notnull()
    y = refLat[usableRef].tolist()
    x = refLon[usableRef].tolist()

    # .copy() so the new column is written to an independent frame rather
    # than to a filtered view of the caller's data.
    df = df[df['lat'].notnull() & df['lon'].notnull()].copy()
    latDf = df['lat'].tolist()
    lonDf = df['lon'].tolist()

    distances = []
    for i in range(0, len(latDf)):
        if len(x) == 0:
            # No reference points: the distance is undefined.
            distances.append(np.nan)
            continue
        minor = ManhattanDistance(y[0], x[0], latDf[i], lonDf[i])
        for j in range(1, len(x)):
            dist = ManhattanDistance(y[j], x[j], latDf[i], lonDf[i])
            if dist < minor:
                minor = dist
        distances.append(minor)

    df[distanceName] = distances
    return df
#---------------------------------------------------------
# AGG SUBWAYS
#---------------------------------------------------------
def aggSubways(df):
    """Return `df` extended with 'distanceSubway', computed by distanceAnalysis
    against the points in ./extra/estaciones-de-subte.csv (columns 'Y' = lat, 'X' = lon)."""
    stations = pd.read_csv("./extra/estaciones-de-subte.csv", low_memory = False)
    return distanceAnalysis(df, stations, 'Y', 'X', 'distanceSubway')
#---------------------------------------------------------
# AGG UNIVERSITIES
#---------------------------------------------------------
def aggUniversities(df):
    """Return `df` extended with 'distanceUniversities', computed by distanceAnalysis
    against the points in ./extra/universidades.csv (';'-separated, columns 'LAT' / 'LNG')."""
    campuses = pd.read_csv("./extra/universidades.csv", low_memory = False, sep=';')
    return distanceAnalysis(df, campuses, 'LAT', 'LNG', 'distanceUniversities')
#---------------------------------------------------------
# AGG HOSPITALES
#---------------------------------------------------------
def aggHospitales(df):
    """Return `df` extended with 'distanceHospitales', computed by distanceAnalysis
    against the points in ./extra/hospitales.csv (';'-separated, columns 'LAT' / 'LNG')."""
    clinics = pd.read_csv("./extra/hospitales.csv", low_memory = False, sep=';')
    return distanceAnalysis(df, clinics, 'LAT', 'LNG', 'distanceHospitales')
#---------------------------------------------------------
# GET DATAFRAME COORDS BY PLACES
#---------------------------------------------------------
def getDataframeCoordsByPlaces(df):
    """Return one row per 'place_name' with the mean 'lat' and 'lon' of that place.

    The result has exactly the columns 'place_name', 'lat', 'lon' and a fresh
    0..n-1 index.
    """
    centroids = df.groupby('place_name')[['lat', 'lon']].mean()
    return centroids.reset_index()[['place_name', 'lat', 'lon']]
#---------------------------------------------------------
# AGG BUS STOPS
#---------------------------------------------------------
def aggBusStops(df):
    """Add a 'distanceBusStops' column to `df` (in place) and return it.

    The distance is computed once per place: distanceAnalysis is run on the
    mean coordinates of each 'place_name' against the stops listed in
    ./extra/paradas-de-colectivo.csv, and every row then receives the value
    of its place. Rows whose place has no computed distance get a large
    sentinel value.
    """
    unknownDistance = 999999999999
    stops = pd.read_csv("./extra/paradas-de-colectivo.csv", low_memory = False, sep=';')
    # The coordinates in this file use a decimal comma; normalise to a period.
    for axis in ('X', 'Y'):
        stops[axis] = changeStringListCommaForPoint(stops[axis].tolist())

    byPlace = distanceAnalysis(getDataframeCoordsByPlaces(df), stops, 'Y', 'X', 'distanceBusStops')
    lookup = createDicc(byPlace['place_name'].tolist(), byPlace['distanceBusStops'].tolist())

    df['distanceBusStops'] = [lookup.get(place, unknownDistance)
                              for place in df['place_name'].tolist()]
    return df
#---------------------------------------------------------
# DELETE TRASH COLUMNS
#---------------------------------------------------------
def deleteTrashColumns(df):
    """Drop, in place, every column whose name contains "unnamed" (case-insensitive); return `df`.

    NOTE(review): an identical definition of this function appears again
    later in the notebook and shadows this one once that cell runs.
    """
    trash = [name for name in df.columns.values if "unnamed" in name.lower()]
    for name in trash:
        df.drop(name, axis = 1, inplace = True)
    return df

# Funciones del uso de informacion del campo de descripcion

In [4]:
# Integer flags for the description features: `true` is the value written by
# crear_diccionario_descripcion when a feature is found. `false` is not
# referenced in the visible code (dictionaries are initialised with literal 0).
false = 0
true = 1
#------------------------------------------------------
# IS FLOAT
#------------------------------------------------------
def isFloat(x):
    """Return True when `x` can be converted with float(), False otherwise."""
    try:
        float(x)
        return True
    except (ValueError, TypeError):
        # ValueError: unparsable string. TypeError: non-numeric object such
        # as None or a list, which previously escaped as an exception.
        return False

#------------------------------------------------------
# INICIALIZAR DICCIONARIO
#------------------------------------------------------
# pre: Recibe una lista de claves
# pos: devuelve un diccionario de esas claves recibidas inicializadas en cero

def inicializar_diccionario(keys):
    """Return a new dictionary that maps every element of `keys` to 0."""
    return dict.fromkeys(keys, 0)

#------------------------------------------------------
# ENCONTRAR FRASE
#------------------------------------------------------
# pre: Recibe un vector de palabras, la posicion en ese vector que se
# esta leyendo, y un vector de frases
# pos: Devuelve una tripla que dice si alguna de las frases en el
# vector "phrases", el offset el cual debe desplazarse la posicion de lectura
# del vector "words" y el indice en donde se encuantrala frase encontrada en el
# vector "phrases"

def encontrar_frase(words, i, phrases):
    """Check whether one of `phrases` starts at position `i` of `words`.

    Args:
        words: list of tokens of a description.
        i: index in `words` where the phrase must start.
        phrases: list of space-separated phrases to look for.

    Returns:
        (found, offset, index): `found` is True when some phrase matches the
        tokens words[i:i+offset] (case-insensitively); `offset` is the number
        of words of that phrase and `index` its position in `phrases`.
        When nothing matches the result is (False, 0, 0).
    """
    total = len(words)
    for index, phrase in enumerate(phrases):
        tokens = phrase.lower().split()
        size = len(tokens)
        # The whole phrase must fit inside `words`; the previous bound check
        # compared against the phrase length and let partial matches through.
        if size == 0 or i + size > total:
            continue
        window = [word.lower() for word in words[i:i + size]]
        if window == tokens:
            return True, size, index
        # No match for this phrase: keep trying the remaining ones instead of
        # giving up after the first candidate.
    return False, 0, 0


#------------------------------------------------------
# CREAR DICCIONARIO DESCRIPCION
#------------------------------------------------------
# pre: Recibe un dataframe
# pos: Devuelve una lista de diccionarios

def crear_diccionario_descripcion(df):
    """Build one feature dictionary per row of `df` from its 'description' text.

    Each dictionary has a key for every single-word characteristic and every
    multi-word phrase listed below, set to `true` when the description
    contains it and left at 0 otherwise. Rows whose description is not a
    string (or a frame without a 'description' column) get an all-zero
    dictionary.

    Returns:
        list of dict, in row order.
    """
    characteristics = [
        "living",
        "cochera",
        "comedor",
        "pileta",
        "piscina"
    ]
    phrases = [
        "cancha de tenis",
        "club house",
        "sector de juegos infantiles",
        "futbol 5",
        "seguridad las 24 hs"
    ]
    size = len(df.index)
    # Materialise the column once; rebuilding the list on every row made the
    # function quadratic in the number of rows.
    descriptions = list(df['description']) if 'description' in df else None
    dicc_list = []
    for i in range(0, size):
        dicc = inicializar_diccionario(characteristics + phrases)
        dicc_list.append(dicc)
        if descriptions is None or not isinstance(descriptions[i], str):
            continue
        words = descriptions[i].split()
        for j in range(0, len(words)):
            (wordBelongs, offset, index) = encontrar_frase(words, j, phrases)
            if wordBelongs:
                # Flag the phrase as soon as it is found, so a phrase that
                # closes the description is not lost.
                dicc[phrases[index]] = true
            # Always test the word at the current position; phrases and
            # single-word characteristics share no words, so nothing needs
            # to be skipped.
            word = words[j].lower()
            if word in characteristics:
                dicc[word] = true
    return dicc_list

# Funciones de filtrado del data set

In [27]:
#----------------------------------------------------------------------
# FILTER PORCENTAGE
#----------------------------------------------------------------------
def filterPercentage(array):
    """Return 1 when usdM2 * surfaceTotal is within 10% of priceUSD, otherwise NaN.

    `array` unpacks to (priceUSD, usdM2, surfaceTotal). A non-positive price
    always yields NaN.
    """
    priceUSD, usdM2, surfaceTotal = array
    if priceUSD <= 0:
        return np.nan
    deviation = abs(usdM2 * surfaceTotal - priceUSD)
    withinTolerance = ((deviation / priceUSD) * 100) <= 10
    return 1 if withinTolerance else np.nan
#----------------------------------------------------------------------
# FILTER IMPOSSIBLES
#----------------------------------------------------------------------
def filterImposibles(array):
    """Return 1 when at most one of (priceUSD, usdM2, surfaceTotal) is NaN, otherwise NaN.

    With a single missing value the third one can still be derived from the
    other two; with two or more missing the row cannot be completed. A
    non-positive surface is not rejected here.
    """
    priceUSD, usdM2, surfaceTotal = array
    missing = 0
    for value in (surfaceTotal, priceUSD, usdM2):
        if np.isnan(value):
            missing += 1
    return 1 if missing <= 1 else np.nan
#----------------------------------------------------------------------
# FILL PRICE
#----------------------------------------------------------------------
def fillPrice(array):
    """Return priceUSD, or usdM2 * surfaceTotal when the price is NaN and usdM2 is not.

    `array` unpacks to (priceUSD, usdM2, surfaceTotal).
    """
    priceUSD, usdM2, surfaceTotal = array
    if not np.isnan(priceUSD) or np.isnan(usdM2):
        return priceUSD
    return usdM2 * surfaceTotal
#----------------------------------------------------------------------
# FILL M2
#----------------------------------------------------------------------
def fillM2(array):
    """Return usdM2, derived as priceUSD / surfaceTotal when it is NaN and the price is known.

    `array` unpacks to (priceUSD, usdM2, surfaceTotal). A non-positive
    surface always yields NaN.
    """
    priceUSD, usdM2, surfaceTotal = array
    if surfaceTotal <= 0:
        return np.nan
    canDerive = (not np.isnan(priceUSD)) and np.isnan(usdM2)
    return (priceUSD / surfaceTotal) if canDerive else usdM2
#----------------------------------------------------------------------
# FILL SURFACE
#----------------------------------------------------------------------
def fillSurface(array):
    """Return surfaceTotal, derived as priceUSD / usdM2 when the surface is missing.

    `array` unpacks to (priceUSD, usdM2, surfaceTotal). The surface counts as
    missing when it is NaN or non-positive (the same notion filterImposibles
    uses). It is only derived when both the price and a positive usdM2 are
    available; otherwise the original surface is returned unchanged.
    """
    priceUSD, usdM2, surfaceTotal = array
    surfaceMissing = np.isnan(surfaceTotal) or surfaceTotal <= 0
    # The previous condition was copied from fillM2: it tested for a NaN
    # usdM2 and then divided by it, so the result was always NaN and a
    # missing surface was never filled.
    canDerive = (not np.isnan(priceUSD)) and (not np.isnan(usdM2)) and usdM2 > 0
    if surfaceMissing and canDerive:
        return priceUSD / usdM2
    return surfaceTotal
#----------------------------------------------------------------------
# Obtenemos el año y mes del nombre de archivo
#----------------------------------------------------------------------
def addDate(date, df):
    """Add a 'date' column (integer YYYYMM) taken from a data file name.

    Args:
        date: file name whose '-'-separated fields hold the year at position
            2 and the month at position 3, e.g.
            "properati-AR-2017-05-01-properties-sell.csv" -> 201705.
        df: frame to annotate (modified in place).

    Returns:
        `df`, with every row's 'date' set to the parsed value.
    """
    # Parse the argument itself; the previous code read the global `archive`
    # and only worked when the caller's loop variable had that exact name.
    date_splitted = date.split('-')
    year = date_splitted[2]
    month = date_splitted[3]
    # Scalar assignment broadcasts to every row. Assigning a freshly indexed
    # Series would align on index labels and leave NaN in frames whose index
    # has gaps after filtering.
    df['date'] = int(year + month)
    return df
#----------------------------------------------------------------------
# Durante la carga de datos, se eliminan ciertas columnas que nos 
#resultan irrelevantes para el trabajo
#----------------------------------------------------------------------
def filterUnnecesaryColumns(df, deleteId):
    """Drop, in place, every column of `df` that is not listed in columnsToUseInitialy.

    The 'id' column is additionally kept when `deleteId` is false. Returns `df`.
    """
    for column in list(df.columns.values):
        keep = column in columnsToUseInitialy or (column == 'id' and not deleteId)
        if not keep:
            df.drop(column, axis = 1, inplace = True)
    return df
#----------------------------------------------------------------------
# ADD DESCRIPTIONS COLUMNS
#----------------------------------------------------------------------
def addDescriptionColumns(df, columDict):
    """Expand a column of dictionaries into one column per dictionary key.

    Args:
        df: frame to extend (modified in place).
        columDict: name of the column whose cells are dictionaries that all
            share the keys of the first row.

    Returns:
        `df` with a new column for each key. An empty frame is returned
        unchanged, since there is no first row to read the keys from.
    """
    description = list(df[columDict])
    if not description:
        return df
    for key in list(description[0].keys()):
        df[key] = [row[key] for row in description]
    return df
#----------------------------------------------------------------------
# CHANGE PLACE WITH PARENT NAMES COLUMN TO NUMBER
#----------------------------------------------------------------------
def changePlaceWithParentsNamesColumn(df):
    """Replace 'place_with_parent_names' by its PlaceToNumber code, in place.

    A frame without that column is returned untouched.
    """
    column = 'place_with_parent_names'
    if column in df:
        df[column] = [PlaceToNumber(place) for place in df[column].tolist()]
    return df
#----------------------------------------------------------------------
# PLACE TO NUMBER
#----------------------------------------------------------------------
def PlaceToNumber(x):
    """Return 0 when the text of `x` contains 'Capital Federal', otherwise 1."""
    CF, GBA = 0, 1
    return CF if 'Capital Federal' in str(x) else GBA
#----------------------------------------------------------------------
# CHANGE PROPERTY TYPE COLUMN TO NUMBER
#----------------------------------------------------------------------
def changePropertyTypeColumn(df):
    """Replace 'property_type' by its propertyTypeToNumber code, in place, and return `df`."""
    df['property_type'] = [propertyTypeToNumber(kind)
                           for kind in df['property_type'].tolist()]
    return df
#----------------------------------------------------------------------
# PROPERTY TYPE TO NUMBER
#----------------------------------------------------------------------
def propertyTypeToNumber(x):
    """Map a property type name (case-insensitive) to its integer code.

    ph -> 0, apartment/departamento -> 1, house/casa -> 2, store -> 3.
    Any other name yields None.
    """
    codes = {
        "ph": 0,
        "apartment": 1,
        "departamento": 1,
        "house": 2,
        "casa": 2,
        "store": 3,
    }
    return codes.get(x.lower())
#----------------------------------------------------------------------
# AGG FLOOR
#----------------------------------------------------------------------
def aggFloor(floor):
    """Return `floor`, or 0 when it is NaN."""
    return 0 if np.isnan(floor) else floor
#------------------------------------------------------
# CREATE DICC
#------------------------------------------------------
def createDicc(keys, values):
    """Return a dictionary pairing keys[i] with values[i] for every index of `keys`.

    When a key is repeated, the value at its last position wins.
    """
    return {keys[position]: values[position] for position in range(len(keys))}

#------------------------------------------------------
# GET AVG OF VALUES
#------------------------------------------------------
def getAvgOfValues(df, values):
    """Overwrite each column in `values` with a single representative value.

    For every listed column the NaN and zero entries are discarded, the rest
    is sorted, and the element at position len // 2 (the upper median, not
    the arithmetic mean the name suggests) is written to the whole column.
    A column with no usable entries is set to NaN.

    Args:
        df: frame to modify in place.
        values: list of numeric column names.

    Returns:
        `df`.
    """
    for column in values:
        valuesList = df[column].tolist()
        valuesList = [x for x in valuesList if (not np.isnan(x) and x != 0)]
        size = len(valuesList)
        if size == 0:
            df[column] = np.nan
            continue
        valuesList.sort()
        # Floor division: `size/2` is a float on Python 3 and cannot be used
        # as a list index.
        df[column] = valuesList[size // 2]
    return df
#------------------------------------------------------
# GET AVG VALUES BY PLACE
#------------------------------------------------------
def getAvgValuesBykey(df, values, key):
    """Fill the NaN and zero entries of the `values` columns with a per-`key` value.

    For each group of `key`, getAvgOfValues provides the representative value
    of every column in `values`; rows of `df` whose entry is NaN or 0 receive
    the value of their group. `df` is modified in place and returned.

    NOTE(review): this relies on groupby(key).apply(...).reset_index()
    producing a frame that still exposes `key` as a column -- TODO confirm
    against the pandas version in use. A row whose key has no entry in the
    lookup raises KeyError.
    """
    aux = df
    # Per group, every column in `values` is overwritten with the group's
    # representative value (see getAvgOfValues).
    aux = aux.loc[:, values+[key]]\
    .groupby(key).apply(lambda x: getAvgOfValues(x, values)).reset_index()
    for column in values:
        # Lookup: key -> representative value of this column.
        valuesList = aux[column].tolist()
        keyList = aux[key].tolist()
        dicc = createDicc(keyList, valuesList)

        valuesList = df[column].tolist()
        keyList = df[key].tolist()
        size = len(valuesList)
        for i in range(0, size):
            # Entries that already hold a non-NaN, non-zero value are kept.
            if not np.isnan(valuesList[i]) and valuesList[i] != 0:
                continue
            # Special case: in 'Buenos Aires Interior' there is a single
            # property and all of its values are NaN, so the mean of the
            # G.B.A. West and South zone values is used instead.
            if key == 'state_name' and keyList[i] == 'Buenos Aires Interior':
                avgGBAO = dicc['Bs.As. G.B.A. Zona Oeste']
                avgGBAS = dicc['Bs.As. G.B.A. Zona Sur']
                avgBAI = (avgGBAO + avgGBAS) / 2
                valuesList[i] = avgBAI
            else:
                valuesList[i] = dicc[keyList[i]]
        df[column] = valuesList
    return df

# Loop principal del filtrado de cada dataframe del set de entrenamiento

En esta parte del código se filtraron las columnas de los archivos CSV que no íbamos a usar. También se filtraron las propiedades que no pertenecen a Capital Federal o Gran Buenos Aires, ya que vamos a acotar nuestro análisis a estos dos lugares. También fue necesario renombrar la columna de superficie, ya que algunos archivos CSV tenían nombres distintos y es necesario que todos refieran al mismo.
Luego agregamos la columna llamada 'date', la cual refiere a la fecha en la cual fue publicada cada propiedad. Se usó esta fecha ya que la columna 'created_on' no refleja la evolución del precio de las propiedades en función del tiempo y distorsiona el análisis de datos. En cambio, la fecha de publicación muestra la actualización a la fecha de los precios de cada propiedad.

También se filtraron propiedades en función de la validez de sus datos. Es decir, si el valor en dólares por metro cuadrado multiplicado por la superficie difería en más de un 10% del precio en dólares, la propiedad se descartaba.

In [27]:
"""# Ruta de la carpeta con los archivos de datos modificados
root = "./properties/"
indexAcum = 0
for archive in listdir(root):
    if ".csv" not in archive:
        continue
    df = pd.read_csv(root + archive, low_memory = False)
    
    df = df.loc[df.place_with_parent_names.str.contains('Capital Federal') \
        | df.place_with_parent_names.str.contains('Bs.As. G.B.A.'), :]
    
    # En algunos casos, es necesario renombrar algunas columnas
    if 'price_aprox_usd' not in df:
        df.rename(columns = {'price': 'price_aprox_usd'}, inplace = True)
    if 'surface_total_in_m2' not in df:
        df.rename(columns = {'surface_in_m2': 'surface_total_in_m2'}, \
            inplace = True)
    if 'surface_covered_in_m2' not in df and 'surface_total_in_m2' in df:
        df['surface_covered_in_m2'] = df['surface_total_in_m2'].tolist()

    # Durante la carga de datos, se eliminan ciertas columnas que nos 
    # resultan irrelevantes para el trabajo.
    df = filterUnnecesaryColumns(df, True)
        
    # Aquí reconvertimos algunas columnas a punto flotante
    df.loc[:, 'price_aprox_usd'] = df.loc[:, ['price_aprox_usd']]\
    .apply(lambda x: float(x), axis = 1)
    df.loc[:, 'price_usd_per_m2'] = df.loc[:, ['price_usd_per_m2']]\
    .apply(lambda x: float(x), axis = 1)

    # Obtenemos el año y mes del nombre de archivo
    df = addDate(archive, df)
    
    # Aquí aplicamos el filtro antes declarado
    df['filter1'] = df.loc[:, ['price_aprox_usd', 'price_usd_per_m2', \
    'surface_total_in_m2']].apply(lambda x: filterImposibles(x), axis = 1)
    df = df[df['filter1'] == 1]
    df.drop('filter1', axis = 1, inplace = True)
    
    size = len(df.index)
    if size == 0:
        continue
    
    #print "columns: ", list(df.columns.values)
    
    df.loc[:, ['price_aprox_usd']] = df.loc[:, ['price_aprox_usd', \
    'price_usd_per_m2', 'surface_total_in_m2']].apply(lambda x: fillPrice(x), axis = 1)
    
    df.loc[:, ['price_usd_per_m2']] = df.loc[:, ['price_aprox_usd', \
    'price_usd_per_m2', 'surface_total_in_m2']].apply(lambda x: fillM2(x), axis = 1)
    
    df.loc[:, ['surface_total_in_m2']] = df.loc[:, ['price_aprox_usd', \
    'price_usd_per_m2', 'surface_total_in_m2']].apply(lambda x: fillSurface(x), axis = 1)
    
    df['filter2'] = df.loc[:, ['price_aprox_usd', 'price_usd_per_m2', \
    'surface_total_in_m2']].apply(lambda x: filterPercentage(x), axis = 1)
    df = df[df['filter2'] == 1]
    df.drop('filter2', axis = 1, inplace = True)
    
    #-------------------------------------------------------------
    df.loc[:, ['place_with_parent_names']] = df.loc[:, ['place_with_parent_names']]\
    .apply(lambda x: PlaceToNumber(x), axis = 1)
    df = changePropertyTypeColumn(df)
    #-------------------------------------------------------------
    
    # Si el filtrado es tal que me quedo sin dataframe, 
    # entonces salto a la siguiente iteracion
    size = len(df.index)
    if size == 0:
        continue
    
    #Obtengo los campos de descripcion
    df['rooms'] = getRooms(df)
    df = getAvgValuesBykey(df, ['rooms'], 'place_name')
    df['description'] = crear_diccionario_descripcion(df)
    df = addDescriptionColumns(df, 'description')
    df.drop('description', axis = 1, inplace = True)
    
    # Borro registros con algun nan
    df.dropna(axis=0, how='any', subset=list(df.columns.values), inplace=True)
    
    # Agrego distancias
    df = aggSubways(df)
    df = aggUniversities(df)
    df = aggHospitales(df)
    df = aggBusStops(df)
    
    # Finalmente, guardamos los archivos modificados.
    size = len(df.index)
    newIndex = [i for i in range(0, size)]
    df.reindex(newIndex)
    print archive
    df.to_csv("./PreProccess/"+archive, index = True, header = True, \
        sep = ',', encoding = 'utf-8-sig')"""

properati-AR-2017-05-01-properties-sell.csv
properati-AR-2014-01-01-properties-sell.csv
properati-AR-2013-08-01-properties-sell.csv
properati-AR-2015-04-01-properties-sell.csv
properati-AR-2014-02-01-properties-sell.csv
properati-AR-2013-09-01-properties-sell.csv
properati-AR-2016-09-01-properties-sell.csv
properati-AR-2017-01-01-properties-sell.csv
properati-AR-2013-10-01-properties-sell.csv
properati-AR-2015-05-01-properties-sell.csv
properati-AR-2013-12-01-properties-sell.csv
properati-AR-2014-05-01-properties-sell.csv
properati-AR-2014-11-01-properties-sell.csv
properati-AR-2014-09-01-properties-sell.csv
properati-AR-2015-02-01-properties-sell.csv
properati-AR-2016-08-01-properties-sell.csv
properati-AR-2016-01-01-properties-sell.csv
properati-AR-2016-11-01-properties-sell.csv
properati-AR-2015-08-01-properties-sell.csv
properati-AR-2017-03-01-properties-sell.csv
properati-AR-2014-12-01-properties-sell.csv
properati-AR-2015-12-01-properties-sell.csv
properati-AR-2016-06-01-properti

Concateno los archivos del set de entrenamiento previamente filtrados

In [35]:
# Load every preprocessed CSV of ./PreProccess/ and stack them in one frame.
properties = []
root = "./PreProccess/"
for archive in listdir(root):
    if ".csv" not in archive:
        continue
    df = pd.read_csv(root + archive, low_memory = False)
    properties.append(df)

general = pd.concat(properties)
general = deleteTrashColumns(general)
# Save the concatenation as a single CSV.
try:
    general.to_csv("propertiesTrain.csv", index = True, header = True, \
        sep = ',', encoding = 'utf-8-sig')
    print('Done')
except (IOError, OSError) as error:
    # The previous `except value:` named an undefined variable, so any write
    # failure surfaced as a NameError instead of reaching this handler.
    print('Error')
    print(error)

Done


# Proceso de filtrado y acomodamiento de los datos para entrenamiento

Estas funciones se usan para filtrar datos específicos del set de entrenamiento que ya fue concatenado. Además, filtramos el set de datos con las funciones que tenemos a continuación y con las funciones previamente declaradas, para quedarnos con las columnas a evaluar. Tener en cuenta que, como el set de datos a predecir tiene campos importantes (como la superficie, los barrios, etc.) con algunos NaN, es necesario completarlos con datos propios del mismo set de datos. Esto se podría lograr mediante el campo de descripción, que contiene información, pero en muchos casos la información del campo de descripción no es fiable; entonces se usa, por ejemplo para la superficie, la mediana de cada barrio, de manera de equilibrar los valores intermedios.

In [93]:
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# FUNCTIONS OF PROCCES OF TRAIN DATA
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++

#---------------------------------------------------------
# HASH PLACES
#---------------------------------------------------------
def hashPlaces(df, predictDf):
    """Assign a distinct integer code to every place name of both frames.

    Args:
        df: training frame with a 'place_name' column.
        predictDf: test frame with a 'place_name' column.

    Returns:
        dict mapping each non-null place name to a code in 0..n-1 (training
        names first, in sorted order, then the names that only appear in
        `predictDf`), plus the key "no place" mapped to n.
    """
    # Read the names straight from 'place_name'; the previous groupby also
    # selected a 'floor' column that these frames are not guaranteed to have.
    places = []
    for frame in (df, predictDf):
        places.extend(sorted(frame['place_name'].dropna().unique()))

    placesHash = {}
    for place in places:
        if place not in placesHash:
            # Numbering by insertion order keeps the codes deterministic and
            # avoids indexing dict.keys(), which is not a list on Python 3.
            placesHash[place] = len(placesHash)
    placesHash["no place"] = len(placesHash)
    return placesHash
#---------------------------------------------------------
# PROCCES NO PLACES
#---------------------------------------------------------
def processNoPlaces(df):
    """Replace every 'place_name' entry that is not a str by "no place", in place; return `df`."""
    df['place_name'] = [place if type(place) is str else "no place"
                        for place in df['place_name'].tolist()]
    return df
#---------------------------------------------------------
# CNVERT PLACES TO HASH NUMBER
#---------------------------------------------------------
def convertPlacesToHashNumber(df, placesHash):
    """Replace every 'place_name' by its code in `placesHash`, in place; return `df`.

    A name that is not a key of `placesHash` raises KeyError.
    """
    df['place_name'] = [placesHash[name] for name in df['place_name'].tolist()]
    return df
#---------------------------------------------------------
# DELETE EXTRA COLUMNS
#---------------------------------------------------------
def deleteExtraColumns(df):
    """Drop, in place, the columns that are not used for training; return `df`.

    'Unnamed: 0' and 'price_usd_per_m2' are always dropped, together with the
    optional columns enabled below; afterwards any remaining column whose
    name contains "unnamed" is dropped as well.
    """
    alwaysDropped = ['Unnamed: 0', 'price_usd_per_m2']
    # Optional columns. Only 'date' is currently enabled; the description
    # flags, 'rooms', 'place_with_parent_names', 'surface_covered_in_m2' and
    # the distance* columns are the other candidates and are kept for now.
    optionalDropped = ['date']
    df.drop(alwaysDropped + optionalDropped, axis = 1, inplace = True)

    leftovers = [name for name in df.columns.values if "unnamed" in name.lower()]
    for name in leftovers:
        df.drop(name, axis = 1, inplace = True)
    return df
#---------------------------------------------------------
# DELETE TRASH COLUMNS
#---------------------------------------------------------
def deleteTrashColumns(df):
    """Drop, in place, the columns whose name contains "unnamed" (any case) and return `df`.

    NOTE(review): this repeats, unchanged in behaviour, a definition made
    earlier in the notebook.
    """
    for label in list(df.columns.values):
        if "unnamed" not in label.lower():
            continue
        df.drop(label, axis = 1, inplace = True)
    return df

#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# FUNCTIONS OF PROCCES OF TEST DATA
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++

#---------------------------------------------------------
# DELETE COLUMNS TEST DATA
#---------------------------------------------------------
def deleteColumnsTestData(df, columnsToEvaluate):
    """Drop, in place, every column of `df` that is not in `columnsToEvaluate`; return `df`."""
    unwanted = [name for name in df.columns.values if name not in columnsToEvaluate]
    for name in unwanted:
        df.drop(name, axis = 1, inplace = True)
    return df
#---------------------------------------------------------
# PRE PROCCES TEST DATA
#---------------------------------------------------------
def preProcessTestData(df, placesHash):
    """Apply to the test frame the same encodings used for the training data.

    The description text is turned into one flag column per feature, place
    names are replaced by their code in `placesHash` (non-string names become
    "no place" first), and 'place_with_parent_names' and 'property_type' are
    converted to numbers. Returns the processed frame.
    """
    # The dictionaries are built while 'description' still holds the text.
    df['description'] = crear_diccionario_descripcion(df)
    df = convertPlacesToHashNumber(processNoPlaces(df), placesHash)
    # Expand the dictionaries into columns, then discard the helper column.
    df = addDescriptionColumns(df, 'description')
    df.drop('description', axis = 1, inplace = True)
    return changePropertyTypeColumn(changePlaceWithParentsNamesColumn(df))
#---------------------------------------------------------
# FILTER BY DATE
#---------------------------------------------------------
def filterByDate(df, minYear = 2016):
    """Return the rows of `df` whose 'date' (integer YYYYMM) is in `minYear` or later.

    Args:
        df: frame with a numeric 'date' column; it is left untouched.
        minYear: first year to keep. Defaults to 2016, the cutoff that was
            previously hard-coded.

    Returns:
        A new frame with the rows dated before `minYear` removed. Rows whose
        date is NaN are kept.
    """
    years = df['date'] // 100
    # Written as "not older" rather than "newer or equal" so that NaN dates,
    # for which every comparison is False, stay in the result.
    return df[~(years < minYear)]

# Second filter process

In [94]:
# Build the final training set from the concatenated CSV.
df = pd.read_csv("propertiesTrain.csv", low_memory = False)
# Discard the rows dated before the cutoff year (see filterByDate).
df = filterByDate(df)
# The test set is loaded here so that hashPlaces also covers its place names.
predictDf = pd.read_csv("properati_dataset_testing_noprice.csv", low_memory = False)
df = deleteExtraColumns(df)
# Drop every row that still has a NaN in any of the remaining columns.
df.dropna(axis=0, how='any', subset=list(df.columns.values), inplace=True)
# Encode the place names as integers.
placesHash = hashPlaces(df, predictDf)
df = processNoPlaces(df)
df = convertPlacesToHashNumber(df, placesHash)
# index = True also writes the row index as an unnamed first column.
df.to_csv("dataTrain.csv", index = True, header = True, sep = ',', 
          encoding = 'utf-8-sig')

In [95]:
# Reload the training set from disk and remove the "unnamed" columns
# (see deleteTrashColumns).
df = pd.read_csv("dataTrain.csv", low_memory = False)
df = deleteTrashColumns(df)

In [96]:
# Columns of the training frame. `df` comes from the previous cell.
columnsToEvaluate = list(df.columns.values)
# A single formatted string prints the same text on Python 2 and Python 3;
# the print statement used before is a syntax error on Python 3.
print("column size:  %d" % len(columnsToEvaluate))
columnsToEvaluate

column size:  23


['cancha de tenis',
 'club house',
 'cochera',
 'comedor',
 'distanceBusStops',
 'distanceHospitales',
 'distanceSubway',
 'distanceUniversities',
 'futbol 5',
 'lat',
 'living',
 'lon',
 'pileta',
 'piscina',
 'place_name',
 'place_with_parent_names',
 'price_aprox_usd',
 'property_type',
 'rooms',
 'sector de juegos infantiles',
 'seguridad las 24 hs',
 'surface_covered_in_m2',
 'surface_total_in_m2']

# Preprocess test data

In [97]:
# Reload the raw test set, so the steps below start from unmodified data.
predictDf = pd.read_csv("properati_dataset_testing_noprice.csv", low_memory = False)

In [98]:
# Fill the missing/zero coordinates, surfaces and rooms of the test set:
# each group of columns is completed first per neighbourhood ('place_name')
# and then per state ('state_name'), in the same order as before.
fillGroups = [
    ['lat', 'lon'],
    ['surface_total_in_m2'],
    ['surface_covered_in_m2'],
    ['rooms'],
]
for fillColumns in fillGroups:
    for groupKey in ('place_name', 'state_name'):
        predictDf = getAvgValuesBykey(predictDf, fillColumns, groupKey)

# Distance features.
predictDf = aggSubways(predictDf)
predictDf = aggUniversities(predictDf)
predictDf = aggHospitales(predictDf)
predictDf = aggBusStops(predictDf)

# Keep the training columns (minus the target) plus 'id' and 'description'.
# `df` and `placesHash` come from the training cells above.
columnsToEvaluate = list(df.columns.values)
columnsToEvaluate.remove('price_aprox_usd')
predictDf['date'] = [201708 for i in range(0, len(predictDf.index))]
predictDf = deleteColumnsTestData(predictDf, columnsToEvaluate+['id', 'description'])
predictDf = preProcessTestData(predictDf, placesHash)
predictDf = deleteColumnsTestData(predictDf, columnsToEvaluate+['id', 'description'])
predictDf.to_csv("dataTest.csv", index = True, header = True, sep = ',',
          encoding = 'utf-8-sig')
# Parenthesised call: prints the same text on Python 2 and Python 3.
print("Done")

Done


In [99]:
# Columns of the processed test frame. `predictDf` comes from the previous cell.
columnsDataTest = list(predictDf.columns.values)
# A single formatted string prints the same text on Python 2 and Python 3;
# the print statement used before is a syntax error on Python 3.
print("column size:  %d" % len(columnsDataTest))

column size:  23
