### Machine Learning desarrollo práctica final de módulo
V_3.2

In [49]:
# En primer lugar, cargamos las librerías necesarias

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

%matplotlib inline 

# Colour maps: matplotlib's built-in red-blue map (RdBu) and a discrete
# two-colour map made of pure red and pure blue.
cm, cm_bright = plt.cm.RdBu, ListedColormap(['#FF0000', '#0000FF'])

In [50]:
# Load the Airbnb listings CSV (semicolon-separated, '.' as decimal mark)
# into the working dataset, then preview the first five records transposed
# so that every column is shown as a row.
airbnb_Data = pd.read_csv(
    "./Data/airbnb-listings.csv",
    sep=";",
    decimal=".",
)
airbnb_Data.head(n=5).transpose()

Unnamed: 0,0,1,2,3,4
ID,13994948,9474923,6766647,5266080,2476666
Listing Url,https://www.airbnb.com/rooms/13994948,https://www.airbnb.com/rooms/9474923,https://www.airbnb.com/rooms/6766647,https://www.airbnb.com/rooms/5266080,https://www.airbnb.com/rooms/2476666
Scrape ID,20170402075052,20170512105218,20170306202425,20170407214050,20170407214050
Last Scraped,2017-04-02,2017-05-12,2017-03-07,2017-04-08,2017-04-08
Name,Cozy Duplex in De Pijp,Sunny&cozy flat in residential area,"5 Minutes to Downtown, Zilker!",Habitación doble muy tranquila en centro Barce...,thesuites BARCELONA
Summary,An exceptional and quite house with all you ne...,My comfortable apartment is in a quiet neighbo...,"Get everywhere in minutes! Charming, tradition...","Habitación de matrimonio, estupendamente ubica...",A modernist building from the late nineteenth ...
Space,"The house has 2 floors. In the first floor, yo...",Bright living room with open view. Newly renov...,Located on a large tree-covered lot less than ...,"Es tranquilo y bien situado, al estar orienta...",A modernist building from the late nineteenth ...
Description,An exceptional and quite house with all you ne...,My comfortable apartment is in a quiet neighbo...,"Get everywhere in minutes! Charming, tradition...","Habitación de matrimonio, estupendamente ubica...",A modernist building from the late nineteenth ...
Experiences Offered,none,none,none,none,none
Neighborhood Overview,De Pijp is one of the most cosmopolitan and vi...,The new South 'Zuid' is a trendy area with wid...,The Tarrytown neighborhood is best known for i...,"Mi barrio me encanta, porque no es bullicioso ...","The building is located in Central Barcelona, ..."


In [51]:
# Number of samples (rows) and initial dimensionality (columns) of the dataset.
N, d = len(airbnb_Data.index), len(airbnb_Data.columns)
print(f'Numero de muestras actual es de {N} y la dimensionalidad es de {d}')

Numero de muestras actual es de 14780 y la dimensionalidad es de 89


In [52]:
# Drop the columns (not rows) that we do not consider necessary for the
# analysis: URLs, scrape metadata, free-text fields, host details and other
# listing attributes enumerated below.
#
# The result is reassigned instead of using inplace=True, and errors='ignore'
# lets this cell be re-run on an already-trimmed frame without raising
# KeyError. NOTE: errors='ignore' also skips a misspelled column name
# silently, so verify the resulting shape in the next cell.
airbnb_Data = airbnb_Data.drop(
    columns=[
        'Listing Url', 'Scrape ID', 'Last Scraped', 'Name', 'Summary', 'Space',
        'Description', 'Experiences Offered', 'Neighborhood Overview', 'Notes',
        'Transit', 'Access', 'Interaction', 'House Rules',
        'Thumbnail Url', 'Medium Url', 'Picture Url', 'XL Picture Url',
        'Host ID', 'Host URL', 'Host Name', 'Host About', 'Host Thumbnail Url',
        'Host Picture Url', 'Host Neighbourhood', 'Host Verifications',
        'Street', 'Neighbourhood', 'Features', 'Host Location',
        'Neighbourhood Cleansed', 'License', 'Jurisdiction Names',
        'Host Acceptance Rate', 'Amenities', 'Calendar last Scraped',
        'Calendar Updated', 'Host Since', 'State', 'Market', 'Smart Location',
        'Has Availability', 'First Review', 'Last Review', 'Weekly Price',
        'Monthly Price', 'Host Response Time', 'Geolocation',
    ],
    errors='ignore',
)


In [53]:
# Print the current (rows, columns) shape of airbnb_Data.
print(f'Dimensiones del dataset de training: {airbnb_Data.shape}')

Dimensiones del dataset de training: (14780, 41)


In [54]:
# Preview the first five records transposed (one row per remaining column).
airbnb_Data.head(n=5).transpose()

Unnamed: 0,0,1,2,3,4
ID,13994948,9474923,6766647,5266080,2476666
Host Response Rate,100,,100,100,100
Host Listings Count,2,1,1,2,21
Host Total Listings Count,2,1,1,2,21
Neighbourhood Group Cleansed,,,,Sant Martí,Sarrià-Sant Gervasi
City,Amsterdam,Antwerpen,Austin,Barcelona,Barcelona
Zipcode,1074 VM,2018,78703,08018,08021
Country Code,NL,BE,US,ES,ES
Country,Netherlands,Belgium,United States,Spain,Spain
Latitude,52.3527,51.2048,30.2871,41.3947,41.3963


In [55]:
# Check the name of every column (dimension) currently in the dataset.
feature_names = airbnb_Data.columns[:]
feature_names

Index(['ID', 'Host Response Rate', 'Host Listings Count',
       'Host Total Listings Count', 'Neighbourhood Group Cleansed', 'City',
       'Zipcode', 'Country Code', 'Country', 'Latitude', 'Longitude',
       'Property Type', 'Room Type', 'Accommodates', 'Bathrooms', 'Bedrooms',
       'Beds', 'Bed Type', 'Square Feet', 'Price', 'Security Deposit',
       'Cleaning Fee', 'Guests Included', 'Extra People', 'Minimum Nights',
       'Maximum Nights', 'Availability 30', 'Availability 60',
       'Availability 90', 'Availability 365', 'Number of Reviews',
       'Review Scores Rating', 'Review Scores Accuracy',
       'Review Scores Cleanliness', 'Review Scores Checkin',
       'Review Scores Communication', 'Review Scores Location',
       'Review Scores Value', 'Cancellation Policy',
       'Calculated host listings count', 'Reviews per Month'],
      dtype='object')

In [56]:
# Keep only the listings whose City contains "Madrid".
# The downloaded CSV was supposed to contain Madrid data only, but it also
# includes other cities.
#
# The match is a case-insensitive substring test, so values such as
# "madrid", "Comunidad de Madrid" or "Chueca, Madrid" are kept as well.
# na=False treats a missing City as "no match", which removes those rows
# without a separate dropna step.
# .copy() makes the result an independent frame, so later in-place
# modifications do not raise SettingWithCopyWarning.
airbnb_Data = airbnb_Data[
    airbnb_Data["City"].str.contains('Madrid', na=False, case=False)
].copy()

# Count the remaining records grouped by City value.
airbnb_Data["City"].value_counts()

Madrid                                 13207
Madrid, Comunidad de Madrid, ES           20
madrid                                     8
MADRID                                     2
Aravaca (Madrid)                           1
Comunidad de Madrid                        1
Centro, Madrid                             1
las matas  madrid                          1
Chueca, Madrid                             1
Madrid, Comunidad de Madrid, ESPANA        1
Delicias-Madrid                            1
Madrid, Vallecas (Fontarrón)               1
Name: City, dtype: int64

In [57]:
# Print the (rows, columns) shape of airbnb_Data before dropping rows with a missing Price.
print('Datos airbnb antes del borrado de NaN en Price: ', airbnb_Data.shape)

Datos airbnb antes del borrado de NaN en Price:  (13245, 41)


In [58]:
# Price is our target, so records without a price cannot be used.
# Reassign the result instead of using inplace=True: airbnb_Data comes from
# a boolean filter, and dropping in place on such a frame can trigger
# SettingWithCopyWarning on pandas versions without Copy-on-Write.
airbnb_Data = airbnb_Data.dropna(subset=['Price'])

In [59]:
# Print the (rows, columns) shape of airbnb_Data after dropping rows with a missing Price.
print('Datos airbnb después del borrado de NaN en Price: ', airbnb_Data.shape)

Datos airbnb después del borrado de NaN en Price:  (13236, 41)


In [60]:
# Convert the listing surface from square feet to square metres.
# Missing surfaces are replaced by 0 before converting.
# NOTE(review): after this step 0 means "surface unknown", not a real area.
#
# The column is assigned back explicitly: calling fillna(..., inplace=True)
# on the selected column is chained assignment, which warns on recent pandas
# and has no effect on the frame under Copy-on-Write.
#
# NOTE: the column keeps the name 'Square Feet' although it now holds square
# metres, and re-running this cell would apply the conversion again.
FEET_TO_METRES = 0.3048  # applied twice below because the value is an area
airbnb_Data['Square Feet'] = (
    airbnb_Data['Square Feet'].fillna(0) * FEET_TO_METRES * FEET_TO_METRES
)

In [61]:
# Persist the dataset after this first cleaning pass to disk as a
# semicolon-separated CSV, without writing the index as a column.
airbnb_Data.to_csv(
    './Data/airbnb_clean.csv',
    index=False,
    sep=';',
    decimal='.',
)