# Imports & Data

In [1]:
import pandas as pd
import numpy as np
import geopandas


### Data

In [2]:
data_d = pd.read_csv('data/Airbnb_Mx_December.csv.gz')
data_s = pd.read_csv('data/Airbnb_Mx_September.csv.gz')
data_d.head(2)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,description,neighborhood_overview,picture_url,host_id,host_url,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,35797,https://www.airbnb.com/rooms/35797,20211225071008,2021-12-26,Villa Dante,"Dentro de Villa un estudio de arte con futon, ...","Centro comercial Santa Fe, parque interlomas y...",https://a0.muscache.com/pictures/f395ab78-1185...,153786,https://www.airbnb.com/users/show/153786,...,,,,,f,1,1,0,0,
1,44616,https://www.airbnb.com/rooms/44616,20211225071008,2021-12-25,CONDESA HAUS B&B,A new concept of hosting in mexico through a b...,,https://a0.muscache.com/pictures/251410/ec75fe...,196253,https://www.airbnb.com/users/show/196253,...,4.76,4.98,4.45,,f,10,2,1,0,0.41


In [3]:
data_s.head(2)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,description,neighborhood_overview,picture_url,host_id,host_url,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,35797,https://www.airbnb.com/rooms/35797,20210928184726,2021-09-30,Villa Dante,"Dentro de Villa un estudio de arte con futon, ...","Centro comercial Santa Fe, parque interlomas y...",https://a0.muscache.com/pictures/f395ab78-1185...,153786,https://www.airbnb.com/users/show/153786,...,,,,,f,1,1,0,0,
1,44616,https://www.airbnb.com/rooms/44616,20210928184726,2021-09-29,CONDESA HAUS B&B,A new concept of hosting in mexico through a b...,,https://a0.muscache.com/pictures/251410/ec75fe...,196253,https://www.airbnb.com/users/show/196253,...,4.76,4.98,4.45,,f,10,2,1,0,4.26


#### Sum DataFrames

In [4]:
frames = [data_d, data_s]
  
data = pd.concat(frames)
data.head(2)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,description,neighborhood_overview,picture_url,host_id,host_url,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,35797,https://www.airbnb.com/rooms/35797,20211225071008,2021-12-26,Villa Dante,"Dentro de Villa un estudio de arte con futon, ...","Centro comercial Santa Fe, parque interlomas y...",https://a0.muscache.com/pictures/f395ab78-1185...,153786,https://www.airbnb.com/users/show/153786,...,,,,,f,1,1,0,0,
1,44616,https://www.airbnb.com/rooms/44616,20211225071008,2021-12-25,CONDESA HAUS B&B,A new concept of hosting in mexico through a b...,,https://a0.muscache.com/pictures/251410/ec75fe...,196253,https://www.airbnb.com/users/show/196253,...,4.76,4.98,4.45,,f,10,2,1,0,0.41


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38818 entries, 0 to 19162
Data columns (total 74 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            38818 non-null  int64  
 1   listing_url                                   38818 non-null  object 
 2   scrape_id                                     38818 non-null  int64  
 3   last_scraped                                  38818 non-null  object 
 4   name                                          38812 non-null  object 
 5   description                                   36451 non-null  object 
 6   neighborhood_overview                         25788 non-null  object 
 7   picture_url                                   38818 non-null  object 
 8   host_id                                       38818 non-null  int64  
 9   host_url                                      38818 non-null 

In [6]:
(data[['neighbourhood_group_cleansed','bathrooms', 'calendar_updated','license']]).head(4)

Unnamed: 0,neighbourhood_group_cleansed,bathrooms,calendar_updated,license
0,,,,
1,,,,
2,,,,
3,,,,


In [7]:
data.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'description',
       'neighborhood_overview', 'picture_url', 'host_id', 'host_url',
       'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'calendar_upd

For the next step, we'll remove a lot of the unnecessary columns, as they just take up space and I don't think they're useful for the model.

In [8]:
# columns we are going to drop
col= ['listing_url', 'scrape_id', 'last_scraped', 'name', 'description',
       'amenities', 'picture_url', 'host_id', 'host_url',
       'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count', 'neighborhood_overview',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood', 'neighbourhood_group_cleansed','bathrooms',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'calendar_updated', 'has_availability',
       'availability_30', 'availability_60', 'availability_90',
       'availability_365', 'calendar_last_scraped',
       'number_of_reviews_ltm', 'number_of_reviews_l30d', 'first_review',
       'last_review', 'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'license', 'instant_bookable',
       'calculated_host_listings_count',
       'calculated_host_listings_count_entire_homes',
       'calculated_host_listings_count_private_rooms',
       'calculated_host_listings_count_shared_rooms', 'reviews_per_month']

In [9]:
df_airbnb = data.drop(col, axis=1)
df_airbnb.head(4)

Unnamed: 0,id,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms_text,bedrooms,beds,price,number_of_reviews
0,35797,Cuajimalpa de Morelos,19.38283,-99.27178,Entire villa,Entire home/apt,2,1 bath,1.0,1.0,"$4,123.00",0
1,44616,Cuauhtémoc,19.41162,-99.17794,Entire residential home,Entire home/apt,14,5.5 baths,5.0,8.0,"$15,000.00",50
2,56074,Cuauhtémoc,19.43977,-99.15605,Entire condominium (condo),Entire home/apt,2,1 bath,1.0,1.0,$600.00,66
3,61792,Cuauhtémoc,19.41083,-99.18057,Private room in residential home,Private room,2,1 private bath,1.0,1.0,"$1,237.00",53


In the dataframe, the bathroom column is empty, so we only have the bathroom s_text column, which we will convert to a more manageable format.

In [10]:
df_airbnb['bathrooms_text'] = df_airbnb['bathrooms_text'].str.extract("(\d*\.?\d+)", expand=True).astype('float')
df_airbnb.rename(columns = {'bathrooms_text':'bathrooms'}, inplace = True)
df_airbnb.head(10)

Unnamed: 0,id,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,price,number_of_reviews
0,35797,Cuajimalpa de Morelos,19.38283,-99.27178,Entire villa,Entire home/apt,2,1.0,1.0,1.0,"$4,123.00",0
1,44616,Cuauhtémoc,19.41162,-99.17794,Entire residential home,Entire home/apt,14,5.5,5.0,8.0,"$15,000.00",50
2,56074,Cuauhtémoc,19.43977,-99.15605,Entire condominium (condo),Entire home/apt,2,1.0,1.0,1.0,$600.00,66
3,61792,Cuauhtémoc,19.41083,-99.18057,Private room in residential home,Private room,2,1.0,1.0,1.0,"$1,237.00",53
4,67703,Cuauhtémoc,19.41152,-99.16857,Entire rental unit,Entire home/apt,4,1.0,2.0,3.0,"$1,959.00",39
5,70644,Coyoacán,19.35448,-99.16217,Entire rental unit,Entire home/apt,2,1.0,1.0,1.0,"$1,606.00",106
6,107078,Miguel Hidalgo,19.43211,-99.19327,Entire loft,Entire home/apt,2,1.5,1.0,1.0,"$3,878.00",10
7,131610,Coyoacán,19.35416,-99.16488,Private room,Private room,2,1.0,1.0,1.0,"$1,403.00",0
8,165772,Miguel Hidalgo,19.40826,-99.18659,Entire residential home,Entire home/apt,14,4.5,4.0,9.0,"$4,125.00",273
9,171109,Benito Juárez,19.39675,-99.17581,Private room in rental unit,Private room,2,1.0,1.0,1.0,$299.00,76


We remove the $ symbol from the price column

In [11]:
df_airbnb['price'] = df_airbnb['price'].str.replace(r"[\$\,]", "").astype(float).round().astype(int)
df_airbnb.head(10)

  df_airbnb['price'] = df_airbnb['price'].str.replace(r"[\$\,]", "").astype(float).round().astype(int)


Unnamed: 0,id,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,price,number_of_reviews
0,35797,Cuajimalpa de Morelos,19.38283,-99.27178,Entire villa,Entire home/apt,2,1.0,1.0,1.0,4123,0
1,44616,Cuauhtémoc,19.41162,-99.17794,Entire residential home,Entire home/apt,14,5.5,5.0,8.0,15000,50
2,56074,Cuauhtémoc,19.43977,-99.15605,Entire condominium (condo),Entire home/apt,2,1.0,1.0,1.0,600,66
3,61792,Cuauhtémoc,19.41083,-99.18057,Private room in residential home,Private room,2,1.0,1.0,1.0,1237,53
4,67703,Cuauhtémoc,19.41152,-99.16857,Entire rental unit,Entire home/apt,4,1.0,2.0,3.0,1959,39
5,70644,Coyoacán,19.35448,-99.16217,Entire rental unit,Entire home/apt,2,1.0,1.0,1.0,1606,106
6,107078,Miguel Hidalgo,19.43211,-99.19327,Entire loft,Entire home/apt,2,1.5,1.0,1.0,3878,10
7,131610,Coyoacán,19.35416,-99.16488,Private room,Private room,2,1.0,1.0,1.0,1403,0
8,165772,Miguel Hidalgo,19.40826,-99.18659,Entire residential home,Entire home/apt,14,4.5,4.0,9.0,4125,273
9,171109,Benito Juárez,19.39675,-99.17581,Private room in rental unit,Private room,2,1.0,1.0,1.0,299,76


## 

In [12]:
airbnb = geopandas.GeoDataFrame(df_airbnb,
           geometry=geopandas.points_from_xy(
           df_airbnb.longitude, df_airbnb.latitude))

airbnb.head(8)                  

Unnamed: 0,id,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,price,number_of_reviews,geometry
0,35797,Cuajimalpa de Morelos,19.38283,-99.27178,Entire villa,Entire home/apt,2,1.0,1.0,1.0,4123,0,POINT (-99.27178 19.38283)
1,44616,Cuauhtémoc,19.41162,-99.17794,Entire residential home,Entire home/apt,14,5.5,5.0,8.0,15000,50,POINT (-99.17794 19.41162)
2,56074,Cuauhtémoc,19.43977,-99.15605,Entire condominium (condo),Entire home/apt,2,1.0,1.0,1.0,600,66,POINT (-99.15605 19.43977)
3,61792,Cuauhtémoc,19.41083,-99.18057,Private room in residential home,Private room,2,1.0,1.0,1.0,1237,53,POINT (-99.18057 19.41083)
4,67703,Cuauhtémoc,19.41152,-99.16857,Entire rental unit,Entire home/apt,4,1.0,2.0,3.0,1959,39,POINT (-99.16857 19.41152)
5,70644,Coyoacán,19.35448,-99.16217,Entire rental unit,Entire home/apt,2,1.0,1.0,1.0,1606,106,POINT (-99.16217 19.35448)
6,107078,Miguel Hidalgo,19.43211,-99.19327,Entire loft,Entire home/apt,2,1.5,1.0,1.0,3878,10,POINT (-99.19327 19.43211)
7,131610,Coyoacán,19.35416,-99.16488,Private room,Private room,2,1.0,1.0,1.0,1403,0,POINT (-99.16488 19.35416)


In [13]:
airbnb = airbnb.drop(['latitude','longitude'], axis=1)
airbnb.columns

Index(['id', 'neighbourhood_cleansed', 'property_type', 'room_type',
       'accommodates', 'bathrooms', 'bedrooms', 'beds', 'price',
       'number_of_reviews', 'geometry'],
      dtype='object')

In [31]:
airbnb.head()

Unnamed: 0,id,neighbourhood_cleansed,property_type,room_type,accommodates,bathrooms,bedrooms,beds,price,number_of_reviews,geometry
0,35797,Cuajimalpa de Morelos,Entire villa,Entire home/apt,2,1.0,1.0,1.0,4123,0,POINT (-99.27178 19.38283)
1,44616,Cuauhtémoc,Entire residential home,Entire home/apt,14,5.5,5.0,8.0,15000,50,POINT (-99.17794 19.41162)
2,56074,Cuauhtémoc,Entire condominium (condo),Entire home/apt,2,1.0,1.0,1.0,600,66,POINT (-99.15605 19.43977)
3,61792,Cuauhtémoc,Private room in residential home,Private room,2,1.0,1.0,1.0,1237,53,POINT (-99.18057 19.41083)
4,67703,Cuauhtémoc,Entire rental unit,Entire home/apt,4,1.0,2.0,3.0,1959,39,POINT (-99.16857 19.41152)


In [26]:
airbnb_2 = airbnb.drop(["property_type","room_type","neighbourhood_cleansed"], axis=1)

In [39]:
airbnb_2.isna().sum()

id                      0
accommodates            0
bathrooms             143
bedrooms             1427
beds                  853
price                   0
number_of_reviews       0
geometry                0
dtype: int64

In [41]:
airbnb_2.dropna(subset=["bathrooms","bedrooms","beds"],inplace=True)

In [42]:
airbnb_2.isna().sum()

id                   0
accommodates         0
bathrooms            0
bedrooms             0
beds                 0
price                0
number_of_reviews    0
geometry             0
dtype: int64

# Model

In [43]:
from sklearn.model_selection import train_test_split


X = airbnb_2.drop(columns=["id", "price","geometry"])
y = airbnb_2["price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                   test_size=0.20, random_state=97)

In [45]:
from sklearn.linear_model import LinearRegression

Train_model = LinearRegression()
Train_model.fit(X_train, y_train)

R_2 = Train_model.score(X_train, y_train)
R_2

0.05482022007822507

In [55]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

cur_one_hot = OneHotEncoder(categories="auto")
cur_pipeline = Pipeline([
    ("one_hot", cur_one_hot),
    ("lin_reg", LinearRegression())
])

cur_pipeline.fit(X_train, y_train)
R_2 = cur_pipeline.score(X_train, y_train)
R_2

0.1425741769061175

The model is pretty bad XD, it's a future project