In [82]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [70]:
# Location of the Airbnb listings CSV. The default is the original local path;
# set the AIRBNB_CSV environment variable to point the notebook at a different
# copy of the file without editing this cell.
DATA_PATH = os.environ.get("AIRBNB_CSV", "G:/Coding/Datasets/airbnb.csv")
df = pd.read_csv(DATA_PATH)

In [71]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,Listings id,Last year reviews,Host since,Host is superhost,Host number of listings,Neighbourhood,Beds number,Bedrooms number,Property type,Maximum allowed guests,...,Communication score,Location score,Value for money score,Reviews per month,City,Season,Bathrooms number,Bathrooms type,Coordinates,Date of scraping
0,31840,6,2011-02-07,Host,44.0,Centro Storico,1.0,1.0,Private room,2,...,4.91,4.91,4.7,0.78,Firenze,Early Winter,1,private,"43.77709, 11.25216",2024-12-15
1,222527,0,2011-07-11,Superhost,3.0,Rifredi,3.0,2.0,Entire home,4,...,4.96,4.6,4.86,1.92,Firenze,Early Winter,1,private,"43.82005, 11.22004",2024-12-15
2,32120,6,2010-03-26,Host,1.0,Gavinana Galluzzo,1.0,1.0,Entire home,2,...,4.96,4.63,4.63,0.16,Firenze,Early Winter,1,private,"43.76157, 11.27741",2024-12-15
3,224562,9,2011-09-16,Host,2.0,Centro Storico,1.0,1.0,Entire home,4,...,4.83,4.93,4.64,0.71,Firenze,Early Winter,1,private,"43.772, 11.26142",2024-12-15
4,32180,11,2014-04-05,Superhost,1.0,Centro Storico,4.0,2.0,Entire home,4,...,4.71,4.81,4.84,0.21,Firenze,Early Winter,2,private,"43.76832, 11.24348",2024-12-15


In [72]:
# Count the missing values in each column.
df.isna().sum()

Listings id                0
Last year reviews          0
Host since                 0
Host is superhost          0
Host number of listings    0
Neighbourhood              0
Beds number                0
Bedrooms number            0
Property type              0
Maximum allowed guests     0
Price                      0
Total reviews              0
Rating score               0
Accuracy score             0
Cleanliness score          0
Checkin score              0
Communication score        0
Location score             0
Value for money score      0
Reviews per month          0
City                       0
Season                     0
Bathrooms number           0
Bathrooms type             0
Coordinates                0
Date of scraping           0
dtype: int64

In [73]:
# Count fully duplicated rows (every occurrence after the first is flagged).
df.duplicated(keep='first').sum()

0

In [74]:
# Inspect each column's dtype and non-null count to decide which columns need conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 282047 entries, 0 to 282046
Data columns (total 26 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Listings id              282047 non-null  int64  
 1   Last year reviews        282047 non-null  int64  
 2   Host since               282047 non-null  object 
 3   Host is superhost        282047 non-null  object 
 4   Host number of listings  282047 non-null  float64
 5   Neighbourhood            282047 non-null  object 
 6   Beds number              282047 non-null  float64
 7   Bedrooms number          282047 non-null  float64
 8   Property type            282047 non-null  object 
 9   Maximum allowed guests   282047 non-null  int64  
 10  Price                    282047 non-null  float64
 11  Total reviews            282047 non-null  int64  
 12  Rating score             282047 non-null  float64
 13  Accuracy score           282047 non-null  float64
 14  Clea

In [75]:
# Now that the dtypes are known, drop the columns that will not be used for prediction:
# 'Listings id' and 'Date of scraping'.
#
# The result is assigned back instead of using inplace=True, and errors="ignore" is passed,
# so re-running this cell after the columns are already gone is a no-op rather than a KeyError.
COLUMNS_TO_DROP = ['Listings id', 'Date of scraping']
df = df.drop(columns=COLUMNS_TO_DROP, errors="ignore")

In [76]:
# Preview the first five rows again to confirm the remaining columns.
df.head(n=5)

Unnamed: 0,Last year reviews,Host since,Host is superhost,Host number of listings,Neighbourhood,Beds number,Bedrooms number,Property type,Maximum allowed guests,Price,...,Checkin score,Communication score,Location score,Value for money score,Reviews per month,City,Season,Bathrooms number,Bathrooms type,Coordinates
0,6,2011-02-07,Host,44.0,Centro Storico,1.0,1.0,Private room,2,89.0,...,4.85,4.91,4.91,4.7,0.78,Firenze,Early Winter,1,private,"43.77709, 11.25216"
1,0,2011-07-11,Superhost,3.0,Rifredi,3.0,2.0,Entire home,4,300.0,...,4.99,4.96,4.6,4.86,1.92,Firenze,Early Winter,1,private,"43.82005, 11.22004"
2,6,2010-03-26,Host,1.0,Gavinana Galluzzo,1.0,1.0,Entire home,2,95.0,...,4.84,4.96,4.63,4.63,0.16,Firenze,Early Winter,1,private,"43.76157, 11.27741"
3,9,2011-09-16,Host,2.0,Centro Storico,1.0,1.0,Entire home,4,60.0,...,4.76,4.83,4.93,4.64,0.71,Firenze,Early Winter,1,private,"43.772, 11.26142"
4,11,2014-04-05,Superhost,1.0,Centro Storico,4.0,2.0,Entire home,4,105.0,...,4.81,4.71,4.81,4.84,0.21,Firenze,Early Winter,2,private,"43.76832, 11.24348"


In [77]:
# Parse the 'Host since' strings into pandas datetime values.
df['Host since'] = df['Host since'].pipe(pd.to_datetime)

In [78]:
# Host tenure in whole days, measured from 'Host since' up to the moment this cell runs.
# NOTE(review): the reference point is the current date, so the values change from run
# to run — confirm this is intended rather than a fixed reference date.
host_age = pd.to_datetime('today') - df['Host since']
df['Host tenure'] = host_age.dt.days

In [79]:
# Show only the columns that are still stored with the generic object dtype.
df.select_dtypes(include='object')

Unnamed: 0,Host is superhost,Neighbourhood,Property type,City,Season,Bathrooms type,Coordinates
0,Host,Centro Storico,Private room,Firenze,Early Winter,private,"43.77709, 11.25216"
1,Superhost,Rifredi,Entire home,Firenze,Early Winter,private,"43.82005, 11.22004"
2,Host,Gavinana Galluzzo,Entire home,Firenze,Early Winter,private,"43.76157, 11.27741"
3,Host,Centro Storico,Entire home,Firenze,Early Winter,private,"43.772, 11.26142"
4,Superhost,Centro Storico,Entire home,Firenze,Early Winter,private,"43.76832, 11.24348"
...,...,...,...,...,...,...,...
282042,Host,Santa Croce,Entire home,Venezia,Early Autumn,private,"45.44175, 12.32484"
282043,Host,Dorsoduro,Entire home,Venezia,Early Autumn,private,"45.43524874231471, 12.3209960013628"
282044,Host,Piave 1860,Entire home,Venezia,Early Autumn,private,"45.4859248, 12.2292505"
282045,Host,San Lorenzo XXV Aprile,Entire home,Venezia,Early Autumn,private,"45.49259973446829, 12.244685252349434"


In [80]:
# List the distinct values of 'Property type' in order of first appearance.
pd.unique(df['Property type'])

array(['Private room', 'Entire home', 'Hotel room', 'Shared room'],
      dtype=object)