# 6.1 SOurcing Open Data: Airbnb Rome

### 1.Import libraries and datasets
### 2.Data Cleaning
### 3.Data Understanding
### 4.Export

## 1. Import libraries and datasets

In [334]:
import pandas as pd
import numpy as np
import os

In [335]:
# Create a path
path = r'/Users/fatemehshahvirdi/Work-Related/Data Analysis/Data Immersion/Achievement 6/Rome & Sydney Analysis/O2 Data'

In [336]:
# Import Rome dataset
df_rome_june = pd.read_csv(os.path.join(path, 'Original', 'Rome', '10 June 2023', 'listings.csv'))

## 2. Data cleaning

In [337]:
df_rome_june.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
0,2737,Place to stay in Rome · ★4.80 · 1 bedroom · 1 ...,3047,Elif,,VIII Appia Antica,41.87136,12.48215,Private room,50,7,7,2015-05-28,0.07,6,358,0,
1,181189,Bed and breakfast in Rome · ★4.72 · 1 bedroom ...,868477,Luigi,,I Centro Storico,41.90963,12.45603,Hotel room,100,3,147,2023-06-05,1.03,8,175,6,
2,11834,Rental unit in Rome · ★4.80 · 1 bedroom · 1 be...,44552,Serena,,I Centro Storico,41.895447,12.491181,Entire home/apt,105,2,191,2023-06-02,1.3,1,246,44,
3,49240,Condo in Rome · ★4.69 · 2 bedrooms · 5 beds · ...,224479,Ermanno,,I Centro Storico,41.89738,12.476,Entire home/apt,294,2,70,2023-04-11,0.46,2,151,32,
4,181747,Rental unit in Rome · ★4.56 · 2 bedrooms · 3 b...,871063,Lorenzo,,I Centro Storico,41.90183,12.50397,Entire home/apt,228,2,85,2023-06-09,0.63,2,329,42,


In [338]:
df_rome_june.shape

(26256, 18)

In [339]:
# check all 18 column's names
df_rome_june.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'number_of_reviews_ltm', 'license'],
      dtype='object')

In [340]:
# check the null values
df_rome_june.isnull().sum()

id                                    0
name                                  0
host_id                               0
host_name                             2
neighbourhood_group               26256
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                        3457
reviews_per_month                  3457
calculated_host_listings_count        0
availability_365                      0
number_of_reviews_ltm                 0
license                           22048
dtype: int64

#### The spell of 'neighbourhood' is incorrect, I will change it to 'neighborhood'
#### and the column 'number_of_reviews_ltm' is unclear (ltm = last 12 month), I also change it to 'num_reviews_last_12m'

In [377]:
# Rename the columns
df_rome_june.rename(columns={
    'neighbourhood': 'neighborhood', 'number_of_reviews_ltm': 'num_reviews_last_12m' , 
    'number_of_reviews': 'num_reviews', 'minimum_nights':'min_nights'
} , inplace= True)

#### 'neighbourhood_group' is all empty, and the 'license' column also is mostly empty and the information it contains is not necessary for the analysis,I also don't need the 'host_name' so I'll remove these 2 columns

In [378]:
columns_to_keep = ['id', 'name', 'host_id',
       'neighborhood', 'latitude', 'longitude', 'room_type', 'price',
       'min_nights', 'num_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'num_reviews_last_12m']
df_rome_june_selected = df_rome_june[columns_to_keep]

In [379]:
df_rome_june_selected.columns

Index(['id', 'name', 'host_id', 'neighborhood', 'latitude', 'longitude',
       'room_type', 'price', 'min_nights', 'num_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'num_reviews_last_12m'],
      dtype='object')

In [380]:
# check how many observations in 'num_reviews' are equal to 0
num_zero_reviews = (df_rome_june_selected['num_reviews'] == 0).sum()
print("Number of rows where 'num_reviews' is equal to 0:", num_zero_reviews)

Number of rows where 'num_reviews' is equal to 0: 3457


#### The columns 'last_review' and 'reviews_per_month', along with the count of rows where 'number_of_reviews' equals 0, all exhibit the same count of missing values: 3457. Upon closer inspection of the dataset, I observed that these missing values are correlated with instances where 'number_of_reviews' is 0. Initially, I considered creating a flag for this pattern, but upon reflection, I decided against it. The '0' value in 'number_of_reviews' already serves as an implicit flag, indicating observations without any reviews.

In [381]:
df_rome_june_selected.dtypes

id                                  int64
name                               object
host_id                             int64
neighborhood                       object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
min_nights                          int64
num_reviews                         int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
num_reviews_last_12m                int64
dtype: object

In [382]:
# change the data type of the 'last_review' to datetime
df_rome_june_selected.loc[:, 'last_review'] = pd.to_datetime(df_rome_june_selected['last_review'])

In [383]:
df_rome_june_selected.dtypes

id                                  int64
name                               object
host_id                             int64
neighborhood                       object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
min_nights                          int64
num_reviews                         int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
num_reviews_last_12m                int64
dtype: object

In [348]:
# check fo mixed data types
for col in df_rome_june_selected.columns.tolist():
  weird = (df_rome_june_selected[[col]].applymap(type) != df_rome_june_selected[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_rome_june_selected[weird]) > 0:
    print (col)

last_review


  weird = (df_rome_june_selected[[col]].applymap(type) != df_rome_june_selected[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_rome_june_selected[[col]].applymap(type) != df_rome_june_selected[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_rome_june_selected[[col]].applymap(type) != df_rome_june_selected[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_rome_june_selected[[col]].applymap(type) != df_rome_june_selected[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_rome_june_selected[[col]].applymap(type) != df_rome_june_selected[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_rome_june_selected[[col]].applymap(type) != df_rome_june_selected[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_rome_june_selected[[col]].applymap(type) != df_rome_june_selected[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_rome_june_selected[[col]].applymap(type) != df_rome_june_selected[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_rome_june_

In [384]:
# Convert 'last_review' column to datetime data type with errors='coerce'
parsed_dates = pd.to_datetime(df_rome_june_selected['last_review'], errors='coerce')

# Find the unique data types in the parsed dates
unique_data_types = parsed_dates.apply(type).unique()

print("Unique data types in the 'last_review' column after parsing:")
for data_type in unique_data_types:
    print(data_type)

Unique data types in the 'last_review' column after parsing:
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.nattype.NaTType'>


In [385]:
# Convert 'last_review' column to datetime data type with errors='coerce'
parsed_dates = pd.to_datetime(df_rome_june_selected['last_review'], errors='coerce')

# Count the number of NaTType values
num_nat_values = parsed_dates.isna().sum()

print("Number of NaTType values in the 'last_review' column:", num_nat_values)

Number of NaTType values in the 'last_review' column: 3457


#### I checked it and the other data type is for the missing data, I have now 3 options:
#### Remove them, impute them, or create a flag
#### for the reason I explained above (The '0' value in 'number_of_reviews' already serves as an implicit flag, indicating observations without any reviews.) I don't take any action.

In [386]:
# check the duplicates
df_dups = df_rome_june_selected[df_rome_june_selected.duplicated()]

In [387]:
df_rome_june_selected.columns

Index(['id', 'name', 'host_id', 'neighborhood', 'latitude', 'longitude',
       'room_type', 'price', 'min_nights', 'num_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'num_reviews_last_12m'],
      dtype='object')

# 3. Data Understanding

In [388]:
# basic descriptive analysis
df_rome_june_selected[['price', 'min_nights', 'num_reviews',
       'last_review', 'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'num_reviews_last_12m']].describe()

Unnamed: 0,price,min_nights,num_reviews,reviews_per_month,calculated_host_listings_count,availability_365,num_reviews_last_12m
count,26256.0,26256.0,26256.0,22799.0,26256.0,26256.0,26256.0
mean,230.828344,3.455629,53.935977,1.665726,8.712294,181.439328,15.360718
std,1345.561287,18.371505,86.204971,1.672183,27.598178,116.643099,20.701418
min,0.0,1.0,0.0,0.01,1.0,0.0,0.0
25%,85.0,1.0,3.0,0.37,1.0,78.0,0.0
50%,134.0,2.0,18.0,1.11,2.0,191.0,6.0
75%,211.0,3.0,67.0,2.48,6.0,285.0,24.0
max,90963.0,999.0,1525.0,35.94,248.0,365.0,621.0


####  The minimum price of $0 raises concerns about the presence of missing or erroneous data, as it's unusual for accommodations to be offered for free.

In [389]:
num_zero_prices = (df_rome_june_selected['price'] == 0).sum()
num_zero_prices

5

#### there are only 5 observation, I leave it be, it can also be somekind of coucg surfing!

In [390]:
num_max_nights = (df_rome_june_selected['min_nights'] >= 30).sum()
num_max_nights

354

In [392]:
# Count the number of unique entries and their counts in the 'minimum_nights' column
minimum_nights_counts = df_rome_june_selected['min_nights'].value_counts()

# Count the number of unique entries in the 'minimum_nights' column
num_unique_min_nights = len(minimum_nights_counts)

print("Number of unique entries in the 'minimum_nights' column:", num_unique_min_nights)
print("Counts of each unique entry in the 'minimum_nights' column:")
print(minimum_nights_counts)

Number of unique entries in the 'minimum_nights' column: 54
Counts of each unique entry in the 'minimum_nights' column:
min_nights
1      9771
2      8256
3      5491
4       956
5       524
7       382
30      187
6       151
10       87
15       79
14       57
28       48
90       40
60       33
20       27
365      22
25       13
29       13
8        12
21       11
31       11
150       9
12        7
180       6
13        5
50        5
100       5
120       4
18        4
16        3
999       3
364       3
56        3
300       2
45        2
360       2
99        2
9         2
200       2
55        2
350       1
32        1
40        1
19        1
75        1
400       1
27        1
170       1
85        1
153       1
720       1
80        1
140       1
11        1
Name: count, dtype: int64


#### There are some unusual values in the data, such as:
#### gap between the mean and median prices. I am going to investigate this by examining the number of values above the third quartile.(outliers)
#### mean 230.828344
#### median 134.000000

#### listings where the minimum stay time is 720 or 999 days. I want to investigate these entries.
#### I want to see the listings with a minimum stay time of more than 336 nights. Renting a fully equipped flat or room for a year through Airbnb isn't unusual, but for periods longer than a year, people typically prefer to rent from agencies with different types of contracts.

In [393]:
# check the 'price' column
columns_to_display = ['neighborhood',
       'room_type', 'price', 'min_nights', 'num_reviews',
       'last_review', 'reviews_per_month','calculated_host_listings_count','availability_365', 'num_reviews_last_12m']

mean = 230.828344
std_dev = 1345.561287

lower_bound = mean - 3 * std_dev
upper_bound = mean + 3 * std_dev

lower_bound, upper_bound
outliers = df_rome_june_selected[(df_rome_june_selected['price'] < lower_bound) | (df_rome_june_selected['price'] > upper_bound)]
outliers.shape

(73, 15)

#### we only have 73 outliers an it is aroun 0.2% of whole observation, I decide to delete them.

In [394]:
# Filter the DataFrame to exclude outliers and overwrite the original DataFrame
df_rome_june_selected = df_rome_june_selected[
(df_rome_june_selected['price'] >= lower_bound) & (df_rome_june_selected['price'] <= upper_bound)]

In [395]:
# Filter the DataFrame to exclude outliers and overwrite the original DataFrame
df_rome_june_selected.shape

(26183, 15)

#### Previously we had 26256 observations and now there are 26183, and the 73 outliers are successfully removed.

In [396]:
df_rome_june_selected[['price', 'min_nights', 'num_reviews',
       'last_review', 'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'num_reviews_last_12m']].describe()

Unnamed: 0,price,min_nights,num_reviews,reviews_per_month,calculated_host_listings_count,availability_365,num_reviews_last_12m
count,26183.0,26183.0,26183.0,22751.0,26183.0,26183.0,26183.0
mean,189.927472,3.431043,54.056602,1.668243,8.708551,181.621777,15.397548
std,230.562618,18.123463,86.288568,1.672789,27.555454,116.552622,20.716457
min,0.0,1.0,0.0,0.01,1.0,0.0,0.0
25%,85.0,1.0,3.0,0.37,1.0,78.0,0.0
50%,133.0,2.0,18.0,1.12,2.0,191.0,6.0
75%,210.0,3.0,67.0,2.49,6.0,285.0,24.0
max,4255.0,999.0,1525.0,35.94,248.0,365.0,621.0


#### The same problem is seen here too in 'calculated_host_listings_count', mean is 8.7 and the median is 2, and the max is 248! I need to find the outliers and the number of them.

In [397]:
# check the 'calculated_host_listings_count' column
columns_to_display = ['neighborhood',
       'room_type', 'price', 'min_nights', 'num_reviews',
       'last_review', 'reviews_per_month','calculated_host_listings_count','availability_365', 'num_reviews_last_12m']

mean_chlc = 8.708551
std_dev_chlc = 27.555454	

lower_bound_chlc = mean_chlc - 3 * std_dev_chlc
upper_bound_chlc = mean_chlc + 3 * std_dev_chlc

lower_bound_chlc, upper_bound_chlc
outliers_chlc = df_rome_june_selected[(df_rome_june_selected['calculated_host_listings_count'] < lower_bound_chlc)
| (df_rome_june_selected['calculated_host_listings_count'] > upper_bound_chlc)]
outliers_chlc.shape

(402, 15)

#### There are 402 outliers which are 1.5% of whole data, so I remove them as well.

In [398]:
# Filter the DataFrame to exclude outliers and overwrite the original DataFrame
df_rome_june_selected = df_rome_june_selected[
(df_rome_june_selected['calculated_host_listings_count'] >= lower_bound_chlc) & 
(df_rome_june_selected['calculated_host_listings_count'] <= upper_bound_chlc)]

In [399]:
df_rome_june_selected.shape

(25781, 15)

#### Now we have 25781 observations, which is 402 less than 25183 observations before, so the outliers are removed.

In [401]:
df_rome_june_selected[['price', 'min_nights', 'num_reviews',
       'last_review', 'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'num_reviews_last_12m']].describe()

Unnamed: 0,price,min_nights,num_reviews,reviews_per_month,calculated_host_listings_count,availability_365,num_reviews_last_12m
count,25781.0,25781.0,25781.0,22367.0,25781.0,25781.0,25781.0
mean,187.958691,3.456964,54.315969,1.667172,5.533998,182.874636,15.389473
std,229.156222,18.262929,86.792153,1.679109,9.131858,116.437846,20.794682
min,0.0,1.0,0.0,0.01,1.0,0.0,0.0
25%,85.0,1.0,3.0,0.37,1.0,79.0,0.0
50%,132.0,2.0,17.0,1.11,2.0,194.0,6.0
75%,209.0,3.0,68.0,2.49,6.0,286.0,24.0
max,4255.0,999.0,1525.0,35.94,68.0,365.0,621.0


In [402]:
# check the 'minimum_nights' column
columns_to_display = ['neighborhood',
       'room_type', 'price', 'min_nights', 'numb_reviews',
       'last_review', 'reviews_per_month','calculated_host_listings_count','availability_365', 'num_reviews_last_12m']

mean_mn = 3.456964
std_dev_mn = 18.262929		

lower_bound_mn = mean_mn - 3 * std_dev_mn
upper_bound_mn = mean_mn + 3 * std_dev_mn

lower_bound_mn, upper_bound_mn
outliers_mn = df_rome_june_selected[(df_rome_june_selected['min_nights'] < lower_bound_mn)
| (df_rome_june_selected['min_nights'] > upper_bound_mn)]
outliers_mn.shape

(140, 15)

#### There are 140 outliers in 'minimum_nights', which is 0.5% of the current dataframe, so I decide to remove them as well.

In [403]:
# Filter the DataFrame to exclude outliers and overwrite the original DataFrame
df_rome_june_selected = df_rome_june_selected[
(df_rome_june_selected['min_nights'] >= lower_bound_mn) & 
(df_rome_june_selected['min_nights'] <= upper_bound_mn)]

In [404]:
df_rome_june_selected.shape

(25641, 15)

#### Now we have 25641 observations, which is 140 less than 25781 observations before, so the outliers are removed.

In [405]:
df_rome_june_selected[['price', 'min_nights', 'num_reviews',
       'last_review', 'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'num_reviews_last_12m']].describe()

Unnamed: 0,price,min_nights,num_reviews,reviews_per_month,calculated_host_listings_count,availability_365,num_reviews_last_12m
count,25641.0,25641.0,25641.0,22280.0,25641.0,25641.0,25641.0
mean,188.241839,2.52147,54.494403,1.672062,5.547249,182.904411,15.471042
std,228.97318,3.469942,86.928632,1.680252,9.149864,116.382652,20.821117
min,0.0,1.0,0.0,0.01,1.0,0.0,0.0
25%,85.0,1.0,3.0,0.37,1.0,79.0,0.0
50%,133.0,2.0,18.0,1.11,2.0,194.0,6.0
75%,209.0,3.0,68.0,2.5,6.0,286.0,24.0
max,4255.0,56.0,1525.0,35.94,68.0,365.0,621.0


In [406]:
# Count the number of unique values in the 'neighborhood' column
unique_neighborhoods_count = df_rome_june_selected['neighborhood'].nunique()

# Display the number of unique values
unique_neighborhoods_count

15

In [407]:
df_rome_june_selected.head()

Unnamed: 0,id,name,host_id,neighborhood,latitude,longitude,room_type,price,min_nights,num_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,num_reviews_last_12m
0,2737,Place to stay in Rome · ★4.80 · 1 bedroom · 1 ...,3047,VIII Appia Antica,41.87136,12.48215,Private room,50,7,7,2015-05-28 00:00:00,0.07,6,358,0
1,181189,Bed and breakfast in Rome · ★4.72 · 1 bedroom ...,868477,I Centro Storico,41.90963,12.45603,Hotel room,100,3,147,2023-06-05 00:00:00,1.03,8,175,6
2,11834,Rental unit in Rome · ★4.80 · 1 bedroom · 1 be...,44552,I Centro Storico,41.895447,12.491181,Entire home/apt,105,2,191,2023-06-02 00:00:00,1.3,1,246,44
3,49240,Condo in Rome · ★4.69 · 2 bedrooms · 5 beds · ...,224479,I Centro Storico,41.89738,12.476,Entire home/apt,294,2,70,2023-04-11 00:00:00,0.46,2,151,32
4,181747,Rental unit in Rome · ★4.56 · 2 bedrooms · 3 b...,871063,I Centro Storico,41.90183,12.50397,Entire home/apt,228,2,85,2023-06-09 00:00:00,0.63,2,329,42


In [408]:
# split the name column
# Define the function to extract the components from the 'name' column
def split_name_column(name):
    parts = name.split(' · ')
    
    # Ensure the parts list has the expected number of elements
    if len(parts) == 5:
        place_name = parts[0]
        rating = parts[1]
        num_bedroom = parts[2]
        num_bed = parts[3]
        baths = parts[4]
    else:
        place_name, rating, num_bedroom, num_bed, baths = [None]*5
    
    return place_name, rating, num_bedroom, num_bed, baths

# Apply the function to the 'name' column and create new columns
df_rome_june_selected[['place_name', 'rating', 'num_bedroom', 'num_bed', 'baths']] = df_rome_june_selected['name'].apply(lambda x: pd.Series(split_name_column(x)))

# Drop the original 'name' column
df_rome_june_selected.drop(columns=['name'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rome_june_selected[['place_name', 'rating', 'num_bedroom', 'num_bed', 'baths']] = df_rome_june_selected['name'].apply(lambda x: pd.Series(split_name_column(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rome_june_selected[['place_name', 'rating', 'num_bedroom', 'num_bed', 'baths']] = df_rome_june_selected['name'].apply(lambda x: pd.Series(split_name_column(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in

In [410]:
df_rome_june_selected.head()

Unnamed: 0,id,host_id,neighborhood,latitude,longitude,room_type,price,min_nights,num_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,num_reviews_last_12m,place_name,rating,num_bedroom,num_bed,baths
0,2737,3047,VIII Appia Antica,41.87136,12.48215,Private room,50,7,7,2015-05-28 00:00:00,0.07,6,358,0,Place to stay in Rome,★4.80,1 bedroom,1 bed,1.5 baths
1,181189,868477,I Centro Storico,41.90963,12.45603,Hotel room,100,3,147,2023-06-05 00:00:00,1.03,8,175,6,Bed and breakfast in Rome,★4.72,1 bedroom,1 bed,1 private bath
2,11834,44552,I Centro Storico,41.895447,12.491181,Entire home/apt,105,2,191,2023-06-02 00:00:00,1.3,1,246,44,Rental unit in Rome,★4.80,1 bedroom,1 bed,1 bath
3,49240,224479,I Centro Storico,41.89738,12.476,Entire home/apt,294,2,70,2023-04-11 00:00:00,0.46,2,151,32,Condo in Rome,★4.69,2 bedrooms,5 beds,1.5 baths
4,181747,871063,I Centro Storico,41.90183,12.50397,Entire home/apt,228,2,85,2023-06-09 00:00:00,0.63,2,329,42,Rental unit in Rome,★4.56,2 bedrooms,3 beds,2 baths


In [411]:
desired_column_order = [ 'id', 'host_id', 'place_name', 'rating', 'num_bedroom', 'num_bed', 'baths',
       'neighborhood', 'latitude', 'longitude', 'room_type', 'price',
       'min_nights', 'num_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'num_reviews_last_12m']

In [412]:
df_rome_june_selected = df_rome_june_selected[desired_column_order]

In [413]:
df_rome_june_selected.head()

Unnamed: 0,id,host_id,place_name,rating,num_bedroom,num_bed,baths,neighborhood,latitude,longitude,room_type,price,min_nights,num_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,num_reviews_last_12m
0,2737,3047,Place to stay in Rome,★4.80,1 bedroom,1 bed,1.5 baths,VIII Appia Antica,41.87136,12.48215,Private room,50,7,7,2015-05-28 00:00:00,0.07,6,358,0
1,181189,868477,Bed and breakfast in Rome,★4.72,1 bedroom,1 bed,1 private bath,I Centro Storico,41.90963,12.45603,Hotel room,100,3,147,2023-06-05 00:00:00,1.03,8,175,6
2,11834,44552,Rental unit in Rome,★4.80,1 bedroom,1 bed,1 bath,I Centro Storico,41.895447,12.491181,Entire home/apt,105,2,191,2023-06-02 00:00:00,1.3,1,246,44
3,49240,224479,Condo in Rome,★4.69,2 bedrooms,5 beds,1.5 baths,I Centro Storico,41.89738,12.476,Entire home/apt,294,2,70,2023-04-11 00:00:00,0.46,2,151,32
4,181747,871063,Rental unit in Rome,★4.56,2 bedrooms,3 beds,2 baths,I Centro Storico,41.90183,12.50397,Entire home/apt,228,2,85,2023-06-09 00:00:00,0.63,2,329,42


# Expert

In [414]:
# export dataframe:
df_rome_june_selected.to_csv(os.path.join(path,'Prepared', 'rome_listings_june_cleaned.csv'))