In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the listings CSV into the working DataFrame used by every later cell.
df = pd.read_csv("rome_listing.csv")

In [None]:
# Preview the frame exactly as it was read from the CSV.
df

In [None]:
# Inspect the column names available in the loaded frame.
df.columns

In [None]:
# We can drop columns that are not useful for our analysis.
# Each column is listed exactly once (the original list repeated "picture_url"),
# grouped by topic so the list is easier to audit.
columns_to_drop = [
    # listing / scrape metadata and free-text fields
    "id", "scrape_id", "last_scraped", "source", "name", "picture_url",
    "neighborhood_overview", "description", "listing_url", "amenities",
    # host details
    "host_id", "host_name", "host_location", "host_about",
    "host_thumbnail_url", "host_picture_url", "host_neighbourhood",
    "host_listings_count", "host_total_listings_count",
    "host_verifications", "host_has_profile_pic", "host_identity_verified",
    "host_url", "host_since", "host_response_time", "host_response_rate",
    "host_acceptance_rate",
    # alternative neighbourhood columns
    "neighbourhood_group_cleansed", "neighbourhood",
    # minimum / maximum nights
    "minimum_nights", "maximum_nights", "minimum_minimum_nights",
    "maximum_minimum_nights", "minimum_maximum_nights",
    "maximum_maximum_nights", "minimum_nights_avg_ntm",
    "maximum_nights_avg_ntm",
    # calendar / availability
    "calendar_updated", "has_availability", "availability_30",
    "availability_60", "availability_90", "availability_365",
    "calendar_last_scraped",
    # review dates and booking mode
    "first_review", "last_review", "instant_bookable",
    # derived host listing counts
    "calculated_host_listings_count",
    "calculated_host_listings_count_entire_homes",
    "calculated_host_listings_count_private_rooms",
    "calculated_host_listings_count_shared_rooms",
]

# Rebind instead of mutating in place so the frame displayed by earlier cells is
# not silently changed. A missing column still raises KeyError, which surfaces
# typos or a changed CSV schema.
df = df.drop(columns=columns_to_drop)

In [None]:
# Show the first rows of the frame after the column drop.
df.head()

In [None]:
# Column names that remain at this point in the notebook.
df.columns

In [None]:
# Convert the price strings to floats by stripping "$" and thousands separators.
# regex=False forces literal replacement on every pandas version: "$" is a regex
# end-of-string anchor, so relying on the version-dependent default is fragile.
df["price"] = (
    df["price"]
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
    .astype(float)
)

In [None]:
# Does License have an impact on the price?

df_l = df[["license", "price"]].copy()
# True when a license value is present. The original used .isna(), which made
# True mean "no license" and therefore flipped the sign of the correlation
# reported under the name "license".
df_l["license"] = ~df_l["license"].isna()
# Rows without a price cannot contribute to the comparison.
df_l = df_l[~df_l["price"].isna()]

correlation = df_l["license"].corr(df_l["price"])
sns.heatmap(df_l[["license", "price"]].corr(), annot=True, cmap="coolwarm")
plt.title(f"Correlation between License and Price: {correlation:.2f}")
plt.show()

# All listings, outliers included. Finish this figure before drawing the next
# one; otherwise both scatter plots share the same axes and the outliers also
# end up in the saved image.
sns.scatterplot(data=df_l, x='license', y='price')
plt.show()

# remove outliers (price cutoff chosen by the notebook author)
df_l = df_l[df_l["price"] < 30000]
sns.scatterplot(data=df_l, x='license', y='price')

# Save before show(): show() may release the current figure.
plt.savefig("license_price.png")
plt.show()

In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Display basic information about the dataset.
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
df.info()

# Summary statistics
print(df.describe())

# Check for missing values
print(df.isnull().sum())

In [None]:
# Keep only the listings that have a price, then discard the license column,
# as one chained expression.
df = df.dropna(subset=["price"]).drop(columns=["license"])

In [None]:
# We only care if the bathroom is shared or not.
# str.contains() yields NaN where the text is missing, and NaN cast with
# astype(bool) becomes True, which would label those listings as "shared".
# The nullable "boolean" dtype keeps them as <NA> instead of inventing a value.
df["bathrooms_shared"] = (
    df["bathrooms_text"].str.contains("shared", case=False).astype("boolean")
)
# drop the original column
df = df.drop(columns=["bathrooms_text"])

# convert the host_is_superhost column to a boolean; for the same reason,
# missing values stay <NA> rather than turning into True
df["host_is_superhost"] = (
    df["host_is_superhost"].str.contains("t", case=False).astype("boolean")
)

In [None]:
# TODO exploratory data analysis
# TODO feature selection / engineering
# TODO model selection

In [None]:
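# NOTE: disabled cell. "amenities" is removed from df by the column drop earlier
# in this notebook, so this code cannot run unless that column is kept.
# # Create a set of all amenities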
# df["amenities"] = df["amenities"].str.replace(pat='[{}"]', repl='').str.replace(pat='[",]', repl=' ').str.strip('[]').str.replace('"', '')
# amenities = set()
# for row in df["amenities"]:
#     elements = row.split(',')
#     print(elements)
#     amenities.update(elements)

# len(amenities)

In [None]:
# amenities = set([a.lower() if not 'wifi' in a.lower() else 'wifi' for a in amenities])
# amenities = set([a.lower() if not 'parking' in a.lower() and not 'carport' in a.lower() else 'parking' for a in amenities])
# amenities = set([a.lower() if not 'kitchen' in a.lower() else 'kitchen' for a in amenities])
# amenities = set([a.lower() if not 'washer' in a.lower() else 'washer' for a in amenities])
# amenities = set([a.lower() if not 'dryer' in a.lower() and not 'hair dryer' in a.lower() else 'dryer' for a in amenities])
# amenities = set([a.lower() if not 'tv' in a.lower() else 'tv' for a in amenities])
# amenities = set([a.lower() if not 'heating' in a.lower() else 'heating' for a in amenities])
# amenities = set([a.lower() if not 'air conditioning' in a.lower() else 'air conditioning' for a in amenities])
# amenities = set([a.lower() if not 'refrigerator' in a.lower() else 'refrigerator' for a in amenities])
# amenities = set([a.lower() if not 'microwave' in a.lower() else 'microwave' for a in amenities])
# amenities = set([a.lower() if not 'oven' in a.lower() else 'oven' for a in amenities])
# amenities = set([a.lower() if not 'coffee maker' in a.lower() and not 'coffee machine' in a.lower() else 'coffee maker' for a in amenities])
# amenities = set([a.lower() if not any (x in a.lower() for x in ['body soap', 'bagnoschiuma', 'saponi', 'nivea', 'detergenti mani', 'bagnoschiuma', 'shampoo', 'soap', 'sapone']) else 'soap' for a in amenities])
# amenities = set([a.lower() if not 'sound system' in a.lower() else 'sound system' for a in amenities])
# amenities = set([a.lower() if not 'stove' in a.lower() else 'stove' for a in amenities])
# amenities = set([a.lower() if not 'air conditioning' in a.lower() and not 'airconditioning' in a.lower() and not 'ac - split type ductless system' in a.lower()  and not 'ceiling fan' in a.lower() else 'stove' for a in amenities])
# amenities = set([a.lower() if not 'conditioner' in a.lower() else 'conditioner' for a in amenities])
# amenities = set([a.lower() if not 'high chair' in a.lower() else 'high chair' for a in amenities])
# amenities = set([a.lower() if not 'grill' in a.lower() else 'grill' for a in amenities])
# amenities = set([a.lower() if not 'pool' in a.lower() else 'pool' for a in amenities])
# amenities = set([a.lower() if not 'baby bath' in a.lower() else 'pool' for a in amenities])
# amenities = set([a.lower() if not 'crib' in a.lower() else 'crib' for a in amenities])
# amenities = set([a.lower() if not 'garden' in a.lower() else 'garden' for a in amenities])
# amenities = set([a.lower() if not 'patio' in a.lower() else 'patio' for a in amenities])
# amenities = set([a.lower() if not 'fireplace' in a.lower() else 'fireplace' for a in amenities])
# amenities = set([a.lower() if not 'hangers' in a.lower() else 'hangers' for a in amenities])
# amenities = set([a.lower() if not 'stereo' in a.lower() else 'stereo' for a in amenities])
# amenities = set([a.lower() if not 'stereo' in a.lower() else 'stereo' for a in amenities])
# amenities = set([a.lower() if not any (x in a.lower() for x in ['console', 'xbox', 'playstation', 'game', 'ps3']) else 'console' for a in amenities])
# amenities = set([a.lower() if not 'housekeeping' in a.lower() else 'housekeeping' for a in amenities])
# amenities = set([a.lower() if not 'gym' in a.lower() else 'gym' for a in amenities])
# amenities = set([a.lower() if not 'breakfast' in a.lower() else 'breakfast' for a in amenities])
# amenities = set([a.lower() if not 'baby monitor' in a.lower() else 'baby monitor' for a in amenities])
# amenities = set([a.lower() if not 'books and toys' in a.lower() else 'books and toys' for a in amenities])
# amenities = set([a.lower() if not 'window ac' in a.lower() else 'air conditioning' for a in amenities])
# amenities = set([a.lower() if not 'view' in a.lower() else 'view' for a in amenities])
# amenities = set([a.lower() if not 'exercise equipment' in a.lower() else 'exercise equipment' for a in amenities])
# amenities = set([a.lower() if not 'hot tub' in a.lower() else 'hot tub' for a in amenities])

In [None]:
# Count the missing values remaining in each column.
print(df.isnull().sum(axis=0))

In [None]:
# Discard every row that still has at least one missing value.
df = df.dropna(axis=0, how="any")

In [None]:
# Display the frame after the rows with missing values were dropped.
df

In [None]:
# Write the cleaned frame to disk; the row index is left out of the file.
df.to_csv(path_or_buf="rome_listings_cleaned.csv", index=False)