In [8]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split


# DISPLAYING THE FEATURES IN A HEATMAP -- data preparation
#
# This cell loads the listings and encodes every textual/boolean feature as a
# number. It contains no plotting code itself; it only builds `df`.


# Step 1: Load the dataset
df = pd.read_csv("Aemf1.csv")

# Step 2: Drop unnecessary columns
# The names must match the CSV header exactly (including the spelling of
# "Restraunt"). "Private Room" is dropped here; a private-room flag is
# rebuilt from "Room Type" in Step 4.
df = df.drop(columns=["Attraction Index", "Restraunt Index", "Private Room"])

# Step 3: Map city names to index
city_mapping = {
    'Berlin': 0,
    'Amsterdam': 1,
    'Vienna': 2,
    'Paris': 3,
    'Budapest': 4,
    'Barcelona': 5,
    'Rome': 6,
    'Lisbon': 7,
    'Athens': 8
}
city_codes = df["City"].map(city_mapping)

# Series.map yields NaN for any name that is not a key of city_mapping.
# Fail loudly here instead of letting such rows disappear unnoticed in a
# later dropna(). Rows whose city was already missing are not reported.
unmapped_cities = df.loc[city_codes.isna() & df["City"].notna(), "City"].unique()
assert len(unmapped_cities) == 0, (
    f"Cities missing from city_mapping: {list(unmapped_cities)}"
)
df["City"] = city_codes

# Step 4: Convert categorical features to binary
# Vectorized comparisons: 1 where the value matches, 0 otherwise
# (missing values compare unequal and therefore become 0).
df["Room_type"] = df["Room Type"].eq("Private room").astype(int)
df["Day"] = df["Day"].eq("Weekday").astype(int)
df["Shared_room"] = df["Shared Room"].astype(int)
df["Superhost"] = df["Superhost"].astype(int)

# Step 5: Drop the original textual columns
# Their information now lives in Room_type and Shared_room.
df = df.drop(columns=["Room Type", "Shared Room"])


In [9]:
# PRICE HISTOGRAM, HIGH PRICES

# Step 1: Keep only the Price values of listings priced above 2000€
# (strictly greater than 2000; the original row index is preserved).

high_price_mask = df["Price"] > 2000
prices2 = df.loc[high_price_mask, "Price"]



In [10]:
# DATASET CLEANUP

# Step 1: Clean the data
# Keep listings priced at 5000€ or less. `df` itself is rebound to the
# filtered frame, so every later use of `df` sees the filtered rows.
within_price_cap = df["Price"].le(5000)
df = df.loc[within_price_cap]

# Discard every row that still contains at least one null value.
df_cleaned = df.dropna(how="any")



In [11]:

# CREATING SCATTERPLOTS FROM NON-BINARY DATA

# Step 1: Columns holding 0/1 flags (names follow the earlier preprocessing)
binary_columns = [
    'Room_type',
    'Day',
    'Shared_room',
    'Superhost',
    'Multiple Rooms',
    'Business',
]

# Step 2: Names of all numeric columns, in frame order
all_numeric_columns = list(df_cleaned.select_dtypes(include="number").columns)

# Step 3: Numeric columns that are neither a binary flag nor the target
excluded_columns = set(binary_columns) | {'Price'}
features_to_plot = [
    name for name in all_numeric_columns if name not in excluded_columns
]



In [12]:
# Target = Price (continuous); features = every remaining column
y = df_cleaned["Price"]
X = df_cleaned.drop(columns="Price")

# First cut: 75% for training, 25% held out
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=42,
)

# Second cut: the held-out 25% becomes validation and test.
# test_size=0.4 of the held-out part is 10% of all rows (test);
# the remaining 60% of it is 15% of all rows (validation).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.4,
    shuffle=True,
    random_state=42,
)

# Report the shape of each feature/target pair
for split_label, split_X, split_y in (
    ("Train set:", X_train, y_train),
    ("Validation set:", X_val, y_val),
    ("Test set:", X_test, y_test),
):
    print(split_label, split_X.shape, split_y.shape)


Train set: (31275, 15) (31275,)
Validation set: (6255, 15) (6255,)
Test set: (4170, 15) (4170,)
