In [1]:
import pandas as pd
import numpy as np
import re
# Set pandas display options to show all columns
pd.set_option('display.max_columns', None)

In [2]:
df = pd.read_csv('../data/listing_data_20240325_181147.csv', index_col=0)

In [3]:
# Helper function to extract numbers from text
def extract_number(text):
    """
    Extract numerical value from text using regular expression.
    
    Parameters:
    text (str): Text containing numerical value.
    
    Returns:
    int or None: Extracted numerical value or None if no match.
    """
    match = re.search(r'\d+', str(text))
    return int(match.group()) if match else None

def post_proc(df):
    """
    Post-processing function to clean and transform the input dataframe.
    
    Parameters:
    df (DataFrame): Input dataframe containing raw data.
    
    Returns:
    DataFrame: Processed dataframe ready for modeling.
    """
    
    # Price: remove currency symbol and convert to numeric
    df['Price'] = df['Price'].str.replace('€', '').str.strip().astype(float)
    df['Host Name'] = df['Host Name'].str.replace('Hosted by ', '')

    # Extract numerical values from text columns
    df['Visitors'] = df['Visitors'].apply(extract_number)
    df['Beds'] = df['Beds'].apply(extract_number)
    df['Bedrooms'] = df['Bedrooms'].apply(extract_number)
    df['Baths'] = df['Baths'].apply(extract_number)
    df['Number of reviews'] = df['Number of reviews'].apply(extract_number)

    # Convert Review Index to float
    df['Review Index'] = df['Review Index'].apply(extract_number).astype(float)

    # Convert categorical columns to binary
    df['Guest Favorite'] = df['Guest Favorite'].astype(str).apply(lambda x: 1 if 'favorite' in x else 0)
    df['Superhost'] = df['Superhost'].astype(str).apply(lambda x: 1 if 'Superhost' in x else 0)

    # Assuming Latitude and Longitude are already in numeric format
    # If not, convert them to numeric here

    # Characteristics processing
    characteristics_to_track = ['Superhost', 'Free cancellation', 'Fast wifi', 'Dedicated workspace', 'Great location', 'Furry friends', 'Highly rated', 'Self check-in', 'Great check-in', 'remote work']

# Create new columns for each characteristic and set binary values
    for char in characteristics_to_track:
        # Check for NaN values in 'Characteristics' column
        if not df['Characteristics'].isna().all():
            df['char_' + char.lower().replace(' ', '_')] = df['Characteristics'].str.contains(char, na=False).astype(int)
        else:
            df['char_' + char.lower().replace(' ', '_')] = 0

    # Drop the original 'Characteristics' column
    df.drop('Characteristics', axis=1, inplace=True)

    return df

In [4]:
df.head()

Unnamed: 0,Price,Title,Visitors,Beds,Bedrooms,Baths,Guest Favorite,Superhost,Review Index,Number of reviews,Host Name,Characteristics,Latitude,Longitude
0,€ 65,DoorMat#13 Black Mirror#Jacuzzi,2 guests ·,· 1 bed ·,· 1 bedroom ·,· 1 bath,,,4.65,20 reviews,Hosted by Mario,"Self check-in, Great location, Great check-in ...",40.6282,22.9523
1,€ 55,"Elegant Suite In The Center, The Luxury Suites",4 guests ·,· 2 beds ·,· 1 bedroom ·,· 1 bath,,,4.56,81 reviews,Hosted by Dimitris,"Self check-in, Great check-in experience, Free...",40.63865,22.9376
2,€ 29,"Waterfront #11Design- CozyCityCenter ""Try It T...",2 guests ·,· 1 bed ·,· 1 bedroom ·,· 1 bath,,Superhost ·,4.7,551 reviews,Hosted by Alexandra,"Self check-in, Alexandra is a Superhost, Free ...",40.6318,22.9417
3,€ 40,Innovative Minimal Fresh suite in Ladadika,4 guests ·,· 1 bed ·,· 1 bedroom ·,· 1 bath,,Superhost ·,4.89,101 reviews,Hosted by Innovation,"Self check-in, Innovation is a Superhost, Free...",40.63672,22.93779
4,€ 57,* CK Modern Loft Near Seaside *,4 guests ·,· 2 beds ·,· 1 bedroom ·,· 1 bath,Guest\nfavorite,,5.0,18,Hosted by Κωνσταντίνος,"Self check-in, Great check-in experience, High...",40.6053,22.9537


In [5]:
post_df = post_proc(df.copy())

In [6]:
post_df.head()

Unnamed: 0,Price,Title,Visitors,Beds,Bedrooms,Baths,Guest Favorite,Superhost,Review Index,Number of reviews,Host Name,Latitude,Longitude,char_superhost,char_free_cancellation,char_fast_wifi,char_dedicated_workspace,char_great_location,char_furry_friends,char_highly_rated,char_self_check-in,char_great_check-in,char_remote_work
0,65.0,DoorMat#13 Black Mirror#Jacuzzi,2.0,1.0,1.0,1.0,0,0,4.0,20.0,Mario,40.6282,22.9523,0,0,0,0,1,0,0,1,1,0
1,55.0,"Elegant Suite In The Center, The Luxury Suites",4.0,2.0,1.0,1.0,0,0,4.0,81.0,Dimitris,40.63865,22.9376,0,1,0,0,0,0,0,1,1,0
2,29.0,"Waterfront #11Design- CozyCityCenter ""Try It T...",2.0,1.0,1.0,1.0,0,1,4.0,551.0,Alexandra,40.6318,22.9417,1,1,0,0,0,0,0,1,0,0
3,40.0,Innovative Minimal Fresh suite in Ladadika,4.0,1.0,1.0,1.0,0,1,4.0,101.0,Innovation,40.63672,22.93779,1,1,0,0,0,0,0,1,0,0
4,57.0,* CK Modern Loft Near Seaside *,4.0,2.0,1.0,1.0,1,0,5.0,18.0,Κωνσταντίνος,40.6053,22.9537,0,0,0,0,0,0,1,1,1,0


### Geographical Data filtering

In [106]:
import osmnx as ox
from shapely.geometry import Polygon
from shapely.geometry import Point

In [161]:
# area = "kordelio - Evosmos Municipality"
# area = "Ampelokipi - Menemeni Municipality"
# area = "Stavroupoli Municipal Unit, Thessaloniki"

areas = ["kordelio - Evosmos Municipality", "Ampelokipi - Menemeni Municipality", "Stavroupoli Municipal Unit, Thessaloniki"]


def is_within_city(row, city_polygon):
    """
    Checks if a geographical point lies within a city polygon.

    Args:
        row (pandas.Series): A row from a DataFrame containing 'Longitude' and 'Latitude' columns.
        city_polygon (shapely.geometry.Polygon): A Shapely Polygon representing the city boundary.

    Returns:
        bool: True if the point is within the city polygon, False otherwise.
    """

    point = Point(row['Longitude'], row['Latitude'])
    return city_polygon.geometry.contains(point) 

def filter_loc(df, areas):
    """
    Filters a DataFrame of geographical points based on containment within a set of city areas. 

    Args:
        df (pandas.DataFrame): A DataFrame containing 'Longitude' and 'Latitude' columns.
        areas (list): A list of city names as strings.

    Returns:
        pandas.DataFrame: A filtered DataFrame containing only points within all specified city areas.
    """

    union_polygon = None  # Initialize union polygon

    for num, area in enumerate(areas):
        area_polygon = ox.geocode_to_gdf(area)['geometry']

        if num == 0:
            union_polygon = area_polygon  
        else:
            union_polygon = union_polygon.union(area_polygon) 

    mask_df = df.apply(is_within_city, args=(union_polygon,), axis=1).all(axis=1)

    print(f"Filtered out {(~mask_df).sum()} rows from dataframe, due to out of border coordinates...")

    df_filtered = df[mask_df]

    return df_filtered

In [162]:
filter_loc(df,areas)

Filtered out 521 rows from dataframe, due to out of border coordinates...


Unnamed: 0,Price,Title,Visitors,Beds,Bedrooms,Baths,Guest Favorite,Superhost,Review Index,Number of reviews,Host Name,Characteristics,Latitude,Longitude
64,€ 21,Sweet Little House,2 guests ·,· 1 bed ·,· 1 bedroom ·,· 1.5 baths,,Superhost ·,4.94,123 reviews,Hosted by Theodoros,"Fast wifi, Self check-in, Theodoros is a Super...",40.66211,22.91335


In [160]:
(~df.apply(is_within_city, args=(union_polygon,), axis=1).all(axis=1)).sum()

521

In [163]:
len(df) - len(df.drop_duplicates())

304

In [164]:
len(df)

522