In [37]:
import pandas as pd
import numpy as np
import re
# Set pandas display options to show all columns
pd.set_option('display.max_columns', None)

In [38]:
df = pd.read_csv('../data/listing_data_20240325_152500.csv', index_col=0)

In [39]:
# Helper function to extract numbers from text
def extract_number(text):
    """
    Extract numerical value from text using regular expression.
    
    Parameters:
    text (str): Text containing numerical value.
    
    Returns:
    int or None: Extracted numerical value or None if no match.
    """
    match = re.search(r'\d+', str(text))
    return int(match.group()) if match else None

def post_proc(df):
    """
    Post-processing function to clean and transform the input dataframe.
    
    Parameters:
    df (DataFrame): Input dataframe containing raw data.
    
    Returns:
    DataFrame: Processed dataframe ready for modeling.
    """
    
    # Price: remove currency symbol and convert to numeric
    df['Price'] = df['Price'].str.replace('€', '').str.strip().astype(float)
    df['Host Name'] = df['Host Name'].str.replace('Hosted by ', '')

    # Extract numerical values from text columns
    df['Visitors'] = df['Visitors'].apply(extract_number)
    df['Beds'] = df['Beds'].apply(extract_number)
    df['Bedrooms'] = df['Bedrooms'].apply(extract_number)
    df['Baths'] = df['Baths'].apply(extract_number)
    df['Number of reviews'] = df['Number of reviews'].apply(extract_number)

    # Convert Review Index to float
    df['Review Index'] = df['Review Index'].apply(extract_number).astype(float)

    # Convert categorical columns to binary
    df['Guest Favorite'] = df['Guest Favorite'].astype(str).apply(lambda x: 1 if 'favorite' in x else 0)
    df['Superhost'] = df['Superhost'].astype(str).apply(lambda x: 1 if 'Superhost' in x else 0)

    # Assuming Latitude and Longitude are already in numeric format
    # If not, convert them to numeric here

    # Characteristics processing
    characteristics_to_track = ['Superhost', 'Free cancellation', 'Fast wifi', 'Dedicated workspace', 'Great location', 'Furry friends', 'Highly rated', 'Self check-in', 'Great check-in', 'remote work']

# Create new columns for each characteristic and set binary values
    for char in characteristics_to_track:
        # Check for NaN values in 'Characteristics' column
        if not df['Characteristics'].isna().all():
            df['char_' + char.lower().replace(' ', '_')] = df['Characteristics'].str.contains(char, na=False).astype(int)
        else:
            df['char_' + char.lower().replace(' ', '_')] = 0

    # Drop the original 'Characteristics' column
    df.drop('Characteristics', axis=1, inplace=True)

    return df

In [40]:
df.head()

Unnamed: 0,Price,Title,Visitors,Beds,Bedrooms,Baths,Guest Favorite,Superhost,Review Index,Number of reviews,Host Name,Characteristics,Latitude,Longitude
0,€ 95,Suite Blanche by Aristotelous Square,4 guests ·,· 3 beds ·,· 1 bedroom ·,· 1 bath,Guest\nfavorite,,5.0,6,Hosted by Evripidis,"Fast wifi, Self check-in, Free cancellation be...",40.6324,22.942
1,€ 359,F & B Collection - Luxury Seafront 2 Bedroom Flat,4 guests ·,· 2 beds ·,· 2 bedrooms ·,· 2 baths,Guest\nfavorite,,5.0,6,Hosted by Dimitris,"Great location, Great check-in experience, Exp...",40.629144,22.945721
2,€ 64,Mavili#19 Apartment for 4- Thessaloniki Center,4 guests ·,· 3 beds ·,· 2 bedrooms ·,· 1 bath,,,New,1 review,Hosted by Anastasia,,40.642218,22.93765
3,€ 42,Aegean Blue Apartments 3חי Luxury Suite 1 BDR 1+1,4 guests ·,· 2 beds ·,· 1 bedroom ·,· 1 bath,Guest\nfavorite,Superhost ·,4.94,17,Hosted by Aegean Blue,"Self check-in, Aegean Blue is a Superhost, Gre...",40.6414,22.9394
4,€ 65,TOP location Brand NEW apartment !,4 guests ·,· 2 beds ·,· 2 bedrooms ·,· 1 bath,Guest\nfavorite,Superhost ·,4.87,220,Hosted by Alexandra,"Self check-in, Alexandra is a Superhost, Free ...",40.63452,22.94825


In [41]:
post_df = post_proc(df.copy())

In [42]:
post_df.head()

Unnamed: 0,Price,Title,Visitors,Beds,Bedrooms,Baths,Guest Favorite,Superhost,Review Index,Number of reviews,Host Name,Latitude,Longitude,char_superhost,char_free_cancellation,char_fast_wifi,char_dedicated_workspace,char_great_location,char_furry_friends,char_highly_rated,char_self_check-in,char_great_check-in,char_remote_work
0,95.0,Suite Blanche by Aristotelous Square,4,3.0,1.0,1,1,0,5.0,6.0,Evripidis,40.6324,22.942,0,1,1,0,0,0,0,1,0,0
1,359.0,F & B Collection - Luxury Seafront 2 Bedroom Flat,4,2.0,2.0,2,1,0,5.0,6.0,Dimitris,40.629144,22.945721,0,0,0,0,1,0,0,0,1,0
2,64.0,Mavili#19 Apartment for 4- Thessaloniki Center,4,3.0,2.0,1,0,0,,1.0,Anastasia,40.642218,22.93765,0,0,0,0,0,0,0,0,0,0
3,42.0,Aegean Blue Apartments 3חי Luxury Suite 1 BDR 1+1,4,2.0,1.0,1,1,1,4.0,17.0,Aegean Blue,40.6414,22.9394,1,0,0,0,0,0,0,1,1,0
4,65.0,TOP location Brand NEW apartment !,4,2.0,2.0,1,1,1,4.0,220.0,Alexandra,40.63452,22.94825,1,1,0,0,0,0,0,1,0,0
