In [7]:
import pandas as pd
import numpy as np
import re
# Set pandas display options to show all columns
pd.set_option('display.max_columns', None)

In [2]:
df = pd.read_csv('../data/listing_data_20240325_142240.csv', index_col=0)

In [9]:
# Helper function to extract numbers from text
def extract_number(text):
    """
    Extract numerical value from text using regular expression.
    
    Parameters:
    text (str): Text containing numerical value.
    
    Returns:
    int or None: Extracted numerical value or None if no match.
    """
    match = re.search(r'\d+', str(text))
    return int(match.group()) if match else None

def post_proc(df):
    """
    Post-processing function to clean and transform the input dataframe.
    
    Parameters:
    df (DataFrame): Input dataframe containing raw data.
    
    Returns:
    DataFrame: Processed dataframe ready for modeling.
    """
    
    # Price: remove currency symbol and convert to numeric
    df['Price'] = df['Price'].str.replace('€', '').str.strip().astype(float)
    df['Host Name'] = df['Host Name'].str.replace('Hosted by ', '')

    # Extract numerical values from text columns
    df['Visitors'] = df['Visitors'].apply(extract_number)
    df['Beds'] = df['Beds'].apply(extract_number)
    df['Bedrooms'] = df['Bedrooms'].apply(extract_number)
    df['Baths'] = df['Baths'].apply(extract_number)
    df['Number of reviews'] = df['Number of reviews'].apply(extract_number)

    # Convert Review Index to float
    df['Review Index'] = df['Review Index'].apply(extract_number).astype(float)

    # Convert categorical columns to binary
    df['Guest Favorite'] = df['Guest Favorite'].apply(lambda x: 1 if x == 'Guest favorite' else 0)
    df['Superhost'] = df['Superhost'].apply(lambda x: 1 if x == 'Superhost' else 0)

    # Assuming Latitude and Longitude are already in numeric format
    # If not, convert them to numeric here

    # Characteristics processing
    characteristics_to_track = ['Superhost', 'Free cancellation', 'Fast wifi', 'Dedicated workspace', 'Great location', 'Furry friends', 'Highly rated', 'Self check-in', 'Great check-in']

    # Create new columns for each characteristic and set binary values
    for char in characteristics_to_track:
        df['char_' + char.lower().replace(' ', '_')] = df['Characteristics'].str.contains(char).astype(int)

    # Drop the original 'Characteristics' column
    df.drop('Characteristics', axis=1, inplace=True)

    return df

In [4]:
df.head()

Unnamed: 0,Price,Title,Visitors,Beds,Bedrooms,Baths,Guest Favorite,Superhost,Review Index,Number of reviews,Host Name,Characteristics,Latitude,Longitude
0,€ 32,Waterfront #8Design - Bubblegum Box,2 guests ·,· 1 bed ·,· 1 bedroom ·,· 1 bath,,Superhost ·,4.73,739 reviews,Hosted by Alexandra,"Self check-in, Alexandra is a Superhost, Free ...",40.6318,22.9417
1,€ 27,Convenience Spot @ Dragoumi,3 guests ·,,· 1 bedroom ·,· 1 bath,,,New,2 reviews,Hosted by Viveta,"Self check-in, Furry friends welcome",40.64075,22.94357
2,€ 40,Innovative Minimal Fresh suite in Ladadika,4 guests ·,· 1 bed ·,· 1 bedroom ·,· 1 bath,,Superhost ·,4.89,101 reviews,Hosted by Innovation,"Self check-in, Innovation is a Superhost, Free...",40.63672,22.93779
3,€ 23,14 Smart tv central cozy room,2 guests ·,· 1 bed ·,· 1 bedroom ·,· 1 bath,,,4.63,112 reviews,Hosted by Βασίλης,"Self check-in, Great location, Free cancellati...",40.60963,22.95493
4,€ 23,#2 City center Ariadni's studio- Huge balcony,2 guests ·,· 2 beds ·,,· 1 bath,Guest\nfavorite,Superhost ·,4.82,205,Hosted by Ariadni,"Ariadni is a Superhost, Great check-in experie...",40.62041,22.95341


In [10]:
post_df = post_proc(df.copy())

In [11]:
post_df.head()

Unnamed: 0,Price,Title,Visitors,Beds,Bedrooms,Baths,Guest Favorite,Superhost,Review Index,Number of reviews,Host Name,Latitude,Longitude,char_superhost,char_free_cancellation,char_fast_wifi,char_dedicated_workspace,char_great_location,char_furry_friends,char_highly_rated,char_self_check-in,char_great_check-in
0,32.0,Waterfront #8Design - Bubblegum Box,2,1.0,1.0,1,0,0,4.0,739.0,Alexandra,40.6318,22.9417,1,1,0,0,0,0,0,1,0
1,27.0,Convenience Spot @ Dragoumi,3,,1.0,1,0,0,,2.0,Viveta,40.64075,22.94357,0,0,0,0,0,1,0,1,0
2,40.0,Innovative Minimal Fresh suite in Ladadika,4,1.0,1.0,1,0,0,4.0,101.0,Innovation,40.63672,22.93779,1,1,0,0,0,0,0,1,0
3,23.0,14 Smart tv central cozy room,2,1.0,1.0,1,0,0,4.0,112.0,Βασίλης,40.60963,22.95493,0,1,0,0,1,0,0,1,0
4,23.0,#2 City center Ariadni's studio- Huge balcony,2,2.0,,1,0,0,4.0,205.0,Ariadni,40.62041,22.95341,1,1,0,0,0,0,0,0,1
