# Data Cleaning 

In [1]:
import pandas as pd
import re

In [2]:
# Load the raw scraped hotel listings and preview the first rows
hotels_df = pd.read_csv(filepath_or_buffer='../../data/raw/hotels.csv')
hotels_df.head(n=5)

Unnamed: 0,listing_name,amenities,rate,price,listing_link
0,Holiday Inn Express & Suites Ottawa Downtown E...,Breakfast includedPool,"8.2/10Very good(1,219 reviews)",Price is CA $180 per nightCA $180per nightCA $...,ca.hotels.com/ho1177609504/holiday-inn-express...
1,Microtel Inn & Suites by Wyndham Kanata Ottawa...,Breakfast included,8.6/10Excellent(176 reviews),Price is CA $152 per nightCA $152per nightCA $...,ca.hotels.com/ho2321835040/microtel-inn-suites...
2,Lord Elgin Hotel,PoolHot tub,"8.6/10Excellent(1,031 reviews)","12% offPrice was CA $339, price is now CA $298...",ca.hotels.com/ho111693/lord-elgin-hotel-ottawa...
3,Fairfield Inn & Suites by Marriott Ottawa Airport,Breakfast includedPoolHot tub,"9.4/10Exceptional(1,120 reviews)",Price is CA $169 per nightCA $169per nightCA $...,ca.hotels.com/ho763070976/fairfield-inn-suites...
4,Welcominns Hotel Ottawa,Breakfast included,"7.8/10Good(1,024 reviews)",Price is CA $169 per nightCA $169per nightCA $...,ca.hotels.comhttps://ca.hotels.com/user/signin...


In [48]:
# Function to insert space before upper case 

def add_space_before_uppercase(word):
    """Insert a space wherever a lowercase letter or digit is directly followed by an uppercase letter.

    The scraped text runs separate labels together (e.g. "Breakfast includedPool"),
    so a lowercase/digit -> uppercase boundary is treated as a missing separator.

    Parameters
    ----------
    word : object
        Cell value to clean. Only ``str`` values are modified.

    Returns
    -------
    object
        The spaced-out string, or ``word`` itself when it is not a string
        (for example a missing value).
    """
    # Non-string cells are handed back untouched.
    if not isinstance(word, str):
        return word
    # Group 1 is the trailing lowercase/digit, group 2 the leading uppercase letter.
    return re.sub(r'([a-z0-9])([A-Z])', r'\1 \2', word)


# Separate the run-together labels in both scraped text columns ('amenities' and 'rate')
hotels_df = hotels_df.assign(
    amenities=hotels_df['amenities'].apply(add_space_before_uppercase),
    rate=hotels_df['rate'].apply(add_space_before_uppercase),
)

# Preview the cleaned text columns
hotels_df.head()


Unnamed: 0,listing_name,amenities,rate,price,listing_link
0,Holiday Inn Express & Suites Ottawa Downtown E...,Breakfast included Pool,"8.2/10 Very good(1,219 reviews)",Price is CA $180 per nightCA $180per nightCA $...,ca.hotels.com/ho1177609504/holiday-inn-express...
1,Microtel Inn & Suites by Wyndham Kanata Ottawa...,Breakfast included,8.6/10 Excellent(176 reviews),Price is CA $152 per nightCA $152per nightCA $...,ca.hotels.com/ho2321835040/microtel-inn-suites...
2,Lord Elgin Hotel,Pool Hot tub,"8.6/10 Excellent(1,031 reviews)","12% offPrice was CA $339, price is now CA $298...",ca.hotels.com/ho111693/lord-elgin-hotel-ottawa...
3,Fairfield Inn & Suites by Marriott Ottawa Airport,Breakfast included Pool Hot tub,"9.4/10 Exceptional(1,120 reviews)",Price is CA $169 per nightCA $169per nightCA $...,ca.hotels.com/ho763070976/fairfield-inn-suites...
4,Welcominns Hotel Ottawa,Breakfast included,"7.8/10 Good(1,024 reviews)",Price is CA $169 per nightCA $169per nightCA $...,ca.hotels.comhttps://ca.hotels.com/user/signin...


## Find the nightly price (before taxes & fees)

In [49]:
# Function to extract the nightly price before taxes & fees 

def extract_number_before_pernight(text):
    """Extract the nightly price (before taxes & fees) from a scraped price string.

    Finds the first number that is followed by "per night" and then "CA",
    e.g. the ``180`` in ``"Price is CA $180 per nightCA $180per night..."``.
    Comma-grouped amounts such as ``"1,200"`` are read as a whole number
    rather than only the digits after the last comma.

    Parameters
    ----------
    text : object
        Raw cell value from the ``price`` column.

    Returns
    -------
    int or None
        The price as an integer, or ``None`` when ``text`` is not a string
        (for example a missing value) or contains no matching price.
    """
    # Missing cells arrive as float NaN; re.search would raise TypeError on them.
    if not isinstance(text, str):
        return None

    # Either a comma-grouped number (1,200) or a plain run of digits (180).
    pattern = r'(\d{1,3}(?:,\d{3})+|\d+)\s*per\s*night\s*CA'
    match = re.search(pattern, text)
    if match is None:
        return None
    return int(match.group(1).replace(',', ''))


# Derive the pre-tax nightly price from the raw 'price' text
hotels_df = hotels_df.assign(
    nightly_price=hotels_df['price'].apply(extract_number_before_pernight)
)

# Preview the frame with the new 'nightly_price' column
hotels_df.head()


Unnamed: 0,listing_name,amenities,rate,price,listing_link,nightly_price
0,Holiday Inn Express & Suites Ottawa Downtown E...,Breakfast included Pool,"8.2/10 Very good(1,219 reviews)",Price is CA $180 per nightCA $180per nightCA $...,ca.hotels.com/ho1177609504/holiday-inn-express...,180
1,Microtel Inn & Suites by Wyndham Kanata Ottawa...,Breakfast included,8.6/10 Excellent(176 reviews),Price is CA $152 per nightCA $152per nightCA $...,ca.hotels.com/ho2321835040/microtel-inn-suites...,152
2,Lord Elgin Hotel,Pool Hot tub,"8.6/10 Excellent(1,031 reviews)","12% offPrice was CA $339, price is now CA $298...",ca.hotels.com/ho111693/lord-elgin-hotel-ottawa...,298
3,Fairfield Inn & Suites by Marriott Ottawa Airport,Breakfast included Pool Hot tub,"9.4/10 Exceptional(1,120 reviews)",Price is CA $169 per nightCA $169per nightCA $...,ca.hotels.com/ho763070976/fairfield-inn-suites...,169
4,Welcominns Hotel Ottawa,Breakfast included,"7.8/10 Good(1,024 reviews)",Price is CA $169 per nightCA $169per nightCA $...,ca.hotels.comhttps://ca.hotels.com/user/signin...,169


## Find the nightly price (after taxes & fees)

In [50]:
# Function to extract the nightly price after taxes & fees 

def extract_number_before_total_includes(text):
    """Extract the price including taxes & fees from a scraped price string.

    Finds the first number that is followed by the literal "totalIncludes",
    e.g. the ``212`` in ``"...CA $212 totalIncludes taxes & fees"``.
    Comma-grouped amounts such as ``"1,412"`` are read as a whole number
    rather than only the digits after the last comma.

    Parameters
    ----------
    text : object
        Raw cell value from the ``price`` column.

    Returns
    -------
    int or None
        The amount as an integer, or ``None`` when ``text`` is not a string
        (for example a missing value) or contains no matching amount.
    """
    # Missing cells arrive as float NaN; re.search would raise TypeError on them.
    if not isinstance(text, str):
        return None

    # Either a comma-grouped number (1,412) or a plain run of digits (212).
    pattern = r'(\d{1,3}(?:,\d{3})+|\d+)\s*totalIncludes'
    match = re.search(pattern, text)
    if match is None:
        return None
    return int(match.group(1).replace(',', ''))
    
# Derive the "total" amount (the number preceding "totalIncludes") from the raw 'price' text
hotels_df = hotels_df.assign(
    taxable_price=hotels_df['price'].apply(extract_number_before_total_includes)
)

# Preview the frame with the new 'taxable_price' column
hotels_df.head()

Unnamed: 0,listing_name,amenities,rate,price,listing_link,nightly_price,taxable_price
0,Holiday Inn Express & Suites Ottawa Downtown E...,Breakfast included Pool,"8.2/10 Very good(1,219 reviews)",Price is CA $180 per nightCA $180per nightCA $...,ca.hotels.com/ho1177609504/holiday-inn-express...,180,212
1,Microtel Inn & Suites by Wyndham Kanata Ottawa...,Breakfast included,8.6/10 Excellent(176 reviews),Price is CA $152 per nightCA $152per nightCA $...,ca.hotels.com/ho2321835040/microtel-inn-suites...,152,179
2,Lord Elgin Hotel,Pool Hot tub,"8.6/10 Excellent(1,031 reviews)","12% offPrice was CA $339, price is now CA $298...",ca.hotels.com/ho111693/lord-elgin-hotel-ottawa...,298,351
3,Fairfield Inn & Suites by Marriott Ottawa Airport,Breakfast included Pool Hot tub,"9.4/10 Exceptional(1,120 reviews)",Price is CA $169 per nightCA $169per nightCA $...,ca.hotels.com/ho763070976/fairfield-inn-suites...,169,199
4,Welcominns Hotel Ottawa,Breakfast included,"7.8/10 Good(1,024 reviews)",Price is CA $169 per nightCA $169per nightCA $...,ca.hotels.comhttps://ca.hotels.com/user/signin...,169,199


## Extract the rating out of 10


In [51]:
# Function to extract rating over 10 from rate column 

def extract_number_before_rating(text):
    """Pull the score that precedes "/10" out of a scraped rating string.

    Parameters
    ----------
    text : object
        Raw cell value from the ``rate`` column,
        e.g. ``"8.2/10 Very good(1,219 reviews)"``.

    Returns
    -------
    float or None
        The score as a float (``8.2`` for the example above), or ``None``
        when ``text`` is not a string or has no "<number>/10" fragment.
    """
    # Only strings are searched; anything else (e.g. NaN) counts as "no rating".
    found = re.search(r'(\d+(?:\.\d+)?)\/10', text) if isinstance(text, str) else None
    return float(found.group(1)) if found else None

# Derive the numeric score (out of 10) from the raw 'rate' text
hotels_df = hotels_df.assign(
    rating_10=hotels_df['rate'].apply(extract_number_before_rating)
)

# Preview the frame with the new 'rating_10' column
hotels_df.head()


Unnamed: 0,listing_name,amenities,rate,price,listing_link,nightly_price,taxable_price,rating_10
0,Holiday Inn Express & Suites Ottawa Downtown E...,Breakfast included Pool,"8.2/10 Very good(1,219 reviews)",Price is CA $180 per nightCA $180per nightCA $...,ca.hotels.com/ho1177609504/holiday-inn-express...,180,212,8.2
1,Microtel Inn & Suites by Wyndham Kanata Ottawa...,Breakfast included,8.6/10 Excellent(176 reviews),Price is CA $152 per nightCA $152per nightCA $...,ca.hotels.com/ho2321835040/microtel-inn-suites...,152,179,8.6
2,Lord Elgin Hotel,Pool Hot tub,"8.6/10 Excellent(1,031 reviews)","12% offPrice was CA $339, price is now CA $298...",ca.hotels.com/ho111693/lord-elgin-hotel-ottawa...,298,351,8.6
3,Fairfield Inn & Suites by Marriott Ottawa Airport,Breakfast included Pool Hot tub,"9.4/10 Exceptional(1,120 reviews)",Price is CA $169 per nightCA $169per nightCA $...,ca.hotels.com/ho763070976/fairfield-inn-suites...,169,199,9.4
4,Welcominns Hotel Ottawa,Breakfast included,"7.8/10 Good(1,024 reviews)",Price is CA $169 per nightCA $169per nightCA $...,ca.hotels.comhttps://ca.hotels.com/user/signin...,169,199,7.8


## Extract the number of reviews

In [52]:
# Function to extract number of reviews from rate column 

def extract_number_before_reviews(text):
    """Extract the review count from a scraped rating string.

    Finds the first number followed by the word "review" or "reviews",
    e.g. the ``1,219`` in ``"8.2/10 Very good(1,219 reviews)"``. Any number of
    comma-separated digit groups is accepted, and the commas are stripped
    before conversion.

    Parameters
    ----------
    text : object
        Raw cell value from the ``rate`` column.

    Returns
    -------
    int or None
        The review count as an integer, or ``None`` when ``text`` is not a
        string (for example a missing value) or contains no review count.
    """
    if not isinstance(text, str):
        return None

    # `(?:,\d+)*` accepts any number of comma groups (1,234,567);
    # `reviews?` also accepts the singular form used for a single review.
    pattern = r'(\d+(?:,\d+)*)\s+reviews?\b'
    match = re.search(pattern, text)
    if match is None:
        return None
    return int(match.group(1).replace(',', ''))

# Derive the review count from the raw 'rate' text
hotels_df = hotels_df.assign(
    nb_reviews=hotels_df['rate'].apply(extract_number_before_reviews)
)

# Preview the frame with the new 'nb_reviews' column
hotels_df.head()

Unnamed: 0,listing_name,amenities,rate,price,listing_link,nightly_price,taxable_price,rating_10,nb_reviews
0,Holiday Inn Express & Suites Ottawa Downtown E...,Breakfast included Pool,"8.2/10 Very good(1,219 reviews)",Price is CA $180 per nightCA $180per nightCA $...,ca.hotels.com/ho1177609504/holiday-inn-express...,180,212,8.2,1219.0
1,Microtel Inn & Suites by Wyndham Kanata Ottawa...,Breakfast included,8.6/10 Excellent(176 reviews),Price is CA $152 per nightCA $152per nightCA $...,ca.hotels.com/ho2321835040/microtel-inn-suites...,152,179,8.6,176.0
2,Lord Elgin Hotel,Pool Hot tub,"8.6/10 Excellent(1,031 reviews)","12% offPrice was CA $339, price is now CA $298...",ca.hotels.com/ho111693/lord-elgin-hotel-ottawa...,298,351,8.6,1031.0
3,Fairfield Inn & Suites by Marriott Ottawa Airport,Breakfast included Pool Hot tub,"9.4/10 Exceptional(1,120 reviews)",Price is CA $169 per nightCA $169per nightCA $...,ca.hotels.com/ho763070976/fairfield-inn-suites...,169,199,9.4,1120.0
4,Welcominns Hotel Ottawa,Breakfast included,"7.8/10 Good(1,024 reviews)",Price is CA $169 per nightCA $169per nightCA $...,ca.hotels.comhttps://ca.hotels.com/user/signin...,169,199,7.8,1024.0


In [53]:
# The raw 'rate' and 'price' text has been parsed into dedicated columns, so remove it
columns_to_drop = ['rate', 'price']
hotels_df = hotels_df.drop(columns=columns_to_drop)

# Preview the remaining columns
hotels_df.head()

Unnamed: 0,listing_name,amenities,listing_link,nightly_price,taxable_price,rating_10,nb_reviews
0,Holiday Inn Express & Suites Ottawa Downtown E...,Breakfast included Pool,ca.hotels.com/ho1177609504/holiday-inn-express...,180,212,8.2,1219.0
1,Microtel Inn & Suites by Wyndham Kanata Ottawa...,Breakfast included,ca.hotels.com/ho2321835040/microtel-inn-suites...,152,179,8.6,176.0
2,Lord Elgin Hotel,Pool Hot tub,ca.hotels.com/ho111693/lord-elgin-hotel-ottawa...,298,351,8.6,1031.0
3,Fairfield Inn & Suites by Marriott Ottawa Airport,Breakfast included Pool Hot tub,ca.hotels.com/ho763070976/fairfield-inn-suites...,169,199,9.4,1120.0
4,Welcominns Hotel Ottawa,Breakfast included,ca.hotels.comhttps://ca.hotels.com/user/signin...,169,199,7.8,1024.0


In [54]:
# Write the cleaned listings to a CSV file, leaving out the DataFrame index
hotels_df.to_csv(path_or_buf='../clean_data/clean_hotels.csv', index=False)