In [None]:
import pandas as pd
import re

# Load the dataset.
# The path is kept in a single constant so the error message always names the
# file that was actually requested.
DATASET_PATH = 'HOTEL_OUTDATASET.CSV'
try:
    df = pd.read_csv(DATASET_PATH)
except FileNotFoundError as err:
    # Raise instead of calling exit(): exit() is meant for the interactive
    # shell, ends a plain script with a success status, and in a notebook asks
    # the kernel to shut down. Raising stops execution with a clear error.
    raise FileNotFoundError(
        f"Error: '{DATASET_PATH}' not found. Please make sure the file is in the correct directory."
    ) from err

# --- Step 1: Preprocessing for 'roomtype' column ---
def preprocess_roomtype(roomtype_str):
    """Turn a free-text room type into a set of lowercase descriptive tokens.

    The text is lowercased, spaces are replaced with underscores, and the
    result is split on runs of ``; , / _ -``. Leading and trailing non-word
    characters are stripped from every piece. Pieces of one character or
    less, the word 'room', and the filler words 'with', 'only', 'use', 'or'
    and 'of' are discarded.

    If no token survives but the text did mention 'room' (for example
    "Room", "Room only" or "Room."), the result is ``{'single'}``.

    Args:
        roomtype_str: Raw room-type value; may be missing (None / NaN).

    Returns:
        set: The surviving tokens. Empty for missing values and for text
        that contains neither a usable token nor the word 'room'.
    """
    if pd.isna(roomtype_str):
        return set()  # Missing values carry no tokens.

    ignored_words = {'room', 'with', 'only', 'use', 'or', 'of'}

    # Lowercase, turn spaces into underscores, then split on separator runs.
    normalized = str(roomtype_str).lower().replace(' ', '_')
    pieces = [
        re.sub(r'^[^\w]+|[^\w]+$', '', piece)
        for piece in re.split(r'[;,/_\-]+', normalized)
    ]

    tokens = {
        piece for piece in pieces
        if len(piece) > 1 and piece not in ignored_words
    }

    # Fallback: a bare "room" description is treated as a single room.
    # 'room' is looked up in the same cleaned pieces used above, so forms with
    # attached punctuation or separators ("Room.", "Room-Only") are recognised
    # just like "Room only".
    if not tokens and 'room' in pieces:
        tokens.add('single')
    return tokens

# Tokenise the room type of every row.
df['roomtype_tokens'] = df['roomtype'].apply(preprocess_roomtype)

# --- Step 2: Common Token Filtering (for roomtype tokens) ---
# Count, for each token, the number of rows whose token set contains it.
# Each row contributes a set, so a token is counted at most once per row.
token_counts = {}
for row_tokens in df['roomtype_tokens']:
    token_counts.update({tok: token_counts.get(tok, 0) + 1 for tok in row_tokens})

# The denominator is the number of rows in the frame.
total_hotels = len(df)
common_token_threshold = 0.70

# Tokens present in at least `common_token_threshold` of all rows are too
# common to be useful and are dropped from every row.
tokens_to_remove = set()
for tok, row_count in token_counts.items():
    if row_count / total_hotels >= common_token_threshold:
        tokens_to_remove.add(tok)

df['roomtype_tokens_filtered'] = df['roomtype_tokens'].apply(
    lambda tokens_set: tokens_set.difference(tokens_to_remove)
)

# --- Output of the roomtype preprocessing ---
# Canonical string form of each filtered token set: tokens sorted
# alphabetically and joined with underscores.
df['roomtype_tokens_str'] = df['roomtype_tokens_filtered'].apply(
    lambda filtered: '_'.join(sorted(filtered))
)

# Full export: identifiers, the raw text and every intermediate token column.
output_columns = ['itemid', 'hotelid', 'roomtype', 'roomtype_tokens', 'roomtype_tokens_filtered', 'roomtype_tokens_str']
df_output = df.loc[:, output_columns]
df_output.to_csv('Group4_Part1_preprocessed_roomtype.csv', index=False)
print("Saved preprocessed data for 'roomtype' (selected columns) to 'Group4_Part1_preprocessed_roomtype.csv'")

# Minimal export: item id plus its token string, exposed as column 'tokens'.
df_output_mapping = (
    df_output[['itemid', 'roomtype_tokens_str']]
    .rename(columns={'roomtype_tokens_str': 'tokens'})
)
df_output_mapping.to_csv('Group4_Part1_id_tokens_roomtype.csv', index=False)
print("Saved minimal mapping for 'roomtype' to 'Group4_Part1_id_tokens_roomtype.csv'")

# Short preview of the result.
print("\nPreprocessing for 'roomtype' column completed.")
print("\nFirst 5 rows of the preprocessed data (with selected columns):")
preview_columns = ['itemid', 'roomtype', 'roomtype_tokens', 'roomtype_tokens_filtered', 'roomtype_tokens_str']
print(df_output[preview_columns].head())

Saved preprocessed data for 'roomtype' (selected columns) to 'Group4_Part1_preprocessed_roomtype.csv'
Saved minimal mapping for 'roomtype' to 'Group4_Part1_id_tokens_roomtype.csv'

Preprocessing for 'roomtype' column completed.

First 5 rows of the preprocessed data (with selected columns):
   itemid                     roomtype          roomtype_tokens  \
0       1          Comfort Double Room        {comfort, double}   
1       2  Classic Double or Twin Room  {twin, classic, double}   
2       3         Superior Double Room       {superior, double}   
3       4                        Suite                  {suite}   
4       5  Classic Double or Twin Room  {twin, classic, double}   

  roomtype_tokens_filtered  roomtype_tokens_str  
0        {comfort, double}       comfort_double  
1  {twin, classic, double}  classic_double_twin  
2       {superior, double}      double_superior  
3                  {suite}                suite  
4  {twin, classic, double}  classic_double_twin  


In [1]:
import pandas as pd
import re

# Load the dataset.
# The path is kept in a single constant so the error message always names the
# file that was actually requested.
DATASET_PATH = 'HOTEL_OUTDATASET.CSV'
try:
    df = pd.read_csv(DATASET_PATH)
except FileNotFoundError as err:
    # Raise instead of calling exit(): exit() is meant for the interactive
    # shell, ends a plain script with a success status, and in a notebook asks
    # the kernel to shut down. Raising stops execution with a clear error.
    raise FileNotFoundError(
        f"Error: '{DATASET_PATH}' not found. Please make sure the file is in the correct directory."
    ) from err

# --- Step 1: Preprocessing for 'roomtype' column ---

def preprocess_roomtype(roomtype_str):
    """Reduce a free-text room type to a small set of bed-type / category tokens.

    The text is lowercased, spaces are replaced with underscores, and the
    result is split on runs of ``; , / _ -``. Leading and trailing non-word
    characters are stripped from every piece, and pieces of one character or
    less are ignored.

    Rules applied to the remaining words:

    * If 'double' is present, the result is ``{'double'}``, plus 'twin' when
      that word is present too; every other word is dropped.
    * Otherwise the result is every word that is not a stop word (filler
      words, quality qualifiers and view/location terms).
    * If that leaves nothing but the text mentioned 'room' (for example
      "Deluxe Room" or "Room."), the result is ``{'single'}``.

    Args:
        roomtype_str: Raw room-type value; may be missing (None / NaN).

    Returns:
        set: The resulting tokens; empty for missing values and for text with
        no usable word and no mention of 'room'.
    """
    if pd.isna(roomtype_str):
        return set()  # Missing values carry no tokens.

    # Common words, qualifiers and view-related terms that are not kept.
    stop_words = {
        'with', 'only', 'use', 'or', 'of', 'room', 'deluxe', 'superior', 'classic', 'comfort', 'balcony',
        'city', 'garden', 'mountain', 'street', 'ocean', 'lake', 'view', 'terraced'
    }

    # Lowercase, turn spaces into underscores, then split on separator runs.
    normalized = str(roomtype_str).lower().replace(' ', '_')
    words = set()
    for piece in re.split(r'[;,/_\-]+', normalized):
        cleaned = re.sub(r'^[^\w]+|[^\w]+$', '', piece)
        if len(cleaned) > 1:
            words.add(cleaned)

    # 'double' dominates: keep it (and 'twin' if offered as an alternative)
    # and discard everything else.
    if 'double' in words:
        return {'double', 'twin'} & words

    # No 'double': keep every word that is not a stop word. 'single' and
    # 'twin' are not stop words, so they pass through here as well.
    final_tokens = words - stop_words

    # Fallback: a description that only says "room" (plus stop words) is
    # treated as a single room. 'room' is looked up in the same cleaned words
    # used above, so "Room." or "Room/Garden-View" behave like "Room".
    if not final_tokens and 'room' in words:
        final_tokens.add('single')

    return final_tokens

# Run the tokeniser over every value of the 'roomtype' column.
df['roomtype_tokens'] = df['roomtype'].apply(preprocess_roomtype)

# --- Common Token Filtering (for roomtype tokens) ---

# Tally how many rows contain each token; a row's set holds a token at most
# once, so every count is a number of rows.
token_counts = {}
for row_tokens in df['roomtype_tokens']:
    token_counts.update({tok: token_counts.get(tok, 0) + 1 for tok in row_tokens})

# Number of rows in the frame, used as the denominator of the frequency.
total_hotels = len(df)

# A token found in at least this share of rows is considered too common.
common_token_threshold = 0.70
tokens_to_remove = set()
for tok, row_count in token_counts.items():
    if row_count / total_hotels >= common_token_threshold:
        tokens_to_remove.add(tok)

# Strip the overly common tokens from each row's token set.
df['roomtype_tokens_filtered'] = df['roomtype_tokens'].apply(
    lambda tokens_set: tokens_set.difference(tokens_to_remove)
)

# --- Output of the roomtype preprocessing ---

# String form of each filtered token set: alphabetical order, '_'-joined.
df['roomtype_tokens_str'] = df['roomtype_tokens_filtered'].apply(
    lambda filtered: '_'.join(sorted(filtered))
)

# Keep only the columns that belong in the exported file.
output_columns = ['itemid', 'hotelid', 'roomtype', 'roomtype_tokens', 'roomtype_tokens_filtered', 'roomtype_tokens_str']
df_output = df.loc[:, output_columns]

df_output.to_csv('Group4_Part1_preprocessed_roomtype.csv', index=False)
print("Saved preprocessed data for 'roomtype' (selected columns) to 'Group4_Part1_preprocessed_roomtype.csv'")

# Minimal (itemid, tokens) mapping; the token-string column is exported
# under the name 'tokens'.
df_output_mapping = (
    df_output[['itemid', 'roomtype_tokens_str']]
    .rename(columns={'roomtype_tokens_str': 'tokens'})
)
df_output_mapping.to_csv('Group4_Part1_id_tokens_roomtype.csv', index=False)
print("Saved minimal mapping for 'roomtype' to 'Group4_Part1_id_tokens_roomtype.csv'")

print("\nPreprocessing for 'roomtype' column completed.")
print("\nFirst 5 rows of the preprocessed data (with selected columns):")
preview_columns = ['itemid', 'roomtype', 'roomtype_tokens', 'roomtype_tokens_filtered', 'roomtype_tokens_str']
print(df_output[preview_columns].head())

Saved preprocessed data for 'roomtype' (selected columns) to 'Group4_Part1_preprocessed_roomtype.csv'
Saved minimal mapping for 'roomtype' to 'Group4_Part1_id_tokens_roomtype.csv'

Preprocessing for 'roomtype' column completed.

First 5 rows of the preprocessed data (with selected columns):
   itemid                     roomtype roomtype_tokens  \
0       1          Comfort Double Room        {double}   
1       2  Classic Double or Twin Room  {double, twin}   
2       3         Superior Double Room        {double}   
3       4                        Suite         {suite}   
4       5  Classic Double or Twin Room  {double, twin}   

  roomtype_tokens_filtered roomtype_tokens_str  
0                 {double}              double  
1           {double, twin}         double_twin  
2                 {double}              double  
3                  {suite}               suite  
4           {double, twin}         double_twin  
