## Task 1.4: Categorical Encoding

Transforming categorical variables into numerical representations is essential for machine learning algorithms. 
We applied a tailored encoding strategy: One-Hot Encoding for `room_type` (4 binary columns),
Label + Frequency Encoding for `property_type`, and Target + Frequency + Label Encoding for `neighbourhood_cleansed`.
The target variable `value_category` was mapped to ordinal integers (Poor_Value = 0, Fair_Value = 1, Excellent_Value = 2).
This expanded our feature space by 10 new columns while maintaining data integrity.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning category for the rest of the session so the cell
# output below stays readable.
warnings.filterwarnings('ignore')

# Shared plotting defaults: seaborn grid theme and a 12x6 inch default figure.
plt.rcParams['figure.figsize'] = (12, 6)
sns.set_style('whitegrid')

# Title banner framed by two 80-character rules.
banner_rule = '=' * 80
for banner_line in (
    banner_rule,
    'TASK 1.4: CATEGORICAL ENCODING',
    'San Francisco & San Diego Airbnb Dataset',
    banner_rule,
):
    print(banner_line)

### 1. Load Data

In [None]:
# Read the feature table written by the previous step; `df` is the working
# frame that every later cell extends.
df = pd.read_csv('../../data/processed/listings_with_algebraic_features.csv')

# Report the dimensions of what was loaded.
n_rows, n_cols = df.shape
print(f'\nLoaded dataset: {df.shape}')
print(f'Rows: {n_rows:,}')
print(f'Columns: {n_cols}')

### 2. Load and Merge Categorical Columns from Raw Data

In [None]:
# Re-attach the categorical columns from the raw city exports to the processed
# frame, joining on the listing `id`.
print('\nLoading raw data to retrieve categorical columns...')
sf_raw = pd.read_csv('../../data/raw/san francisco.csv')
sd_raw = pd.read_csv('../../data/raw/san diego.csv')

raw_combined = pd.concat([sf_raw, sd_raw], ignore_index=True)
print(f'Combined raw data: {raw_combined.shape}')

# Report which of the expected columns the raw export actually contains.
print('\nChecking for categorical columns in raw data...')
for col in ['id', 'property_type', 'room_type', 'neighbourhood_cleansed']:
    if col in raw_combined.columns:
        print(f'   {col} found')
        continue
    print(f'   {col} NOT found')
    # Case-insensitive substring match to hint at a renamed column.
    similar = [c for c in raw_combined.columns if col.lower() in c.lower()]
    if similar:
        print(f'    Similar columns: {similar}')

# The join key is mandatory: fail with a clear message instead of the opaque
# indexing KeyError that selecting a missing column would raise.
if 'id' not in raw_combined.columns:
    raise KeyError('"id" column not found in raw data; cannot merge categorical columns')

# Select only the categorical columns that exist in the raw data.
categorical_cols_to_merge = ['id'] + [
    name for name in ['property_type', 'room_type', 'neighbourhood_cleansed']
    if name in raw_combined.columns
]
categorical_cols = raw_combined[categorical_cols_to_merge]

# A repeated id on the right-hand side would duplicate listings in the left
# join, so keep exactly one raw row per id.
duplicate_ids = int(categorical_cols['id'].duplicated().sum())
if duplicate_ids:
    print(f'   WARNING: {duplicate_ids:,} repeated ids in raw data; keeping the first row per id')
    categorical_cols = categorical_cols.drop_duplicates(subset='id', keep='first')

if 'id' not in df.columns:
    print('\nERROR: "id" column not found in df!')
    print(f'Available columns in df: {df.columns.tolist()[:10]}...')
else:
    # Columns present on both sides would come back as `<name>_x`/`<name>_y`.
    # The raw copy is the one that is kept, so drop the processed copy first
    # and let the merge bring the raw values in under the plain name.
    overlapping = [
        name for name in categorical_cols_to_merge
        if name != 'id' and name in df.columns
    ]
    if overlapping:
        df = df.drop(columns=overlapping)
        print(f'\nReplacing existing columns with raw versions: {overlapping}')

    # validate= guards the one-raw-row-per-id assumption; indicator= lets us
    # count listings that found no raw match (their categoricals are NaN).
    merged = df.merge(
        categorical_cols,
        on='id',
        how='left',
        validate='many_to_one',
        indicator=True,
    )
    unmatched_rows = int((merged['_merge'] == 'left_only').sum())
    df = merged.drop(columns='_merge')
    print(f'\nAfter merging categorical columns: {df.shape}')
    if unmatched_rows:
        print(f'   WARNING: {unmatched_rows:,} listings have no match in the raw data')

    # Display categorical variables
    print('\nCategorical variables to encode:')
    for name in ['room_type', 'property_type', 'neighbourhood_cleansed', 'value_category']:
        if name in df.columns:
            print(f'  {name}: {df[name].nunique()} unique values')

### 3. Categorical Encoding

In [None]:
print('\n' + '='*80)
print('PERFORMING CATEGORICAL ENCODING')
print('='*80)

# A merge against a frame that already carried a column leaves `<name>_x`
# (processed) and `<name>_y` (raw) copies. Keep the raw copy under the plain
# name. room_type is included so its suffixed copies are not mistaken for
# one-hot dummies by the `room_type_` prefix cleanup further down.
print('\nHandling merged column suffixes...')
for base_name in ['room_type', 'property_type', 'neighbourhood_cleansed']:
    left_copy, right_copy = f'{base_name}_x', f'{base_name}_y'
    if left_copy in df.columns and right_copy in df.columns:
        df[base_name] = df[right_copy]
        df = df.drop([left_copy, right_copy], axis=1)
        print(f'   Resolved {base_name} duplicates')

print(f'\nDataframe shape after cleanup: {df.shape}')

# Stop before touching df if an input is missing; continuing would fail
# part-way through and leave the frame half-encoded.
required_cols = ['room_type', 'property_type', 'neighbourhood_cleansed', 'value_category']
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    print(f'\nERROR: Missing required columns: {missing_cols}')
    print(f'Available columns: {df.columns.tolist()}')
    raise KeyError(f'Missing required columns: {missing_cols}')
print(f' All required columns present\n')

# Label/frequency/target encodings are undefined for missing categories, so
# give those rows an explicit category instead of propagating NaN.
for cat_col in ['property_type', 'neighbourhood_cleansed']:
    n_missing = int(df[cat_col].isna().sum())
    if n_missing:
        print(f"   WARNING: {cat_col}: {n_missing:,} missing values filled with 'Unknown'")
        df[cat_col] = df[cat_col].fillna('Unknown')

# Kept at notebook scope: the fitted classes are read again when the
# encoding reference files are written.
le_property = LabelEncoder()
le_neighbourhood = LabelEncoder()

# 1. Room Type - One-Hot Encoding
print('1. Encoding room_type (One-Hot Encoding)...')
# Drop dummy columns left by an earlier run so they are not duplicated.
existing_room_dummies = [col for col in df.columns if col.startswith('room_type_')]
if existing_room_dummies:
    df = df.drop(existing_room_dummies, axis=1)
    print(f'   Dropped existing dummy columns: {existing_room_dummies}')

room_dummies = pd.get_dummies(df['room_type'], prefix='room_type', drop_first=False)
df = pd.concat([df, room_dummies], axis=1)
print(f'   Created {len(room_dummies.columns)} columns: {list(room_dummies.columns)}')

# 2. Property Type - Label + Frequency Encoding
print('\n2. Encoding property_type (Label + Frequency Encoding)...')
df['property_type_label'] = le_property.fit_transform(df['property_type'])
# Frequency = share of all listings that have this property type.
df['property_type_frequency'] = df['property_type'].map(
    df['property_type'].value_counts(normalize=True)
)
print(f'   Created 2 columns:')
print(f'     property_type_label: Range 0-{df["property_type_label"].max()}')
print(f'     property_type_frequency: Range {df["property_type_frequency"].min():.4f}-{df["property_type_frequency"].max():.4f}')

# 3. Neighbourhood - Target + Frequency + Label Encoding
print('\n3. Encoding neighbourhood_cleansed (Target + Frequency + Label)...')
value_mapping = {'Poor_Value': 0, 'Fair_Value': 1, 'Excellent_Value': 2}
df['value_encoded'] = df['value_category'].map(value_mapping)

# Labels outside the mapping become NaN in value_encoded; surface them.
unmapped_labels = sorted(
    set(df['value_category'].dropna().unique()) - set(value_mapping), key=str
)
if unmapped_labels:
    print(f'   WARNING: value_category labels without a mapping (encoded as NaN): {unmapped_labels}')

# NOTE(review): this is the per-neighbourhood mean of the target computed over
# every row. If the feature is later used to predict value_category, recompute
# it on the training split only to avoid target leakage.
neighbourhood_target = df.groupby('neighbourhood_cleansed')['value_encoded'].mean()
df['neighbourhood_target_encoded'] = df['neighbourhood_cleansed'].map(neighbourhood_target)

df['neighbourhood_frequency'] = df['neighbourhood_cleansed'].map(
    df['neighbourhood_cleansed'].value_counts(normalize=True)
)

df['neighbourhood_label'] = le_neighbourhood.fit_transform(df['neighbourhood_cleansed'])
print(f'   Created 3 columns:')
print(f'     neighbourhood_target_encoded: Range {df["neighbourhood_target_encoded"].min():.4f}-{df["neighbourhood_target_encoded"].max():.4f}')
print(f'     neighbourhood_frequency: Range {df["neighbourhood_frequency"].min():.4f}-{df["neighbourhood_frequency"].max():.4f}')
print(f'     neighbourhood_label: Range 0-{df["neighbourhood_label"].max()}')

# 4. Value Category - Already encoded as value_encoded
print('\n4. Value category encoded as value_encoded (0=Poor, 1=Fair, 2=Excellent)')
print(f'   Distribution:')
for label, val in value_mapping.items():
    count = (df['value_encoded'] == val).sum()
    pct = (count / len(df)) * 100
    print(f'     {val} ({label}): {count:,} ({pct:.2f}%)')

# One-hot width depends on the room types present in the data; the other
# encodings always add 2 + 3 + 1 columns.
total_new_columns = len(room_dummies.columns) + 2 + 3 + 1

print('\n' + '='*80)
print('ENCODING COMPLETE')
print('='*80)
print(f'Total new encoded columns: {total_new_columns}')
print(f'Final dataframe shape: {df.shape}')

### 4. Data Quality Check

In [None]:
# Sanity-check the encoded frame: duplicate column names, expected encoded
# columns that are absent, and missing values inside the encoded columns.
print('\nData Quality Check:')

duplicate_cols = df.columns[df.columns.duplicated()].tolist()
if duplicate_cols:
    print(f'  WARNING: Duplicate columns found: {duplicate_cols}')
    print(f'  Removing duplicate columns...')
    # Keep the first occurrence of each repeated column name.
    df = df.loc[:, ~df.columns.duplicated()]
    print(f'  Duplicates removed. New shape: {df.shape}')

all_clean = True

# One-hot columns are discovered by prefix so every room type that is actually
# present is checked; the remaining encoded columns have fixed names.
room_dummy_cols = [col for col in df.columns if col.startswith('room_type_')]
fixed_encoding_cols = [
    'property_type_label', 'property_type_frequency',
    'neighbourhood_label', 'neighbourhood_target_encoded',
    'neighbourhood_frequency', 'value_encoded'
]
new_encoding_cols = room_dummy_cols + fixed_encoding_cols

# An expected column that does not exist must not pass as "complete".
absent_cols = [col for col in fixed_encoding_cols if col not in df.columns]
if not room_dummy_cols:
    absent_cols.insert(0, 'room_type_* (one-hot columns)')
if absent_cols:
    print(f'  WARNING: Expected encoded columns not found: {absent_cols}')
    all_clean = False

for col in new_encoding_cols:
    if col not in df.columns:
        continue
    missing = int(df[col].isna().sum())
    if missing > 0:
        print(f'  WARNING: {col}: {missing} missing values')
        all_clean = False

if all_clean:
    print('  All encoded columns are complete (no missing values)')

### 5. Save Encoded Dataset

In [None]:
# Write the fully encoded frame to the processed-data folder (row index is
# not written), then confirm the location and dimensions.
output_path = '../../data/processed/listings_with_categorical_encoding.csv'
df.to_csv(output_path, index=False)

saved_rows, saved_cols = df.shape
print(f'\nSaved encoded dataset to: {output_path}')
print(f'Shape: {saved_rows:,} rows x {saved_cols} columns')

### 6. Create Encoding Reference Files

In [None]:
import os

# Write one lookup table per label-encoded variable: category -> integer
# label, plus the frequency/target values and row count for that category.
print('\nCreating encoding reference files...')

# to_csv does not create directories, so make sure the target exists.
os.makedirs('../../outputs', exist_ok=True)

# Property type mapping
property_mapping = pd.DataFrame({
    'property_type': le_property.classes_,
    'label': range(len(le_property.classes_))
})
# The frequency is constant within a category, so the first row suffices.
property_frequency = (
    df.groupby('property_type')['property_type_frequency'].first().reset_index()
)
# Name the key and count columns explicitly: the column names produced by
# value_counts().reset_index() differ between pandas versions.
property_counts = (
    df['property_type'].value_counts()
    .rename_axis('property_type')
    .reset_index(name='count')
)
property_mapping = (
    property_mapping
    .merge(property_frequency, on='property_type')
    .merge(property_counts, on='property_type')
    .sort_values('count', ascending=False)
)
property_mapping.to_csv('../../outputs/property_type_encoding_map.csv', index=False)
print('  Saved: outputs/property_type_encoding_map.csv')

# Neighbourhood mapping
neighbourhood_mapping = pd.DataFrame({
    'neighbourhood': le_neighbourhood.classes_,
    'label': range(len(le_neighbourhood.classes_))
})
# Both encodings are constant within a neighbourhood; take the first row and
# rename the key so it matches the mapping table's column.
neighbourhood_values = (
    df.groupby('neighbourhood_cleansed')
    .agg({
        'neighbourhood_target_encoded': 'first',
        'neighbourhood_frequency': 'first'
    })
    .reset_index()
    .rename(columns={'neighbourhood_cleansed': 'neighbourhood'})
)
neighbourhood_counts = (
    df['neighbourhood_cleansed'].value_counts()
    .rename_axis('neighbourhood')
    .reset_index(name='count')
)
neighbourhood_mapping = (
    neighbourhood_mapping
    .merge(neighbourhood_values, on='neighbourhood')
    .merge(neighbourhood_counts, on='neighbourhood')
    .sort_values('count', ascending=False)
)
neighbourhood_mapping.to_csv('../../outputs/neighbourhood_encoding_map.csv', index=False)
print('  Saved: outputs/neighbourhood_encoding_map.csv')

### 7. Encoding Statistics

In [None]:
import os

# Tabulate, per categorical variable, its cardinality, the encoding applied
# and the number of columns that encoding produced.
print('\n' + '='*80)
print('ENCODING STATISTICS')
print('='*80)

# Measure the one-hot width from the frame so the table reflects the data
# rather than an assumed number of room types.
room_dummy_count = sum(col.startswith('room_type_') for col in df.columns)

encoding_stats = {
    'Variable': ['room_type', 'property_type', 'neighbourhood', 'value_category'],
    'Original_Cardinality': [
        df['room_type'].nunique(),
        df['property_type'].nunique(),
        df['neighbourhood_cleansed'].nunique(),
        df['value_category'].nunique(),
    ],
    'Encoding_Method': ['One-Hot', 'Label + Frequency', 'Target + Frequency + Label', 'Label (Ordinal)'],
    'Columns_Created': [room_dummy_count, 2, 3, 1]
}

stats_df = pd.DataFrame(encoding_stats)
print('\n' + stats_df.to_string(index=False))

# to_csv does not create directories, so make sure the target exists.
os.makedirs('../../outputs', exist_ok=True)
stats_df.to_csv('../../outputs/categorical_encoding_statistics.csv', index=False)
print('\nSaved: outputs/categorical_encoding_statistics.csv')

### 8. Visualizations

In [None]:
import os

print('\n' + '='*80)
print('CREATING VISUALIZATIONS')
print('='*80)

# savefig does not create directories, so make sure the target exists.
os.makedirs('../../outputs/figures', exist_ok=True)

# One-hot columns are discovered by prefix so a room type that is absent from
# the data does not raise a KeyError.
room_type_prefix = 'room_type_'
room_type_columns = [col for col in df.columns if col.startswith(room_type_prefix)]
room_type_labels = [col[len(room_type_prefix):] for col in room_type_columns]

# Figure 1: Categorical Encoding Analysis
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Categorical Encoding Analysis', fontsize=16, fontweight='bold')

# Room Type Distribution (column sums of the indicators = listings per type)
room_type_data = df[room_type_columns].sum()
axes[0, 0].bar(range(len(room_type_data)), room_type_data.values, color='skyblue')
axes[0, 0].set_xticks(range(len(room_type_data)))
axes[0, 0].set_xticklabels(room_type_labels, rotation=45, ha='right')
axes[0, 0].set_title('Room Type Distribution (One-Hot Encoded)')
axes[0, 0].set_ylabel('Count')

# Property Type Frequency Distribution
axes[0, 1].hist(df['property_type_frequency'], bins=30, color='coral', edgecolor='black')
axes[0, 1].set_title('Property Type Frequency Distribution')
axes[0, 1].set_xlabel('Frequency')
axes[0, 1].set_ylabel('Count')

# Neighbourhood Target Encoding Distribution
axes[1, 0].hist(df['neighbourhood_target_encoded'], bins=30, color='lightgreen', edgecolor='black')
axes[1, 0].set_title('Neighbourhood Target Encoding Distribution')
axes[1, 0].set_xlabel('Target Encoded Value')
axes[1, 0].set_ylabel('Count')

# Value Category Distribution
# Reindex onto all three codes so a class with no rows shows as a zero bar
# instead of breaking the label/height alignment.
value_counts = df['value_encoded'].value_counts().reindex([0, 1, 2], fill_value=0)
axes[1, 1].bar(['Poor Value', 'Fair Value', 'Excellent Value'], value_counts.values,
               color=['#ff6b6b', '#ffd93d', '#6bcf7f'])
axes[1, 1].set_title('Value Category Distribution (Label Encoded)')
axes[1, 1].set_ylabel('Count')
axes[1, 1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('../../outputs/figures/categorical_encoding_analysis.png', dpi=300, bbox_inches='tight')
plt.show()
print('\nSaved: outputs/figures/categorical_encoding_analysis.png')

# Figure 2: Encoding Methods Comparison
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Encoding Methods Comparison', fontsize=16, fontweight='bold')

# Cardinality Comparison (measured from the data for every variable)
variables = ['room_type', 'property_type', 'neighbourhood', 'value_category']
cardinalities = [
    df['room_type'].nunique(),
    df['property_type'].nunique(),
    df['neighbourhood_cleansed'].nunique(),
    df['value_category'].nunique(),
]
colors_card = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12']
axes[0, 0].barh(variables, cardinalities, color=colors_card)
axes[0, 0].set_title('Original Cardinality by Variable')
axes[0, 0].set_xlabel('Number of Unique Categories')

# Encoding Methods Used
methods = ['One-Hot', 'Label +\nFrequency', 'Target +\nFrequency +\nLabel', 'Label\n(Ordinal)']
columns_created = [len(room_type_columns), 2, 3, 1]
axes[0, 1].bar(range(len(methods)), columns_created, color=colors_card)
axes[0, 1].set_xticks(range(len(methods)))
axes[0, 1].set_xticklabels(methods)
axes[0, 1].set_title('Columns Created by Encoding Method')
axes[0, 1].set_ylabel('Number of Columns')

# Top-10 panels: most frequent property types and neighbourhoods, drawn as
# horizontal bars with the largest category at the top.
top_properties = df['property_type'].value_counts().head(10)
top_neighbourhoods = df['neighbourhood_cleansed'].value_counts().head(10)
top10_panels = [
    (axes[1, 0], top_properties, 'Top 10 Property Types', 'steelblue'),
    (axes[1, 1], top_neighbourhoods, 'Top 10 Neighbourhoods', 'mediumseagreen'),
]
for panel_ax, top_counts, panel_title, bar_color in top10_panels:
    positions = range(len(top_counts))
    panel_ax.barh(positions, top_counts.values, color=bar_color)
    panel_ax.set_yticks(positions)
    panel_ax.set_yticklabels(top_counts.index, fontsize=9)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Count')
    panel_ax.invert_yaxis()

plt.tight_layout()
plt.savefig('../../outputs/figures/encoding_methods_comparison.png', dpi=300, bbox_inches='tight')
plt.show()
print('Saved: outputs/figures/encoding_methods_comparison.png')

### 9. Summary Report

In [None]:
import os

print('\n' + '='*80)
print('TASK 1.4 SUMMARY REPORT')
print('='*80)

# Derive the figures and quality statements from the frame itself so the
# report cannot claim a clean result that the data does not support.
room_dummy_count = sum(col.startswith('room_type_') for col in df.columns)
encoded_cols_present = [col for col in df.columns if col.startswith('room_type_')] + [
    col for col in [
        'property_type_label', 'property_type_frequency',
        'neighbourhood_label', 'neighbourhood_target_encoded',
        'neighbourhood_frequency', 'value_encoded'
    ]
    if col in df.columns
]
added_features = len(encoded_cols_present)
missing_encoded_values = int(df[encoded_cols_present].isna().sum().sum())
has_duplicate_columns = bool(df.columns.duplicated().any())

if missing_encoded_values == 0:
    missing_line = 'No missing values in encoded columns'
else:
    missing_line = f'WARNING: {missing_encoded_values:,} missing values in encoded columns'

if has_duplicate_columns:
    duplicate_line = 'WARNING: duplicate column names present'
else:
    duplicate_line = 'No duplicate columns'

if missing_encoded_values == 0 and not has_duplicate_columns:
    status_line = 'All encodings properly applied'
else:
    status_line = 'WARNING: review the encoded columns before modelling'

summary = f"""
CATEGORICAL ENCODING COMPLETED
{'='*80}

DATASET:
  - Final shape: {df.shape[0]:,} rows x {df.shape[1]} columns
  - Added features: {added_features}

ENCODING BREAKDOWN:
  1. room_type (One-Hot): {room_dummy_count} columns
  2. property_type (Label + Frequency): 2 columns
  3. neighbourhood (Target + Frequency + Label): 3 columns
  4. value_category (Label Ordinal): 1 column

DATA QUALITY:
  - {missing_line}
  - {duplicate_line}
  - {status_line}

OUTPUT FILES:
  - data/processed/listings_with_categorical_encoding.csv
  - outputs/property_type_encoding_map.csv
  - outputs/neighbourhood_encoding_map.csv
  - outputs/categorical_encoding_statistics.csv
  - outputs/figures/categorical_encoding_analysis.png
  - outputs/figures/encoding_methods_comparison.png



{'='*80}
"""

print(summary)

# Persist the same text next to the other outputs; the encoding is set
# explicitly so the file does not depend on the platform default.
os.makedirs('../../outputs/reports', exist_ok=True)
with open('../../outputs/reports/T1.4_summary.txt', 'w', encoding='utf-8') as f:
    f.write(summary)

print('Summary saved to: outputs/reports/T1.4_summary.txt')