# 2. Categorical Variable Encoding
This notebook encodes categorical variables using various techniques including Label Encoding, One-Hot Encoding, and Target Encoding.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Load the dataset (after missing data handling)
# Prefer the CSV saved after missing data handling; fall back to the original
# dataset file when that CSV is not present in the working directory.
try:
    df = pd.read_csv('data_after_missing_handling.csv')
    print("Loaded dataset after missing data handling")
except FileNotFoundError:
    df = pd.read_csv('CAR DETAILS FROM CAR DEKHO.csv')
    print("Loaded original dataset")

print(f"Dataset shape: {df.shape}")
print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()

In [None]:
# Split the columns by dtype: object columns are treated as categorical,
# numeric columns as numerical.
categorical_cols = list(df.select_dtypes(include=['object']).columns)
numerical_cols = list(df.select_dtypes(include=[np.number]).columns)

print(f"Categorical columns: {categorical_cols}")
print(f"Numerical columns: {numerical_cols}")

# Report the cardinality and the most frequent values of each categorical column
print("\nCategorical variable analysis:")
for col in categorical_cols:
    column = df[col]
    unique_count = column.nunique()
    top_values = column.value_counts().head()
    print(f"\n{col}: {unique_count} unique values")
    print(f"Values: {top_values}")

In [None]:
# Visualize categorical distributions
# One bar chart (top 10 values) per categorical column, laid out two per row.
# The grid is sized from the number of columns so that no column is silently
# left out and no empty panel is drawn.
n_panels = len(categorical_cols)

if n_panels:
    n_grid_cols = 2
    n_grid_rows = -(-n_panels // n_grid_cols)  # ceiling division

    # squeeze=False keeps `axes` two-dimensional even for a single row
    fig, axes = plt.subplots(
        n_grid_rows, n_grid_cols, figsize=(15, 6 * n_grid_rows), squeeze=False
    )
    axes = axes.ravel()

    for ax, col in zip(axes, categorical_cols):
        df[col].value_counts().head(10).plot(kind='bar', ax=ax)
        ax.set_title(f'Distribution of {col}')
        ax.tick_params(axis='x', rotation=45)

    # Hide the trailing panel when the number of columns is odd
    for ax in axes[n_panels:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()
else:
    print("No categorical columns to visualize")

In [None]:
def encode_categorical_variables(df):
    """
    Encode the categorical columns of *df* and return the result as a new frame.

    The input frame is not modified. Each step is skipped when the column it
    needs is absent.

    Strategies:
    1. Ordinal (label) encoding for ``owner`` -> ``owner_encoded``
       (labels outside the known order, including missing values, become -1)
    2. One-Hot Encoding (first level dropped) for ``fuel``, ``seller_type``
       and ``transmission``
    3. Leave-one-out Target Encoding for ``name`` -> ``name_target_encoded``,
       plus a frequency encoding -> ``name_frequency_encoded``

    Args:
        df: DataFrame holding the raw categorical columns. ``selling_price``
            has to be numeric for the target encoding.

    Returns:
        A copy of *df* with the encoded columns appended. The original
        categorical columns are kept.
    """
    df_encoded = df.copy()

    # Nominal variables with few categories, encoded one-hot
    low_cardinality_vars = ['fuel', 'seller_type', 'transmission']

    print("=== ENCODING CATEGORICAL VARIABLES ===")

    # 1. Label Encoding for ordinal variables
    if 'owner' in df_encoded.columns:
        # Custom order for the owner variable.
        # NOTE(review): 'Test Drive Car' is ranked after 'Fourth & Above Owner';
        # confirm that this is the intended position in the ordering.
        owner_order = ['First Owner', 'Second Owner', 'Third Owner', 'Fourth & Above Owner',
                       'Test Drive Car']
        owner_mapping = {owner: i for i, owner in enumerate(owner_order)}

        # Unknown labels map to NaN and are replaced by -1. The cast keeps the
        # column integer-typed whether or not unknown labels are present.
        df_encoded['owner_encoded'] = (
            df_encoded['owner'].map(owner_mapping).fillna(-1).astype(int)
        )

        print(f"✓ Label encoded 'owner': {owner_mapping}")

    # 2. One-Hot Encoding for low cardinality nominal variables
    for var in low_cardinality_vars:
        if var in df_encoded.columns:
            # drop_first avoids a redundant (perfectly collinear) dummy column
            dummies = pd.get_dummies(df_encoded[var], prefix=var, drop_first=True)
            df_encoded = pd.concat([df_encoded, dummies], axis=1)
            print(f"✓ One-hot encoded '{var}': {dummies.columns.tolist()}")

    # 3. Target Encoding for high cardinality variables (car names)
    if 'name' in df_encoded.columns and 'selling_price' in df_encoded.columns:
        names = df_encoded['name']
        price = df_encoded['selling_price']

        # A plain per-name mean would include each row's own price, so a name
        # that occurs once would be encoded as exactly its target value. Use
        # the mean over the *other* rows with the same name instead.
        per_name = df_encoded.groupby('name')['selling_price'].agg(['sum', 'count'])
        name_sum = names.map(per_name['sum'])
        name_count = names.map(per_name['count'])

        # Remove the row's own contribution (rows without a price contribute none)
        others_sum = name_sum - price.fillna(0)
        others_count = name_count - price.notna().astype(int)
        leave_one_out_mean = others_sum / others_count.where(others_count > 0)

        # Names with no other priced row fall back to the overall mean price;
        # rows without a name stay missing.
        df_encoded['name_target_encoded'] = (
            leave_one_out_mean.fillna(price.mean()).where(names.notna())
        )
        print("✓ Target encoded 'name' using mean selling_price")

        # Also create a frequency encoding
        name_frequency = names.value_counts().to_dict()
        df_encoded['name_frequency_encoded'] = names.map(name_frequency)
        print("✓ Frequency encoded 'name' (car model popularity)")

    return df_encoded

# Apply categorical encoding
df_encoded = encode_categorical_variables(df)

print(f"\nDataset shape after encoding: {df_encoded.shape}")
# List the additions in column order; a set difference would be printed in an
# arbitrary order that can change from run to run.
new_columns = [col for col in df_encoded.columns if col not in df.columns]
print(f"New columns added: {new_columns}")

In [None]:
# Alternative encoding methods demonstration
def alternative_encoding_methods(df):
    """
    Demonstrate an alternative encoding method: binary encoding of ``fuel``.

    Each distinct ``fuel`` value is numbered in order of first appearance and
    the number is written in base 2, one ``fuel_binary_<bit>`` column per
    digit (bit 0 is the most significant digit). Values that are not in the
    mapping are encoded as 0 in every bit column.

    Hash Encoding and Leave One Out Encoding are further alternatives; they
    are not implemented by this function.

    Args:
        df: DataFrame to encode. Nothing is added when it has no ``fuel``
            column.

    Returns:
        A copy of *df* with the binary columns appended; *df* itself is not
        modified.
    """
    df_alt = df.copy()

    # Binary encoding (manual implementation for demonstration)
    if 'fuel' in df_alt.columns:
        fuel_categories = df_alt['fuel'].unique()

        # Digits needed for the codes 0..n-1, i.e. ceil(log2(n)). Integer
        # arithmetic is used so that an empty column yields 0 bits instead of
        # failing on log2(0).
        n_bits = max(len(fuel_categories) - 1, 0).bit_length()

        # Create binary representation
        fuel_mapping = {cat: format(i, f'0{n_bits}b') for i, cat in enumerate(fuel_categories)}

        for bit in range(n_bits):
            # Digit at position `bit` of every category's code
            digit_by_category = {cat: int(code[bit]) for cat, code in fuel_mapping.items()}
            df_alt[f'fuel_binary_{bit}'] = df_alt['fuel'].map(
                lambda x: digit_by_category.get(x, 0)
            )

        print(f"Binary encoding for fuel: {fuel_mapping}")

    return df_alt

# Run the alternative-encoding demo on the un-encoded frame and list the
# columns it produced
print("\n=== ALTERNATIVE ENCODING METHODS ===")
df_alt_encoded = alternative_encoding_methods(df)
binary_columns = [name for name in df_alt_encoded.columns if 'binary' in name]
print(f"Binary encoded columns: {binary_columns}")

In [None]:
# Analyze encoding results
print("\n=== ENCODING ANALYSIS ===")

# Correlate every encoded column with the target, strongest (by absolute
# value) first
if 'selling_price' in df_encoded.columns:
    # Substrings that identify the columns produced by the encoding step
    encoded_markers = ('_encoded', 'fuel_', 'seller_type_', 'transmission_')
    encoded_cols = [
        col for col in df_encoded.columns
        if any(marker in col for marker in encoded_markers)
    ]

    correlation_matrix = df_encoded[encoded_cols + ['selling_price']].corr()
    target_correlations = correlation_matrix['selling_price'].drop('selling_price')
    correlations = target_correlations.sort_values(key=abs, ascending=False)

    print("\nCorrelations with selling_price:")
    print(correlations)

    # Horizontal bar chart of the correlations
    plt.figure(figsize=(10, 6))
    correlations.plot(kind='barh')
    plt.title('Correlation of Encoded Variables with Selling Price')
    plt.xlabel('Correlation Coefficient')
    plt.tight_layout()
    plt.show()

In [None]:
# Create final encoded dataset (drop original categorical columns)
def create_final_encoded_dataset(df_encoded):
    """
    Return a copy of *df_encoded* without the original categorical columns.

    The columns ``name``, ``fuel``, ``seller_type``, ``transmission`` and
    ``owner`` are removed when present; all other columns are kept. The
    dropped columns, the resulting shape and the remaining column names are
    printed. The input frame is left unchanged.
    """
    raw_categoricals = ('name', 'fuel', 'seller_type', 'transmission', 'owner')

    # Only drop what is actually there, so a partially prepared frame works too
    cols_to_drop = []
    for column in raw_categoricals:
        if column in df_encoded.columns:
            cols_to_drop.append(column)

    # drop() returns a new frame, so the caller's frame is not modified
    df_final = df_encoded.drop(columns=cols_to_drop)

    print(f"Dropped original categorical columns: {cols_to_drop}")
    print(f"Final dataset shape: {df_final.shape}")
    print(f"Final columns: {df_final.columns.tolist()}")

    return df_final

# Build the final frame (encoded columns only) and persist it
df_final_encoded = create_final_encoded_dataset(df_encoded)

# Save the encoded dataset
output_path = 'data_after_encoding.csv'
df_final_encoded.to_csv(output_path, index=False)
print(f"\nEncoded dataset saved as '{output_path}'")

# Summary of the steps performed in this notebook
summary_steps = (
    "Applied Label Encoding for ordinal variables (owner)",
    "Applied One-Hot Encoding for nominal variables (fuel, seller_type, transmission)",
    "Applied Target Encoding for high-cardinality variables (car names)",
    "Applied Frequency Encoding for model popularity",
    "Demonstrated Binary Encoding as alternative method",
    "Analyzed correlations with target variable",
    "Created final encoded dataset",
)
print("\n=== CATEGORICAL ENCODING SUMMARY ===")
for step in summary_steps:
    print(f"✓ {step}")