## Part 2: Numerical Feature Engineering
#### 1. Setup and Load Training and Test Data
- Import libraries.
- Define file path and load data: train_data and test_data from Part #1.
- Initial checks: Print the shape of each loaded dataset and parse the 'date' column into datetime objects, which are needed for calculating age-related features.


In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

print("Part 2: Numerical Feature Engineering")

# Load data
train_data_path = r"./train_data.csv"
test_data_path = r"./test_data.csv"

# A missing input file is fatal for every later cell. Stop with a non-zero
# SystemExit instead of calling `exit()`: that helper is injected by the
# `site` module for interactive use and reports status 0 even on this error.
try:
    df_train = pd.read_csv(train_data_path)
except FileNotFoundError:
    print(f"Error: Training data file not found at {train_data_path}")
    raise SystemExit(1)
print(f"Training data loaded successfully from {train_data_path}.")
print(f"Shape of the training dataset: {df_train.shape}")

try:
    df_test = pd.read_csv(test_data_path)
except FileNotFoundError:
    print(f"Error: Test data file not found at {test_data_path}")
    raise SystemExit(1)
print(f"Test data loaded successfully from {test_data_path}.")
print(f"Shape of the test dataset: {df_test.shape}")

# Ensure 'date' column is parsed correctly if needed for age calculation
for df, name in zip([df_train, df_test], ["train", "test"]):
    try:
        missing_before = df['date'].isna().sum()
        # Assigning the column updates df_train / df_test in place.
        df['date'] = pd.to_datetime(df['date'], errors='coerce')
    except KeyError:
        print(f"Warning: 'date' column not found in {name}_data.csv.")
    except Exception as e:
        # Best-effort: a parsing failure is reported but does not stop the run.
        print(f"Warning: Could not parse 'date' column in {name} set. Error: {e}")
    else:
        print(f"'date' column parsed to datetime for {name} set.")
        # errors='coerce' silently turns unparseable values into NaT; surface
        # how many were lost so the age features can be interpreted correctly.
        coerced = df['date'].isna().sum() - missing_before
        if coerced > 0:
            print(f"Warning: {coerced} unparseable 'date' value(s) in {name} set were coerced to NaT.")

Part 2: Numerical Feature Engineering
Training data loaded successfully from ./train_data.csv.
Shape of the training dataset: (3641, 19)
Test data loaded successfully from ./test_data.csv.
Shape of the test dataset: (911, 19)
'date' column parsed to datetime for train set.
'date' column parsed to datetime for test set.


### 2. Define Numerical Feature Engineering Function
- Define the logic for creating new features derived only from the existing numerical columns.

- Take a DataFrame (training or test) as input and return a new DataFrame that contains only the engineered numerical features.

- Preserve the original index for later merging.

- Feature Categories:

   - Calculate domain-specific features (e.g., house age, time since renovation, etc.).
   - Transformations: Log-transform numerical features that might be skewed, which helps linear models perform better.
   - Interactions/Polynomials: Create features for combined effects (e.g., bedrooms * bathrooms) or non-linear relationships (e.g., sqft_living^2).

- The same function is applied to the test dataset to ensure consistent feature representation across datasets.

In [11]:
# Define Numerical Feature Engineering Function

def _non_negative_or_zero(values):
    """Return *values* with NaN replaced by 0 and negative entries raised to 0."""
    return values.fillna(0).clip(lower=0)


def _ratio_or_fallback(numerator, denominator, fallback):
    """Return numerator / denominator where it is well defined, else *fallback*.

    The ratio is used only on rows where the denominator is strictly positive
    and both operands are non-null; every other row takes *fallback* (a scalar
    or a Series aligned with the operands).
    """
    usable = (denominator > 0) & denominator.notna() & numerator.notna()
    return np.where(usable, numerator / denominator, fallback)


def engineer_numerical_features(df_input):
    """Build engineered numerical features from a housing DataFrame.

    Each feature is created only when its source columns are present, so the
    set of output columns depends on the columns of ``df_input``. Progress is
    reported with ``print``. ``df_input`` is only read, never modified.

    Features (in output order):
      * ``house_age``, ``years_since_last_action``, ``was_renovated`` --
        require a datetime ``date`` column and ``yr_built`` (the latter two
        also ``yr_renovated``). Missing or negative ages become 0.
      * ``sqft_living_per_floor``, ``sqft_basement_ratio``,
        ``sqft_living_lot_ratio`` -- guarded ratios.
      * ``total_rooms_approx`` -- bedrooms + bathrooms + 1.
      * ``log_<col>`` -- ``log1p`` of each sqft column (NaN treated as 0).
      * ``sqft_living_sq``, ``bed_bath_interaction``, ``age_sqft_interaction``.

    Args:
        df_input: DataFrame holding the raw numerical columns.

    Returns:
        A new DataFrame with the same index as ``df_input`` containing only
        the engineered features.
    """
    print("\nStart numerical feature engineering...")
    df = df_input  # Only read below, so no defensive copy is required
    engineered_features = pd.DataFrame(index=df.index)  # Create df for new features

    # Domain-Specific Features

    # House Age & Renovation Features
    has_sale_year = (
        'date' in df.columns
        and pd.api.types.is_datetime64_any_dtype(df['date'])
        and 'yr_built' in df.columns
    )
    if has_sale_year:
        current_year = df['date'].dt.year
        # Vectorized clamp: NaN (e.g. from NaT dates) and negative ages -> 0
        engineered_features['house_age'] = _non_negative_or_zero(current_year - df['yr_built'])
        print("Created 'house_age'.")

        if 'yr_renovated' in df.columns:
            # Year of the most recent build or renovation. The row-wise max
            # already yields yr_built when the renovation year is earlier.
            last_action_year = df[['yr_built', 'yr_renovated']].max(axis=1)
            # yr_renovated == 0 means "never renovated": fall back to yr_built
            last_action_year = np.where(df['yr_renovated'] == 0, df['yr_built'], last_action_year)

            engineered_features['years_since_last_action'] = _non_negative_or_zero(
                current_year - last_action_year
            )

            # Binary flag for renovation
            engineered_features['was_renovated'] = (df['yr_renovated'] > 0).astype(int)
            print("Created 'years_since_last_action' and 'was_renovated'.")

    # Sqft ratios and combinations
    if 'sqft_living' in df.columns and 'floors' in df.columns:
        # Falls back to the raw sqft_living when floors is 0 or missing
        engineered_features['sqft_living_per_floor'] = _ratio_or_fallback(
            df['sqft_living'], df['floors'], df['sqft_living']
        )
        print("Created 'sqft_living_per_floor'.")

    if 'sqft_basement' in df.columns and 'sqft_living' in df.columns:
        # Assign 0 if sqft_living is 0 or NaN
        engineered_features['sqft_basement_ratio'] = _ratio_or_fallback(
            df['sqft_basement'], df['sqft_living'], 0
        )
        print("Created 'sqft_basement_ratio'.")

    if 'sqft_living' in df.columns and 'sqft_lot' in df.columns:
        # Assign 0 if sqft_lot is 0 or NaN
        engineered_features['sqft_living_lot_ratio'] = _ratio_or_fallback(
            df['sqft_living'], df['sqft_lot'], 0
        )
        print("Created 'sqft_living_lot_ratio'.")

    # Total Rooms Approximation
    if 'bedrooms' in df.columns and 'bathrooms' in df.columns:
        engineered_features['total_rooms_approx'] = (
            df['bedrooms'].fillna(0) + df['bathrooms'].fillna(0) + 1  # Handle NaNs
        )
        print("Created 'total_rooms_approx'.")

    # Transformations
    # Log transform skewed numerical features
    # Apply log1p (log(1+x)) for potential zeros
    skewed_features = ['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement']
    for col in skewed_features:
        if col not in df.columns:
            continue
        # Ensure column is numeric and non-negative before log transform
        if not pd.api.types.is_numeric_dtype(df[col]):
            print(f"Skipping log transform for '{col}' due to non-numeric type.")
            continue
        # Fill NaNs with 0 before log1p
        feature_col = df[col].fillna(0)
        if (feature_col >= 0).all():
            engineered_features[f'log_{col}'] = np.log1p(feature_col)
            print(f"Created 'log_{col}'.")
        else:
            print(f"Skipping log transform for '{col}' due to negative values after fillna(0).")

    # Polynomial Features & Interactions

    if 'sqft_living' in df.columns:
        # Simple Polynomial: Square of sqft_living
        engineered_features['sqft_living_sq'] = df['sqft_living'].fillna(0)**2  # Handle NaNs
        print("Created 'sqft_living_sq'.")

    # Interaction: Bedrooms * Bathrooms
    if 'bedrooms' in df.columns and 'bathrooms' in df.columns:
        engineered_features['bed_bath_interaction'] = (
            df['bedrooms'].fillna(0) * df['bathrooms'].fillna(0)  # Handle NaNs
        )
        print("Created 'bed_bath_interaction'.")

    # Interaction: Age * Sqft Living
    if 'house_age' in engineered_features.columns and 'sqft_living' in df.columns:
        engineered_features['age_sqft_interaction'] = (
            engineered_features['house_age'].fillna(0) * df['sqft_living'].fillna(0)  # Handle NaNs
        )
        print("Created 'age_sqft_interaction'.")

    print(f"\nFinished numerical feature engineering. Total new features: {engineered_features.shape[1]}")
    return engineered_features

### 3. Apply Feature Engineering Function to Training and Test Data
- Execute the engineer_numerical_features() function on both `df_train` and `df_test`.
- Store the outputs in two new DataFrames:
  - `df_engineered_numerical_train` — containing the engineered features from the training data.
  - `df_engineered_numerical_test` — containing the engineered features from the test data.

In [12]:
# --- Engineer features for the training split ---
df_engineered_numerical_train = engineer_numerical_features(df_input=df_train)

# --- Engineer features for the test split (same function, same logic) ---
df_engineered_numerical_test = engineer_numerical_features(df_input=df_test)


Start numerical feature engineering...
Created 'house_age'.
Created 'years_since_last_action' and 'was_renovated'.
Created 'sqft_living_per_floor'.
Created 'sqft_basement_ratio'.
Created 'sqft_living_lot_ratio'.
Created 'total_rooms_approx'.
Created 'log_sqft_living'.
Created 'log_sqft_lot'.
Created 'log_sqft_above'.
Created 'log_sqft_basement'.
Created 'sqft_living_sq'.
Created 'bed_bath_interaction'.
Created 'age_sqft_interaction'.

Finished numerical feature engineering. Total new features: 14

Start numerical feature engineering...
Created 'house_age'.
Created 'years_since_last_action' and 'was_renovated'.
Created 'sqft_living_per_floor'.
Created 'sqft_basement_ratio'.
Created 'sqft_living_lot_ratio'.
Created 'total_rooms_approx'.
Created 'log_sqft_living'.
Created 'log_sqft_lot'.
Created 'log_sqft_above'.
Created 'log_sqft_basement'.
Created 'sqft_living_sq'.
Created 'bed_bath_interaction'.
Created 'age_sqft_interaction'.

Finished numerical feature engineering. Total new feature

### 4. Review Engineered Features
- Inspect the first few rows (`.head()`) and the dimensions (`.shape`) of the resulting engineered-features DataFrames.
- Check summary statistics for the new features' ranges and identify any potential issues (e.g., remaining NaNs).

In [4]:
# --- Display Results ---
# Each report is a heading followed by its value, one per line.
print(
    "\n--- Engineered Numerical Features (Training Set Head) ---",
    df_engineered_numerical_train.head(),
    sep="\n",
)

print(
    "\nShape of engineered numerical features dataframe:",
    df_engineered_numerical_train.shape,
    sep="\n",
)

print(
    "\nBasic stats of engineered features:",
    df_engineered_numerical_train.describe(),
    sep="\n",
)

# Per-column count of missing values left after feature engineering
print(
    "\nNaN check in engineered features:",
    df_engineered_numerical_train.isnull().sum(),
    sep="\n",
)


--- Engineered Numerical Features (Training Set Head) ---
   house_age  years_since_last_action  was_renovated  sqft_living_per_floor  \
0          8                        8              0                 1475.0   
1         39                       39              0                 1920.0   
2          7                        7              0                 1045.0   
3         17                       17              0                 1485.0   
4         16                        8              1                  765.0   

   sqft_basement_ratio  sqft_living_lot_ratio  total_rooms_approx  \
0             0.000000               0.287693                6.50   
1             0.218750               0.250489                6.75   
2             0.000000               0.402310                7.50   
3             0.212121               0.205026                8.50   
4             0.000000               0.441686                6.50   

   log_sqft_living  log_sqft_lot  log_sqft_above  l

In [13]:
# --- Display Results (Test Set) ---
# Each report is a heading followed by its value, one per line.
print(
    "\n--- Engineered Numerical Features (Test Set Head) ---",
    df_engineered_numerical_test.head(),
    sep="\n",
)

print(
    "\nShape of engineered numerical features dataframe (Test):",
    df_engineered_numerical_test.shape,
    sep="\n",
)

print(
    "\nBasic stats of engineered features (Test):",
    df_engineered_numerical_test.describe(),
    sep="\n",
)

# Per-column count of missing values left after feature engineering
print(
    "\nNaN check in engineered features (Test):",
    df_engineered_numerical_test.isnull().sum(),
    sep="\n",
)



--- Engineered Numerical Features (Test Set Head) ---
   house_age  years_since_last_action  was_renovated  sqft_living_per_floor  \
0         13                       13              0            5420.000000   
1         70                       70              0             933.333333   
2         23                       23              0            1365.000000   
3         51                       51              0            1260.000000   
4         37                       37              0            2300.000000   

   sqft_basement_ratio  sqft_living_lot_ratio  total_rooms_approx  \
0             0.282288               0.053174                9.50   
1             0.000000               0.172840                5.00   
2             0.000000               0.222657                7.50   
3             0.000000               0.134817                5.75   
4             0.147826               0.196581                6.75   

   log_sqft_living  log_sqft_lot  log_sqft_above  log_s

### 5. Export Results/Outputs
- Define output path
- Save df

In [14]:
# --- Export Outputs for Member #4 ---
# Paths that were actually written, so the final summary never claims a
# deliverable that failed to save.
saved_paths = []

# Export engineered numerical features for training set
print("\nExporting Engineered Features")
engineered_output_path = r"./engineered_numerical_train.csv"

try:
    # Save the engineered numerical features training set (index kept for later merging)
    df_engineered_numerical_train.to_csv(engineered_output_path, index=True)
except Exception as e:
    # Best-effort export: report the failure and continue with the test set.
    print(f"Error saving engineered features file: {e}")
else:
    saved_paths.append(engineered_output_path)
    print(f"\nEngineered numerical features saved successfully to: {engineered_output_path}")
    print(f"Shape of saved engineered features: {df_engineered_numerical_train.shape}")

# Also export engineered numerical features for test set
print("\nExporting Engineered Features (Test Set)")
engineered_test_output_path = r"./engineered_numerical_test.csv"

try:
    # Save the engineered numerical features test set
    df_engineered_numerical_test.to_csv(engineered_test_output_path, index=True)
except Exception as e:
    # A FileNotFoundError here concerns the *output* location, not the input
    # test_data.csv, so it is reported like any other save failure.
    print(f"Error saving engineered features file (test): {e}")
else:
    saved_paths.append(engineered_test_output_path)
    print(f"\nEngineered numerical features (test) saved successfully to: {engineered_test_output_path}")
    print(f"Shape of saved engineered features (test): {df_engineered_numerical_test.shape}")

if len(saved_paths) == 2:
    print("\n--- Part 2 Tasks Completed ---")
    print(f"Deliverable files saved: {engineered_output_path} and {engineered_test_output_path}")
else:
    print("\n--- Part 2 Tasks Incomplete ---")
    print(f"Deliverable files saved: {' and '.join(saved_paths) if saved_paths else 'none'}")
print("Deliverable code: The 'engineer_numerical_features' function definition (share this script).")


Exporting Engineered Features

Engineered numerical features saved successfully to: ./engineered_numerical_train.csv
Shape of saved engineered features: (3641, 14)

Exporting Engineered Features (Test Set)

Engineered numerical features (test) saved successfully to: ./engineered_numerical_test.csv
Shape of saved engineered features (test): (911, 14)

--- Part 2 Tasks Completed ---
Deliverable files saved: ./engineered_numerical_train.csv and ./engineered_numerical_test.csv
Deliverable code: The 'engineer_numerical_features' function definition (share this script).
