In [2]:
import pandas as pd
import os

# Corrected path handling for Jupyter Notebook (use os.getcwd() instead of __file__)
current_dir = os.getcwd()  # This is /Users/junshao/bootcamp_Jun_Shao/project/notebooks
base_dir = os.path.abspath(os.path.join(current_dir, '..'))  # Up to /Users/junshao/bootcamp_Jun_Shao/project
data_processed_dir = os.path.join(base_dir, 'data', 'processed')

# Load preprocessed data
preprocessed_path = os.path.join(data_processed_dir, 'preprocessed_train.csv')
df = pd.read_csv(preprocessed_path)

# Define important columns (high correlation/domain knowledge)
important_columns = [
    'SalePrice',  # Target (normalized)
    # Numeric features (high corr >0.5 or key)
    'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 
    'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 
    '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 
    'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 
    'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'ScreenPorch',
    # Categorical (original + one-hot groups)
    'BldgType', 'HouseStyle', 'Foundation', 'BsmtQual', 'BsmtExposure', 'KitchenQual', 
    'GarageType', 'GarageFinish', 'SaleType', 'SaleCondition',
    'YrSold', 'MoSold',
    # All one-hot groups (Neighborhood, MSSubClass, MSZoning)
    *[col for col in df.columns if col.startswith('Neighborhood_')],
    *[col for col in df.columns if col.startswith('MSSubClass_')],
    *[col for col in df.columns if col.startswith('MSZoning_')]
]

# Select columns (handle if some missing)
available_columns = [col for col in important_columns if col in df.columns]
selected_df = df[available_columns]

# Save to new file in processed dir
selected_path = os.path.join(data_processed_dir, 'selected_train.csv')
selected_df.to_csv(selected_path, index=False)
print(f"Saved selected data to {selected_path}")
print(f"Original shape: {df.shape} -> Selected shape: {selected_df.shape}")
print(f"Selected columns: {list(selected_df.columns)}")

Saved selected data to /Users/junshao/bootcamp_Jun_Shao/project/data/processed/selected_train.csv
Original shape: (1460, 123) -> Selected shape: (1460, 87)
Selected columns: ['SalePrice', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'ScreenPorch', 'BldgType', 'HouseStyle', 'Foundation', 'BsmtQual', 'BsmtExposure', 'KitchenQual', 'GarageType', 'GarageFinish', 'SaleType', 'SaleCondition', 'YrSold', 'MoSold', 'Neighborhood_Blmngtn', 'Neighborhood_Blueste', 'Neighborhood_BrDale', 'Neighborhood_BrkSide', 'Neighborhood_ClearCr', 'Neighborhood_CollgCr', 'Neighborhood_Crawfor', 'Neighborhood_Edwards', 'Neighborhood_Gilbert', 'Neighborhood_IDOTRR'

In [3]:
import pandas as pd
import os
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# Path handling for Jupyter Notebook
current_dir = os.getcwd()  # /Users/junshao/bootcamp_Jun_Shao/project/notebooks
base_dir = os.path.abspath(os.path.join(current_dir, '..'))  # /Users/junshao/bootcamp_Jun_Shao/project
data_processed_dir = os.path.join(base_dir, 'data', 'processed')

# Load preprocessed data (fallback if selected doesn't exist)
preprocessed_path = os.path.join(data_processed_dir, 'preprocessed_train.csv')
df = pd.read_csv(preprocessed_path)

# Define and select important columns
important_columns = [
    'SalePrice',
    'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 
    'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 
    '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 
    'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 
    'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'ScreenPorch',
    'BldgType', 'HouseStyle', 'Foundation', 'BsmtQual', 'BsmtExposure', 'KitchenQual', 
    'GarageType', 'GarageFinish', 'SaleType', 'SaleCondition',
    'YrSold', 'MoSold',
    *[col for col in df.columns if col.startswith('Neighborhood_')],
    *[col for col in df.columns if col.startswith('MSSubClass_')],
    *[col for col in df.columns if col.startswith('MSZoning_')]
]

available_columns = [col for col in important_columns if col in df.columns]
selected_df = df[available_columns]

# Add 'age' column: remodel interval
selected_df['age'] = selected_df['YearRemodAdd'] - selected_df['YearBuilt']

# Identify non-numeric columns
non_numeric_cols = selected_df.select_dtypes(include=['object']).columns.tolist()

# Ordinal encoding
ordinal_cols = ['BsmtQual', 'BsmtExposure', 'KitchenQual', 'GarageFinish']
ordinal_mappings = {
    'BsmtQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0, None: 0},
    'BsmtExposure': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'None': 0, None: 0},
    'KitchenQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, None: 0},
    'GarageFinish': {'Fin': 3, 'RFn': 2, 'Unf': 1, 'None': 0, None: 0}
}

for col in ordinal_cols:
    if col in selected_df.columns:
        selected_df[col] = selected_df[col].map(ordinal_mappings[col]).fillna(0)

# One-hot for nominal
nominal_cols = [col for col in non_numeric_cols if col not in ordinal_cols]
if nominal_cols:
    encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
    encoded_nominal = encoder.fit_transform(selected_df[nominal_cols])
    encoded_cols = encoder.get_feature_names_out(nominal_cols)
    encoded_df = pd.DataFrame(encoded_nominal, columns=encoded_cols, index=selected_df.index)
    selected_df = pd.concat([selected_df.drop(columns=nominal_cols), encoded_df], axis=1)

# Save encoded file
encoded_path = os.path.join(data_processed_dir, 'encoded_train.csv')
selected_df.to_csv(encoded_path, index=False)
print(f"Saved encoded data to {encoded_path}")
print(f"New shape: {selected_df.shape}")
print(selected_df.head())

Saved encoded data to /Users/junshao/bootcamp_Jun_Shao/project/data/processed/encoded_train.csv
New shape: (1460, 117)
   SalePrice  LotFrontage   LotArea  OverallQual  OverallCond  YearBuilt  \
0   0.241078         65.0  0.033420     0.666667            5       2003   
1   0.203583         80.0  0.038795     0.555556            8       1976   
2   0.261908         68.0  0.046507     0.666667            5       2001   
3   0.145952         60.0  0.038561     0.666667            5       1915   
4   0.298709         84.0  0.060576     0.777778            5       2000   

   YearRemodAdd  MasVnrArea  BsmtFinSF1  BsmtFinSF2  ...  SaleType_ConLI  \
0          2003       196.0         706           0  ...             0.0   
1          1976         0.0         978           0  ...             0.0   
2          2002       162.0         486           0  ...             0.0   
3          1970         0.0         216           0  ...             0.0   
4          2000       350.0         655     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_df['age'] = selected_df['YearRemodAdd'] - selected_df['YearBuilt']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_df[col] = selected_df[col].map(ordinal_mappings[col]).fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_df[col] = selected_df[col].map(ordinal_mappings

In [4]:
import pandas as pd
import os

# Path handling for Jupyter Notebook
current_dir = os.getcwd()  # /Users/junshao/bootcamp_Jun_Shao/project/notebooks
base_dir = os.path.abspath(os.path.join(current_dir, '..'))  # /Users/junshao/bootcamp_Jun_Shao/project
data_processed_dir = os.path.join(base_dir, 'data', 'processed')

# Load encoded data
encoded_path = os.path.join(data_processed_dir, 'encoded_train.csv')
if os.path.exists(encoded_path):
    df = pd.read_csv(encoded_path)
else:
    print("encoded_train.csv not found. Please generate it first from preprocessed_train.csv.")
    # Optional fallback: Load preprocessed and process minimally (comment out if not needed)
    # preprocessed_path = os.path.join(data_processed_dir, 'preprocessed_train.csv')
    # df = pd.read_csv(preprocessed_path)
    # ... (add previous encoding logic if needed)

# Ensure 'age' exists or calculate it (if year cols still present)
if 'age' not in df.columns and all(col in df.columns for col in ['YearRemodAdd', 'YearBuilt']):
    df['age'] = df['YearRemodAdd'] - df['YearBuilt']
elif 'age' not in df.columns:
    print("'age' not added: Required year columns missing.")

# Drop all year-related columns and MoSold
columns_to_drop = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold', 'MoSold']
df = df.drop(columns=[col for col in columns_to_drop if col in df.columns], errors='ignore')

# Save the updated file
updated_path = os.path.join(data_processed_dir, 'updated_encoded_train.csv')
df.to_csv(updated_path, index=False)
print(f"Saved updated encoded data to {updated_path}")
print(f"New shape: {df.shape}")
print(f"Dropped columns (if existed): {columns_to_drop}")
print(df.head())

Saved updated encoded data to /Users/junshao/bootcamp_Jun_Shao/project/data/processed/updated_encoded_train.csv
New shape: (1460, 112)
Dropped columns (if existed): ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold', 'MoSold']
   SalePrice  LotFrontage   LotArea  OverallQual  OverallCond  MasVnrArea  \
0   0.241078         65.0  0.033420     0.666667            5       196.0   
1   0.203583         80.0  0.038795     0.555556            8         0.0   
2   0.261908         68.0  0.046507     0.666667            5       162.0   
3   0.145952         60.0  0.038561     0.666667            5         0.0   
4   0.298709         84.0  0.060576     0.777778            5       350.0   

   BsmtFinSF1  BsmtFinSF2  BsmtUnfSF  TotalBsmtSF  ...  SaleType_ConLI  \
0         706           0        150          856  ...             0.0   
1         978           0        284         1262  ...             0.0   
2         486           0        434          920  ...             0.0   
3         2