In [32]:
# Pre-process step: load the training data, impute a few numeric columns and
# derive engineered features.
#
# NOTE: pandas.read_csv parses the literal string 'NA' as missing (NaN) by
# default. Those values are deliberately left as NaN here: a blanket
# fillna('') would turn them into empty strings, which makes the later
# per-column fillna('NA') a no-op, produces blank-suffixed dummy columns
# (e.g. 'Alley_') when one-hot encoding, and forces numeric columns that
# contain missing values to object dtype.
# TODO: Deal with numerical values

import pandas as pd

file_path = 'train.csv'
data = pd.read_csv(file_path)

# Replace missing values in these numeric columns with 0
# (might consider using the median or mean instead).
numeric_cols = ['LotFrontage', 'LotArea']
data[numeric_cols] = data[numeric_cols].fillna(0)

# Feature engineering
# Total square footage: both floors plus the basement.
data['TotalSF'] = data['1stFlrSF'] + data['2ndFlrSF'] + data['TotalBsmtSF']
# Ages measured at the time of sale.
data['HouseAge'] = data['YrSold'] - data['YearBuilt']
data['RemodelAge'] = data['YrSold'] - data['YearRemodAdd']
# 1 when the house has any basement area, otherwise 0 (vectorised comparison
# instead of a row-by-row lambda).
data['HasBasement'] = (data['TotalBsmtSF'] > 0).astype('int64')
# Half baths count as 0.5 of a bathroom.
data['TotalBath'] = (
    data['FullBath']
    + 0.5 * data['HalfBath']
    + data['BsmtFullBath']
    + 0.5 * data['BsmtHalfBath']
)
data['OverallScore'] = data['OverallQual'] + data['OverallCond']
# Guard against division by zero: missing LotArea was imputed as 0 above, so
# a zero denominator is masked to NaN and the ratio becomes NaN instead of
# inf for such rows.
data['LotFrontageRatio'] = data['LotFrontage'] / data['LotArea'].where(data['LotArea'] != 0)


def get_season(month):
    """Return the season name for a month number.

    Months 12, 1 and 2 map to 'Winter', 3-5 to 'Spring' and 6-8 to
    'Summer'. Every other value (including anything that is not a
    recognised month) maps to 'Fall'.
    """
    season_table = (
        ((12, 1, 2), 'Winter'),
        ((3, 4, 5), 'Spring'),
        ((6, 7, 8), 'Summer'),
    )
    for months, season in season_table:
        if month in months:
            return season
    # Fallback for everything not listed above.
    return 'Fall'

# Season of the sale, derived from the month-sold column.
data['SaleSeason'] = data['MoSold'].map(get_season)

# Price per square foot (sale price divided by the total square footage).
data['PricePerSF'] = data['SalePrice'].div(data['TotalSF'])

# Remove the 'Id' column from the frame in place.
data.drop(columns='Id', inplace=True)

# Columns in which missing entries are labelled with the string 'NA'.
columns_to_fill_na = [
    'Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
    'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish',
    'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature',
    'MasVnrType', 'Electrical',
]

# Only the columns listed above are filled; all others are left untouched.
data[columns_to_fill_na] = data[columns_to_fill_na].fillna('NA')

# First one-hot encoding pass: nominal categorical columns.
nominal_columns = [
    'Alley', 'MSZoning', 'Street', 'LotShape',
    'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
    'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
    'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
    'Exterior2nd', 'Foundation', 'Heating', 'CentralAir',
    'Functional', 'GarageType', 'GarageFinish', 'PavedDrive',
    'MiscFeature', 'SaleType', 'SaleCondition', 'SaleSeason',
    'MasVnrType', 'Electrical',
]
data = pd.get_dummies(data, columns=nominal_columns)

# Label encoding of the quality/condition ratings is disabled for now: it may
# not be suitable for neural networks, so these columns are one-hot encoded
# below instead. The disabled experiment is kept here for reference:
#
# quality_map = {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0}
# data['ExterQual'] = data['ExterQual'].map(quality_map)
# data['BsmtQual'] = data['BsmtQual'].map(quality_map)
# data['ExterCond'] = data['ExterCond'].map(quality_map)
# #'Heating', 'HeatingQC',

# Second one-hot encoding pass: quality / condition / rating columns.
graded_columns = [
    'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC',
    'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC',
    'Fence',
]
data = pd.get_dummies(data, columns=graded_columns)

# Handling numerical data:
# NOTE: garage year built has no data (not handled here yet).

# Write the engineered feature table to disk without the index column.
output_file_path_features = 'train_with_features_Gus.csv'
data.to_csv(output_file_path_features, index=False)