### Without pipelines

In [None]:
# Categoricals to be encoded
    # Already encoded: bedrooms, bathrooms, floors, grade, condition
    # These have already been encoded but may need to be adjusted to 0-based encoding if it provides more consistency across the categorical variables or simply performs better with sklearn's algorithms
    # Many of them will need to be transformed so that floats are rounded up for simplicity. I.e., 1.5 bathrooms represents 1 bathroom and an en suite bathroom but this can be considered as 2 bathrooms
    # In the case of floors, 1.5 floors may be one floor and a loft but this can be rounded down to 1 floor, perhaps.
    
# Numeric: I assume the rest will be numeric and simply need to be scaled. I am using linear regression and this algorithm will utilise gradient descent to optimise parameter estimates.
# Continuous features that have varying magnitudes need to be scaled to avoid causing issues (such as much slower convergence rates or suboptimal solutions). I don't think suboptimal solutions are as much of an issue here as LR's loss function is convex.
# NOTE: sklearn's LinearRegression solves ordinary least squares directly rather than by gradient descent, so scaling matters most for gradient-based or regularised estimators (e.g. SGDRegressor, Ridge, Lasso) - confirm which estimator is used.
# Do I need to scale the target variable?

In [None]:
# Frequency of each bedroom count, most common value first
price_df['bedrooms'].value_counts(sort=True, ascending=False)

In [None]:
# Keep only houses with between 1 and 6 bedrooms (inclusive); everything else is removed.
# The upper limit is set by ~3 standard deviations above the mean bedroom count.

rooms_3_std = 3 * np.std(price_df['bedrooms'])
rooms_3_std_above = price_df['bedrooms'].mean() + rooms_3_std


print(f"Houses that have more than {round(rooms_3_std_above)} rooms are 3 standard deviations above the mean and will be removed.")

# Apply BOTH bounds: the lower bound (at least 1 bedroom) as well as the upper bound of 6.
# .copy() makes the filtered result an independent dataframe, so assigning columns to it
# in later cells does not trigger pandas' SettingWithCopyWarning.
price_df = price_df[price_df['bedrooms'].between(1, 6)].copy()

In [None]:
# Frequency of each bathroom count (still fractional at this point), most common value first
price_df['bathrooms'].value_counts(sort=True, ascending=False)

In [None]:
# All values for bathrooms are rounded DOWN, as decimal values represent rooms with a toilet but no shower.
# np.floor is used rather than Series.round(): round() goes to the nearest integer and resolves
# .5 ties to the nearest even value (1.5 -> 2 but 2.5 -> 2), so it does not round down.
# Any instances that have less than 1 bathroom or more than 5 bathrooms are removed.
# Upper limit set by ~3 standard deviations above the mean.

price_df['bathrooms'] = np.floor(price_df['bathrooms']).astype(int)

# The 3-standard-deviation offset must come from the bathroom column itself
# (bath_3_std), not from the bedroom statistic computed in an earlier cell.
bath_3_std = 3 * np.std(price_df['bathrooms'])
bath_3_std_above = price_df['bathrooms'].mean() + bath_3_std

print(f"Houses that have more than {round(bath_3_std_above)} bathrooms are 3 standard deviations above the mean and will be removed.")

# NOTE(review): the cutoff of 5 is hard-coded; check it still agrees with the value printed
# above now that the threshold is computed from the bathroom spread - TODO confirm.
# Both bounds are enforced (1 <= bathrooms <= 5), and .copy() detaches the result so later
# column assignments do not raise SettingWithCopyWarning.
price_df = price_df[price_df['bathrooms'].between(1, 5)].copy()

In [None]:
# Frequency of each floor count (half floors included), most common value first
price_df['floors'].value_counts(sort=True, ascending=False)

In [None]:
# As homes with .5 floors are defined as split-level or bi-level floors they are rounded UP for simplicity.
# np.ceil is used rather than Series.round(): round() resolves .5 ties to the nearest even
# value, so 1.5 -> 2 but 2.5 -> 2, i.e. some half floors would be rounded down instead of up.
price_df['floors'] = np.ceil(price_df['floors']).astype(int)

In [None]:
# Frequency of each grade, most common value first (inspection only - no changes will be made)
price_df['grade'].value_counts(sort=True, ascending=False)

In [None]:
# Frequency of each condition rating, most common value first (inspection only - no changes will be made)
price_df['condition'].value_counts(sort=True, ascending=False)

In [None]:
# Dimensions of the dataframe at this point, followed by a preview of its first five rows
print(price_df.shape)
price_df.head(n=5)

In [None]:
def reorder_col(data, col_name, after):
    '''
    Move the column `col_name` so that it sits immediately after the column `after`.

    Intended for use after creating new features, to place them next to the features
    they were engineered from. The dataframe is modified in place and is also returned.

    Parameters:
        data: the pandas DataFrame whose columns are reordered (mutated in place).
        col_name: label of the column to move.
        after: label of the column that `col_name` should directly follow.

    Return: The input dataframe (same object) with the new column order.

    Raises:
        KeyError: if `col_name` or `after` is not a column of `data`. The check happens
        before anything is modified, so a failed call leaves `data` untouched.
    '''

    # Validate both labels BEFORE popping: popping first and failing on `after`
    # afterwards would silently drop `col_name` from the dataframe.
    for label in (col_name, after):
        if label not in data.columns:
            raise KeyError(label)

    # A column is trivially "after itself"; popping it would make `after` unfindable.
    if col_name == after:
        return data

    col_values = data.pop(col_name)  # remove the column, keeping its values
    position = data.columns.get_loc(after)  # index of the anchor column once col_name is gone
    data.insert(position + 1, col_name, col_values)  # re-insert directly after the anchor

    return data

In [None]:
# yr_built is converted into 'age', measured relative to 2015 (assumed to be the year the dataset was produced).
# The primary use of knowing the year a home was built is knowing how old it is. This can be captured by calculating the age.
# I think it is more suitable to represent this as age instead of a measure of time as there is no time component to this analysis/model.
# Therefore I believe it makes more sense to measure these variables as a count in years instead of an instance in time.
'''
Scaling the Year Built
Scaling the year a house was built can be done, but it might not always provide meaningful insights. For instance, the difference between the years 2000 and 2020 
is significant for understanding a house's condition, but the raw "year" values (e.g., 1800 vs. 2000) don't directly convey age-related insights unless interpreted 
relative to the present year. Moreover, scaled years (e.g., "0.85" or "-1.2" after standardization) are abstract and may confuse the model if the relationship to 
the target variable is better understood through "age."

Scaling the Age of the House
Scaling the age of the house (i.e., current year minus year built) is generally more meaningful. Age provides a clearer representation of a house's potential condition, 
depreciation, or relevance in the market. For example, a house built 20 years ago (age 20) versus 50 years ago (age 50) offers a direct interpretation of how old the 
property is, which is often more relevant for the target variable (e.g., price).

If the problem specifically involves historical trends or if there is a time series component to the analysis then this feature would be more suitable, probably once converted
into a datetime object.
'''


# Age of the house in years, counted back from 2015 (rsub(2015) computes 2015 - yr_built)
price_df['age'] = price_df['yr_built'].rsub(2015)
# Keep the engineered feature directly beside the column it was derived from
reorder_col(price_df, 'age', 'yr_built')

price_df.head(n=5)

In [None]:
def renovated_within(data, yrs_within, ref_year=2015):
    '''
    Add a binary column flagging houses renovated within a given number of years.

    Arguments:
        1. data: the dataframe; it must contain a 'yr_renovated' column in which 0 means
           the house has never been renovated.
        2. yrs_within: threshold for the number of years that have passed since the house
           was last renovated (relative to ref_year).
        3. ref_year: the year that renovations are measured back from (defaults to 2015).

    If the house was renovated within the specified number of years before ref_year the
    result is 1, otherwise it is 0. All homes that have not been renovated at all
    (yr_renovated == 0) result in 0.

    The column is named "renovated_within_<yrs_within>_yrs". It is added to the input
    dataframe in place (an existing column of that name is overwritten), and the same
    dataframe object is returned.
    '''

    yr_renovated = data['yr_renovated']

    # 0 is the marker for "never renovated", so those rows can never be flagged
    was_renovated = yr_renovated != 0
    # Years elapsed between the renovation and the reference year, compared to the threshold
    is_recent = (ref_year - yr_renovated) <= yrs_within

    # Add new column to the dataframe; to_numpy() assigns the flags by position
    col_name = f"renovated_within_{yrs_within}_yrs"
    data[col_name] = (was_renovated & is_recent).astype('int64').to_numpy()

    return data

In [None]:
# Rather than distinguishing "renovated this year" from "never renovated" in the raw
# yr_renovated values, the feature is summarised by two binary flags:
# 'renovated in the past 10 years' and 'renovated in the past 5 years'.

# 10-year flag, placed directly after 'yr_renovated'
renovated_within(price_df, 10)
reorder_col(price_df, 'renovated_within_10_yrs', 'yr_renovated')

# 5-year flag, placed directly after the 10-year flag
renovated_within(price_df, 5)
reorder_col(price_df, 'renovated_within_5_yrs', 'renovated_within_10_yrs')

price_df.head(n=5)

In [None]:
# Z-score standardisation of the continuous variables (each column is centred on its mean
# and divided by its standard deviation).
# NOTE(review): the scaler is fitted on every row of price_df; if a train/test split
# happens later, fitting on the training rows only would avoid leakage - TODO confirm.
cols_to_scale = [
    'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'age',
    'lat', 'long', 'sqft_living15', 'sqft_lot15',
]

scaler = StandardScaler()
scaler.fit(price_df[cols_to_scale])  # learn the per-column mean and standard deviation
price_df[cols_to_scale] = scaler.transform(price_df[cols_to_scale])  # overwrite with the scaled values

In [None]:
# Preview the first five rows after scaling
price_df.head(n=5)