### Regression on the whole dataset all variables

In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
hdb_df = pd.read_csv('data/hdb_combined_clean.csv', low_memory=False)

In [4]:
hdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 917393 entries, 0 to 917392
Data columns (total 22 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   month_of_sale                 917393 non-null  object 
 1   town                          917393 non-null  object 
 2   flat_type                     917393 non-null  object 
 3   block                         917393 non-null  object 
 4   street_name                   917393 non-null  object 
 5   storey_range                  917393 non-null  object 
 6   floor_area_sqm                917393 non-null  float64
 7   flat_model                    917393 non-null  object 
 8   lease_commence_date           917393 non-null  int64  
 9   resale_price                  917393 non-null  float64
 10  address                       917393 non-null  object 
 11  latitude                      917393 non-null  float64
 12  longitude                     917393 non-nul

In [20]:
# Function to assign weights based on frequency counts
def assign_weights(df, column):
    counts = df[column].value_counts()
    sorted_items = counts.index
    weights = {item: i + 1 for i, item in enumerate(sorted_items)}
    return df[column].map(weights)

# Assign weights for 'town', 'flat_type', and 'flat_model'
hdb_df['town_weight'] = assign_weights(hdb_df, 'town')
hdb_df['flat_type_weight'] = assign_weights(hdb_df, 'flat_type')
hdb_df['flat_model_weight'] = assign_weights(hdb_df, 'flat_model')

In [21]:
hdb_df.drop(['month_of_sale', 'block', 'street_name', 'storey_range', 'lease_commence_date', 'address',
             'latitude', 'longitude','town', 'flat_type', 'flat_model', 'price_per_sqm'],
            axis=1, inplace=True)

In [22]:
hdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 917393 entries, 0 to 917392
Data columns (total 13 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   floor_area_sqm                917393 non-null  float64
 1   resale_price                  917393 non-null  float64
 2   nearest_supermarket_distance  917393 non-null  float64
 3   nearest_school_distance       917393 non-null  float64
 4   nearest_mrt_distance          917393 non-null  float64
 5   nearest_hawkers_distance      917393 non-null  float64
 6   cbd_distance                  917393 non-null  float64
 7   year_of_sale                  917393 non-null  int64  
 8   calculated_remaining_lease    917393 non-null  float64
 9   storey_median                 917393 non-null  int64  
 10  town_weight                   917393 non-null  int64  
 11  flat_type_weight              917393 non-null  int64  
 12  flat_model_weight             917393 non-nul

In [23]:
# Split features and target (assuming 'resale_price' is the target)
X = hdb_df.drop(columns=['resale_price'])
y = hdb_df['resale_price']

# Train-test split (80-20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling features (only for models sensitive to scale like linear regression or neural networks)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [24]:
# hdb_df
lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)
y_pred_lr = lr_model.predict(X_test_scaled)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_lr))

print("Linear Regression:")
print("R² score:", r2_score(y_test, y_pred_lr))
print("RMSE:", rmse)


Linear Regression:
R² score: 0.819761837414274
RMSE: 71370.27840549395


In [25]:
model_file_name = "lr_best_model.joblib"
model_folder = "models/"
joblib.dump(lr_model, model_folder+''+model_file_name)

['models/lr_best_model.joblib']

In [27]:
loaded_lr_model = joblib.load(open(model_folder + model_file_name, 'rb'))

# Load the original dataset to create encoders (if necessary)
original_data = pd.read_csv("data/hdb_combined_clean.csv")  # Load your training data

# Define a function to preprocess input data for prediction
def preprocess_input(town, lease_commence_date, flat_type, flat_model, storey_median):
    # Create a DataFrame for the input data
    input_data = pd.DataFrame({
        'town': [town],
        'lease_commence_date': [lease_commence_date],
        'flat_type': [flat_type],
        'flat_model': [flat_model],
        'storey_median': [storey_median]
    })

    # Perform encoding (one-hot encoding for categorical variables)
    input_data_encoded = pd.get_dummies(input_data, columns=['town', 'flat_type', 'flat_model'], drop_first=True)

    # Ensure all necessary columns are present
    for col in original_data.columns:
        if col not in input_data_encoded.columns:
            input_data_encoded[col] = 0  # Add missing columns with value 0

    # Ensure the order of columns matches the model training
    input_data_encoded = input_data_encoded.reindex(columns=original_data.columns, fill_value=0)

    return input_data_encoded.values  # Convert to a NumPy array

def make_prediction(town, lease_commence_date, flat_type, flat_model, storey_median):
    # Preprocess the input features
    input_features = preprocess_input(town, lease_commence_date, flat_type, flat_model, storey_median)

    # Make the prediction using the loaded linear regression model
    predicted_price = loaded_lr_model.predict(input_features)

    # Format the result for output
    final_info = "The predicted resale price is: ${:,.2f}".format(predicted_price[0])
    
    return final_info

# Example input
town_example = "Ang Mo Kio"
lease_commence_date_example = 2000  # Example lease commence date (year)
flat_type_example = "3-room"
flat_model_example = "Model A"  # Replace with an actual model type
storey_median_example = 5  # Example storey median

# Call the function and print the result
print(make_prediction(town_example, lease_commence_date_example, flat_type_example, flat_model_example, storey_median_example))


ValueError: X has 22 features, but LinearRegression is expecting 12 features as input.

In [17]:
# !pip3 install gradio

Defaulting to user installation because normal site-packages is not writeable
Collecting gradio
  Downloading gradio-4.44.1-py3-none-any.whl (18.1 MB)
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Collecting fastapi<1.0
  Downloading fastapi-0.115.0-py3-none-any.whl (94 kB)


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
conda-repo-cli 1.0.4 requires pathlib, which is not installed.
anaconda-project 0.10.2 requires ruamel-yaml, which is not installed.
weasel 0.3.4 requires typer<0.10.0,>=0.3.0, but you have typer 0.12.5 which is incompatible.
spacy 3.7.4 requires typer<0.10.0,>=0.3.0, but you have typer 0.12.5 which is incompatible.
botocore 1.24.32 requires urllib3<1.27,>=1.25.4, but you have urllib3 2.2.3 which is incompatible.


Collecting uvicorn>=0.14.0
  Downloading uvicorn-0.31.0-py3-none-any.whl (63 kB)
Collecting orjson~=3.0
  Downloading orjson-3.10.7-cp39-none-win_amd64.whl (137 kB)
Collecting importlib-resources<7.0,>=1.3
  Downloading importlib_resources-6.4.5-py3-none-any.whl (36 kB)
Collecting ffmpy
  Downloading ffmpy-0.4.0-py3-none-any.whl (5.8 kB)
Collecting gradio-client==1.3.0
  Downloading gradio_client-1.3.0-py3-none-any.whl (318 kB)
Collecting httpx>=0.24.1
  Downloading httpx-0.27.2-py3-none-any.whl (76 kB)
Collecting urllib3~=2.0
  Downloading urllib3-2.2.3-py3-none-any.whl (126 kB)
Collecting aiofiles<24.0,>=22.0
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting semantic-version~=2.0
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl (15 kB)
Collecting tomlkit==0.12.0
  Downloading tomlkit-0.12.0-py3-none-any.whl (37 kB)
Collecting typer<1.0,>=0.12
  Downloading typer-0.12.5-py3-none-any.whl (47 kB)
Collecting ruff>=0.2.2
  Downloading ruff-0.6.8-py3-none-win_amd6

In [18]:
import gradio as gr

In [None]:
import gradio as gr

# Define the function to be called by Gradio
def make_prediction(town, flat_type, floor_area):
    input_features = preprocess_input(town, flat_type, floor_area)
    predicted_price = loaded_lr_model.predict(input_features)
    final_info = "The predicted resale price is: ${:,.2f}".format(predicted_price[0])
    return final_info

# Set the headline
headline = "Predict Resale Price"

# Create the Gradio interface
iface = gr.Interface(
    fn=make_prediction,
    inputs=[
        gr.inputs.Dropdown(choices=["Ang Mo Kio", "Bedok", "Bukit Batok", "Choa Chu Kang"], label="Select Town"),
        gr.inputs.Dropdown(choices=["1-room", "2-room", "3-room", "4-room", "5-room"], label="Select Flat Type"),
        gr.inputs.Slider(minimum=40, maximum=200, label="Floor Area (in sqm)", step=1)  # Adjust min/max based on your dataset
    ],
    outputs="text",
    title=headline
)

# Launch the interface
iface.launch(share=True)
