In [None]:
# Note: this model predicts the mean 1-bedroom rental price for US cities in December 2019; it was trained on the months from 2018-05 through 2019-11.
# The latest price should therefore be the November 2019 value, and the std and the trend slope should be computed over the months from 2018-05 through 2019-11.

In [None]:
import joblib

# Destination files for the three persisted artifacts
xgb_model_filename = 'xgb_rental_price_model.pkl'
kmeans_model_filename = 'kmeans_geo_model.pkl'
encoder_filename = 'target_encoder.pkl'

# Nothing is written unless every required object is present in the notebook namespace.
try:
    # (global name, message raised when that name is missing), checked in this order
    for _required_name, _missing_message in (
        ('XGBRentalPredectionPriceModelFor1Room',
         "XGBoost model 'XGBRentalPredectionPriceModelFor1Room' not found."),
        ('kmeans', "KMeans model 'kmeans' not found."),
        ('encoder', "Target encoder object 'encoder' not found."),
    ):
        if _required_name not in globals():
            raise NameError(_missing_message)

    # Persist the trained XGBoost model, the KMeans model and the target encoder
    joblib.dump(XGBRentalPredectionPriceModelFor1Room, xgb_model_filename)
    joblib.dump(kmeans, kmeans_model_filename)
    joblib.dump(encoder, encoder_filename)

    print("Models and encoder saved successfully:")
    print(f"- XGBoost Model: {xgb_model_filename}")
    print(f"- KMeans Model: {kmeans_model_filename}")
    print(f"- Target Encoder: {encoder_filename}")

except NameError as e:
    # One of the prerequisite cells has not been executed in this kernel session
    print(f"ERROR: Cannot save. One or more required objects not found: {e}")
    print("Please ensure the cells that created XGBRentalPredectionPriceModelFor1Room, kmeans, and encoder were run.")
except Exception as e:
    # Any other failure (e.g. raised by joblib.dump) is reported without stopping the notebook
    print(f"An unexpected error occurred while saving: {e}")

Models and encoder saved successfully:
- XGBoost Model: xgb_rental_price_model.pkl
- KMeans Model: kmeans_geo_model.pkl
- Target Encoder: target_encoder.pkl


In [None]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic
from shapely.geometry import Point
import geopandas as gpd
from sklearn.cluster import KMeans
import xgboost as xgb
import joblib # Import joblib for loading
import category_encoders as ce # Import category_encoders

# --- Load Saved Models and Encoder ---
def _load_artifact(path, label, noun):
    """
    Load one joblib-persisted object, reporting the outcome on stdout.

    Inputs:
    - path (str): File to load, relative to the current working directory.
    - label (str): Human-readable artifact name used in the printed messages.
    - noun (str): Short noun ('model' / 'encoder') used in the recovery hint.

    Outputs:
    - The loaded object, or None when the file does not exist. Any other
      loading error is not caught here and propagates to the caller.
    """
    try:
        # NOTE: joblib.load is pickle-based; only load files you created or trust.
        artifact = joblib.load(path)
    except FileNotFoundError:
        print(f"ERROR: {label} file '{path}' not found.")
        print(f"Please ensure the {noun} was saved and is in the current directory.")
        return None  # None tells downstream code that loading failed
    print(f"{label} loaded successfully.")
    return artifact


# These three globals are read later by predict_rental_price and by the example cell.
xgb_model = _load_artifact('xgb_rental_price_model.pkl', 'XGBoost model', 'model')
kmeans = _load_artifact('kmeans_geo_model.pkl', 'KMeans model', 'model')
encoder = _load_artifact('target_encoder.pkl', 'Target encoder', 'encoder')

# NOTE: Geographical data (us_coords, state_centers, county_centers, DC_COORD,
# coastline_boundary, STATE_CAPITALS) and X_train_final (for medians)
# are still accessed globally within the function. For a truly
# self-contained script, these would also need to be loaded or
# recalculated if necessary.

# --- Define the Prediction Function (Copied from cell a345bd76) ---

# Feature names and order used when X_train_final is not available in the notebook
# namespace. The lowercase 'state', 'metro', 'countyname' entries are the target-encoded
# columns. The same list drives the object-dtype -> numeric coercion below.
_FALLBACK_FINAL_FEATURES = ['state', 'metro', 'countyname', 'SizeRank', 'Latitude', 'Longitude',
                            'Distance_to_US_Center', 'Distance_to_State_Center', 'Distance_to_County_Center',
                            'Dist_to_DC_miles', 'Dist_to_StateCapital_miles', 'Dist_to_Coast_miles',
                            'geo_cluster_id', 'Price_StdDev_19M', 'Price_Trend_Slope', 'LATEST_PRICE_COL']


def _resolve_time_feature(explicit_value, df_input, column, median_global_name, default_value):
    """
    Choose the value of one historical-price feature for the single input row.

    Priority: explicit argument -> non-null `column` in df_input -> notebook global
    named `median_global_name` -> `default_value` (with a printed warning).
    """
    if explicit_value is not None and pd.notna(explicit_value):
        return explicit_value
    if column in df_input.columns and pd.notna(df_input[column].iloc[0]):
        return df_input[column].iloc[0]
    if median_global_name in globals():
        return globals()[median_global_name]
    print(f"Warning: '{column}' not provided and median not available. Using a default value.")
    return default_value


def predict_rental_price(df_input: pd.DataFrame,
                         target_encoder: ce.TargetEncoder,
                         distance_to_us_center: float,
                         distance_to_state_center: float,
                         distance_to_county_center: float,
                         dist_to_dc_miles: float,
                         dist_to_statecapital_miles: float,
                         dist_to_coast_miles: float,
                         std_dev: float = None,
                         trend_slope: float = None) -> float:
    """
    Takes a one-row DataFrame describing a region plus pre-computed distances,
    engineers the model features, and predicts the rental price with the
    XGBoost model held in the global `xgb_model`.

    Besides its arguments, this function reads these notebook globals when they exist:
    `kmeans`, `xgb_model`, `X_train_final`, `median_std_dev`, `median_trend_slope`.

    Inputs:
    - df_input (pd.DataFrame): DataFrame containing a single row. 'State', 'Metro' and
                               'CountyName' are required (they are target-encoded), and
                               'Longitude' / 'Latitude' are needed for the cluster assignment.
                               The remaining model features ('SizeRank', 'LATEST_PRICE_COL', ...)
                               are taken from here when present; any missing final feature is
                               filled with NaN after a warning. May optionally carry
                               'Price_StdDev_19M' and 'Price_Trend_Slope'.
    - target_encoder (ce.TargetEncoder): The fitted Target Encoder object. It is called on the
                               lowercase column names 'state', 'metro', 'countyname'.
    - distance_to_us_center (float): Distance to the geographical center of the US in miles.
    - distance_to_state_center (float): Distance to the geographical center of the State in miles.
    - distance_to_county_center (float): Distance to the geographical center of the County in miles.
    - dist_to_dc_miles (float): Distance to Washington D.C. in miles.
    - dist_to_statecapital_miles (float): Distance to the State Capital in miles.
    - dist_to_coast_miles (float): Distance to the nearest coast in miles.
    - std_dev (float, optional): The standard deviation of historical prices for the region.
                                 If None/NaN, the 'Price_StdDev_19M' column of df_input is used;
                                 failing that, the global `median_std_dev`; failing that, 40.0.
    - trend_slope (float, optional): The trend slope of historical prices for the region.
                                     If None/NaN, the 'Price_Trend_Slope' column of df_input is
                                     used; failing that, the global `median_trend_slope`;
                                     failing that, 0.0.

    Outputs:
    - predicted_price (float): The predicted rental price for the region, or NaN when the
                               model is not loaded or the prediction raises.

    Raises:
    - ValueError: If df_input does not contain exactly one row, or lacks any of the
                  'State', 'Metro', 'CountyName' columns.
    """
    if len(df_input) != 1:
        raise ValueError("Input DataFrame must contain exactly one row for prediction.")

    df_processed = df_input.copy()

    # 1. Target Encoding: apply the fitted encoder
    cols_for_encoding = ['State', 'Metro', 'CountyName']
    if not all(col in df_processed.columns for col in cols_for_encoding):
        raise ValueError(f"Input DataFrame is missing required columns for encoding: {cols_for_encoding}")

    # The encoder is fed lowercase column names, so rename on a temporary frame
    df_temp_for_encoding = df_processed[cols_for_encoding].rename(columns={col: col.lower() for col in cols_for_encoding})
    df_encoded = target_encoder.transform(df_temp_for_encoding)

    # Swap the raw categorical columns for their encoded (lowercase-named) counterparts
    df_processed = df_processed.drop(columns=cols_for_encoding).join(df_encoded)

    # 2. Geographic Distance Features (use provided inputs, forced to numeric)
    distance_features = {
        'Distance_to_US_Center': distance_to_us_center,
        'Distance_to_State_Center': distance_to_state_center,
        'Distance_to_County_Center': distance_to_county_center,
        'Dist_to_DC_miles': dist_to_dc_miles,
        'Dist_to_StateCapital_miles': dist_to_statecapital_miles,
        'Dist_to_Coast_miles': dist_to_coast_miles,
    }
    for col, value in distance_features.items():
        df_processed[col] = value
        # Non-numeric inputs become NaN instead of raising
        df_processed[col] = pd.to_numeric(df_processed[col], errors='coerce')

    # 3. Time Features: explicit argument, else input column, else global median, else default
    df_processed['Price_StdDev_19M'] = _resolve_time_feature(
        std_dev, df_input, 'Price_StdDev_19M', 'median_std_dev', 40.0)
    df_processed['Price_Trend_Slope'] = _resolve_time_feature(
        trend_slope, df_input, 'Price_Trend_Slope', 'median_trend_slope', 0.0)

    # 4. geo_cluster_id (K-Means nearest-centroid assignment); -1 marks "could not assign"
    kmeans_model = globals().get('kmeans')
    if kmeans_model is None:
        print("ERROR: KMeans model 'kmeans' not found or loaded. Cannot assign geo_cluster_id.")
        df_processed['geo_cluster_id'] = -1
    else:
        try:
            # Column order (Longitude, Latitude) is kept exactly as in the original code
            new_coords_for_kmeans = df_processed[['Longitude', 'Latitude']].values
            # Kept numerical (not cast to string) for the model
            df_processed['geo_cluster_id'] = kmeans_model.predict(new_coords_for_kmeans)[0]
        except Exception as e:
            print(f"Error assigning geo_cluster_id: {e}")
            df_processed['geo_cluster_id'] = -1

    # 5. Select final features in the order given by X_train_final when it is available
    try:
        if 'X_train_final' in globals():
            final_features = X_train_final.columns.tolist()
        else:
            print("Warning: X_train_final not found. Using a predefined list of final features. This may lead to errors if the order or names don't match the trained model.")
            final_features = list(_FALLBACK_FINAL_FEATURES)
    except Exception as e:
        print(f"Error determining final feature order: {e}")
        # Use the predefined list as a last resort
        final_features = list(_FALLBACK_FINAL_FEATURES)

    # Any expected feature still missing is added as NaN so column selection cannot fail
    for col in final_features:
        if col not in df_processed.columns:
            print(f"Warning: Final feature '{col}' not found in processed DataFrame. Adding as NaN.")
            df_processed[col] = np.nan

    # Columns that arrived as 'object' dtype are coerced to numeric (errors -> NaN)
    for col in _FALLBACK_FINAL_FEATURES:
        if col in df_processed.columns and df_processed[col].dtype == 'object':
            df_processed[col] = pd.to_numeric(df_processed[col], errors='coerce')

    # 6. Make Prediction
    model = globals().get('xgb_model')
    if model is None:
        print("ERROR: XGBoost model not loaded. Cannot make prediction.")
        return np.nan

    try:
        # Restrict to, and order by, the final feature list before predicting
        df_processed = df_processed[final_features]
        predicted_price_array = model.predict(df_processed)
        # Convert the NumPy scalar to a plain float to match the declared return type
        return float(predicted_price_array[0])
    except Exception as e:
        print(f"Error during prediction: {e}")
        import traceback
        traceback.print_exc()  # Print traceback for debugging
        return np.nan  # NaN signals a failed prediction to the caller


# --- Example Usage ---
# One-row description of the region to score (Indianapolis, IN)
new_city_data = pd.DataFrame([{
    'RegionName': 'Indianapolis',
    'State': 'IN',
    'Metro': 'Indianapolis-Carmel-Anderson',  # needed by the target encoder
    'CountyName': 'Marion County',
    'SizeRank': 15,
    'Latitude': 39.7684,
    'Longitude': -86.1581,
    'LATEST_PRICE_COL': 1000,  # most recent known price for this region
    # Optionally add 'Price_StdDev_19M' (e.g. 50.0) and 'Price_Trend_Slope' (e.g. 2.0) if known
}])

# Illustrative distances; real values would come from a lookup or calculation.
# The keys deliberately match the keyword parameters of predict_rental_price.
indy_distances = dict(
    distance_to_us_center=450.0,
    distance_to_state_center=10.0,
    distance_to_county_center=5.0,
    dist_to_dc_miles=580.0,
    dist_to_statecapital_miles=0.0,  # Indianapolis is the state capital
    dist_to_coast_miles=800.0,
)

# Only predict when all three persisted artifacts were loaded
if any(artifact is None for artifact in (xgb_model, kmeans, encoder)):
    print("\nPrediction could not be made due to missing models or encoder.")
else:
    predicted_price = predict_rental_price(
        new_city_data.copy(),
        target_encoder=encoder,
        std_dev=None,      # None: let the function choose its fallback value
        trend_slope=None,  # None: let the function choose its fallback value
        **indy_distances,
    )

    # Display Result
    print("\n----------------------------------------------------------")
    print("           💰 Rental Price Prediction for Unseen Region 💰")
    print("----------------------------------------------------------")
    print(f"  Region (Input): {new_city_data['RegionName'].iloc[0]}, {new_city_data['State'].iloc[0]} ({new_city_data['CountyName'].iloc[0]})")
    print(f"  Predicted Rental Price (USD): ${predicted_price:.2f}")
    print("----------------------------------------------------------")

XGBoost model loaded successfully.
KMeans model loaded successfully.
Target encoder loaded successfully.
Using dummy encoder.

----------------------------------------------------------
           💰 Rental Price Prediction for Unseen Region 💰
----------------------------------------------------------
  Region (Input): Indianapolis, IN (Marion County)
  Predicted Rental Price (USD): $1119.13
----------------------------------------------------------


In [None]:
# This is the best estimate we could obtain. If suitable data were available for the required months, so that the std, slope, and latest price could be computed correctly, we could get a more accurate value for Indianapolis.

In [None]:
# The idea behind this project can be applied to a longer time frame and a wider geographical range if more suitable, high-quality data becomes available.