In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



# Machine Learning Libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# For XGBoost (make sure xgboost is installed: pip install xgboost)
import xgboost as xgb

# For the interactive dashboard (requires streamlit: pip install streamlit)
import streamlit as st

# Load the rental listings; adjust `data_path` if the CSV lives elsewhere.
data_path = 'house_rentals.csv'
df = pd.read_csv(data_path)

# Quick look at the first rows to confirm the file parsed as expected.
print("Dataset Preview:")
print(df.head())

# Data Exploration and Preprocessing
# Structural overview: shape, column names, per-column info and null counts.
print("Dataset Shape:", df.shape)
print("Columns:", df.columns.tolist())
print("\nData Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would append a stray "None" line.
df.info()
print("\nMissing Values:")
print(df.isnull().sum())


# Data Cleaning
# Rows without a price have no target value to learn from, so they are removed.
df = df.dropna(subset=['price'])

# Missing categorical entries are replaced by the explicit label 'Unknown'.
categorical_cols = ['category', 'condition', 'amenities', 'region', 'locality', 'parking_space', 'is_furnished']
df = df.assign(**{name: df[name].fillna('Unknown') for name in categorical_cols})

# Missing numeric entries are replaced by the median of their own column.
# NOTE(review): the median is computed on the full dataset, before the
# train/test split further down -- confirm this is acceptable for evaluation.
numeric_cols = ['bedrooms', 'bathrooms', 'floor_area', 'lat', 'lng']
df = df.assign(**{name: df[name].fillna(df[name].median()) for name in numeric_cols})

# Confirm what, if anything, is still missing.
print("After cleaning, missing values:")
print(df.isnull().sum())


# 3. Exploratory Data Analysis (EDA)

# Histogram of the target variable with a KDE overlay.
fig_price, ax_price = plt.subplots(figsize=(10, 5))
sns.histplot(df['price'], bins=50, kde=True, ax=ax_price)
ax_price.set_title('Distribution of Rental Prices (GHS)')
ax_price.set_xlabel('Price')
ax_price.set_ylabel('Frequency')
plt.show()


# Pairwise correlations between the price and the numeric columns.
numeric_features = ['price', 'bedrooms', 'bathrooms', 'floor_area', 'lat', 'lng']
fig_corr, ax_corr = plt.subplots(figsize=(12, 8))
sns.heatmap(df[numeric_features].corr(), annot=True, cmap='coolwarm', ax=ax_corr)
ax_corr.set_title('Correlation Heatmap')
plt.show()


# 4. Feature Engineering
# The target is the rental price; 'url' and 'name' are dropped from the
# predictors together with the target itself.
y = df['price']
X = df.drop(columns=['url', 'name', 'price'])

# Columns that get one-hot encoded.
cat_features = ['category', 'condition', 'amenities', 'region', 'locality', 'parking_space', 'is_furnished']
# Every other predictor column is routed to the numeric (scaling) branch.
# NOTE(review): assumes X contains no further non-numeric columns -- confirm
# against the CSV schema, otherwise StandardScaler will fail on them.
num_features = [name for name in X.columns if name not in cat_features]

# Numeric branch: standardise each column.
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical branch: one-hot encode; categories not seen during fit are
# ignored at transform time instead of raising.
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each group of columns through its own branch.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, num_features),
    ('cat', categorical_transformer, cat_features),
])

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# LINEAR REGRESSION PIPELINE
# Baseline model: the shared preprocessing followed by a linear regressor.
lr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Fit on the training split, then predict the held-out split.
y_pred_lr = lr_pipeline.fit(X_train, y_train).predict(X_test)

# Evaluation on the test split: root mean squared error and R^2.
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)
r2_lr = r2_score(y_test, y_pred_lr)
print("Linear Regression - RMSE: {:.2f}, R2: {:.2f}".format(rmse_lr, r2_lr))


# RANDOM FOREST PIPELINE
# Same preprocessing as the baseline, followed by a seeded random forest.
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
])

# Hyperparameter grid; the 'regressor__' prefix targets the pipeline's
# 'regressor' step.
rf_param_grid = dict(
    regressor__n_estimators=[50, 100],
    regressor__max_depth=[None, 10, 20],
)

# 3-fold cross-validated search scored by negative mean squared error.
rf_grid = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=rf_param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
)
# After fitting, predict() delegates to the best estimator found by the search.
y_pred_rf = rf_grid.fit(X_train, y_train).predict(X_test)

# Evaluation on the test split.
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred_rf)
print("Random Forest - Best Params:", rf_grid.best_params_)
print("Random Forest - RMSE: {:.2f}, R2: {:.2f}".format(rmse_rf, r2_rf))


# XGBOOST PIPELINE
# Same preprocessing, followed by a seeded gradient-boosted regressor with a
# squared-error objective.
xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb.XGBRegressor(objective='reg:squarederror', random_state=42)),
])

# Hyperparameter grid; the 'regressor__' prefix targets the pipeline's
# 'regressor' step.
xgb_param_grid = dict(
    regressor__n_estimators=[50, 100],
    regressor__max_depth=[3, 6, 10],
    regressor__learning_rate=[0.01, 0.1, 0.2],
)

# 3-fold cross-validated search scored by negative mean squared error.
xgb_grid = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=xgb_param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
)
# After fitting, predict() delegates to the best estimator found by the search.
y_pred_xgb = xgb_grid.fit(X_train, y_train).predict(X_test)

# Evaluation on the test split.
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)
print("XGBoost - Best Params:", xgb_grid.best_params_)
print("XGBoost - RMSE: {:.2f}, R2: {:.2f}".format(rmse_xgb, r2_xgb))


# 6. Key Factors Influencing Rental Prices
# Tree-based models expose per-feature importances. A Random Forest is fitted
# on the preprocessed training matrix and its 20 largest importances are
# plotted.

# Fit the preprocessor on the training split and keep the transformed matrix.
X_train_transformed = preprocessor.fit_transform(X_train)

# Fit a Random Forest (default hyperparameters, fixed seed) on that matrix.
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train_transformed, y_train)

# Read the one-hot column names from the SAME fitted preprocessor that produced
# X_train_transformed. Taking them from a different fitted copy (for example
# the grid search's best estimator) only lines up with the importances if both
# copies happened to learn identical categories.
ohe = preprocessor.named_transformers_['cat'].named_steps['onehot']
cat_feature_names = ohe.get_feature_names_out(cat_features)
# ColumnTransformer emits columns in transformer order: 'num' first, then 'cat'.
all_feature_names = num_features + list(cat_feature_names)

# Fail with a clear message if names and importances ever fall out of step.
if len(all_feature_names) != len(rf_model.feature_importances_):
    raise ValueError(
        "Feature name count ({}) does not match importance count ({})".format(
            len(all_feature_names), len(rf_model.feature_importances_)
        )
    )

# Importance table, largest first.
feat_importances = pd.DataFrame({
    'feature': all_feature_names,
    'importance': rf_model.feature_importances_
}).sort_values(by='importance', ascending=False)

plt.figure(figsize=(12, 8))
sns.barplot(x='importance', y='feature', data=feat_importances.head(20))
plt.title('Top 20 Feature Importances from Random Forest')
plt.show()




XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ["dlopen(/Applications/anaconda3/envs/AI_Class/lib/python3.12/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: '@rpath/libomp.dylib'\n  Referenced from: '/Applications/anaconda3/envs/AI_Class/lib/python3.12/site-packages/xgboost/lib/libxgboost.dylib'\n  Reason: tried: '/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/Applications/anaconda3/envs/AI_Class/lib/python3.12/lib-dynload/../../libomp.dylib' (no such file), '/Applications/anaconda3/envs/AI_Class/bin/../lib/libomp.dylib' (no such file), '/usr/local/lib/libomp.dylib' (no such file), '/usr/lib/libomp.dylib' (no such file)"]


In [None]:

# We use Streamlit to create a dashboard for stakeholders to input parameters and get rental price predictions. To run the dashboard, save this section in a separate Python file (e.g., `app.py`) and run `streamlit run app.py`.

import streamlit as st
import pandas as pd
import numpy as np
import pickle
from sklearn.pipeline import Pipeline

# File holding the pickled, fitted model pipeline used by the dashboard.
MODEL_PATH = 'xgb_pipeline.pkl'

# Notebook side: persist the tuned XGBoost pipeline BEFORE anything tries to
# load it. `xgb_grid` only exists in the training notebook's namespace; when
# this section runs as a standalone Streamlit script (app.py) the name is
# absent and the previously written file is used instead.
if 'xgb_grid' in globals():
    with open(MODEL_PATH, 'wb') as f:
        pickle.dump(xgb_grid.best_estimator_, f)

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load a model file that this project produced itself.
with open(MODEL_PATH, 'rb') as f:
    model_pipeline = pickle.load(f)


def _learned_categories(pipeline, feature):
    """Return the values the pipeline's one-hot encoder learned for `feature`.

    The choices are read from the fitted model rather than from the training
    DataFrame, so the dashboard does not need the notebook's `df`.

    Parameters
    ----------
    pipeline : fitted Pipeline with a 'preprocessor' ColumnTransformer whose
        'cat' branch is a Pipeline containing an 'onehot' OneHotEncoder.
    feature : str
        Name of one of the columns routed to the 'cat' branch.

    Returns
    -------
    list
        The categories learned for that column during fitting.

    Raises
    ------
    ValueError
        If `feature` is not one of the 'cat' branch columns.
    """
    fitted_preprocessor = pipeline.named_steps['preprocessor']
    # transformers_ holds (name, fitted_transformer, columns) tuples.
    cat_columns = next(
        list(columns)
        for name, _, columns in fitted_preprocessor.transformers_
        if name == 'cat'
    )
    encoder = fitted_preprocessor.named_transformers_['cat'].named_steps['onehot']
    # categories_ is ordered like the columns the encoder was fitted on.
    return list(encoder.categories_[cat_columns.index(feature)])


st.title("Ghana Rental Price Predictor")

st.sidebar.header("Input Property Details")

# Define user input fields (customize based on available features)
category = st.sidebar.selectbox("Property Category", ['apartment', 'house', 'room', 'office', 'Unknown'])
condition = st.sidebar.selectbox("Condition", ['new', 'used', 'off-plan', 'Unknown'])
amenities = st.sidebar.text_input("Amenities (comma separated)", "wifi, water")
# Region and locality choices are the categories the model was trained on; any
# other value would be ignored by the encoder (handle_unknown='ignore').
region = st.sidebar.selectbox("Region", _learned_categories(model_pipeline, 'region'))
locality = st.sidebar.selectbox("Locality", _learned_categories(model_pipeline, 'locality'))
parking_space = st.sidebar.selectbox("Parking Space", ['Yes', 'No', 'Unknown'])
is_furnished = st.sidebar.selectbox("Is Furnished", ['Yes', 'No', 'Unknown'])

bedrooms = st.sidebar.number_input("Number of Bedrooms", min_value=0, value=1)
bathrooms = st.sidebar.number_input("Number of Bathrooms", min_value=0, value=1)
floor_area = st.sidebar.number_input("Floor Area (sq meters)", min_value=0.0, value=50.0)
lat = st.sidebar.number_input("Latitude", value=5.6037)
lng = st.sidebar.number_input("Longitude", value=-0.1870)

# Single-row frame carrying the user's inputs under the training column names.
# NOTE(review): assumes these twelve columns are exactly the predictors the
# pipeline was fitted on -- confirm against the training data.
input_data = pd.DataFrame({
    'category': [category],
    'condition': [condition],
    'amenities': [amenities],
    'region': [region],
    'locality': [locality],
    'parking_space': [parking_space],
    'is_furnished': [is_furnished],
    'bedrooms': [bedrooms],
    'bathrooms': [bathrooms],
    'floor_area': [floor_area],
    'lat': [lat],
    'lng': [lng]
})

# Predict only when the user asks for it.
if st.sidebar.button("Predict Rental Price"):
    prediction = model_pipeline.predict(input_data)
    st.write("Estimated Rental Price (GHS):")
    st.write(np.round(prediction[0], 2))



