In [4]:
# Importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import joblib
import json


In [5]:
def load_and_preprocess_data(file_path, drop_columns=[]):
    data = pd.read_csv(file_path, nrows=90000)
    data = data[data['purpose'] == "For Sale"]
    data.drop(columns=drop_columns, inplace=True)
    return data

In [6]:
def encode_data(data, mapping_file):
    with open(mapping_file, 'r') as file:
        loaded_json_mapping = json.load(file)
    inverted_mapping = {v: int(k) for k, v in loaded_json_mapping.items()}
    return data.map(inverted_mapping)

In [7]:
data= load_and_preprocess_data('Property_with_Feature_Engineering.csv', drop_columns=['property_id', 'location_id', 'page_url', 'locality', 'area', 'date_added', 'agency', 'agent', 'purpose'])
data.head()

Unnamed: 0,property_type,price,price_bin,location,city,province_name,latitude,longitude,baths,area_marla,area_sqft,bedrooms,year,month,day
0,House,220000000,Very High,Model Town,Lahore,Punjab,31.483869,74.325686,0,120.0,32670.12,0,2019,7,17
1,House,40000000,Very High,Multan Road,Lahore,Punjab,31.431593,74.17998,5,20.0,5445.02,5,2018,10,6
2,House,9500000,Low,Eden,Lahore,Punjab,31.499348,74.416959,0,9.0,2450.26,3,2019,7,3
3,House,125000000,Very High,Gulberg,Lahore,Punjab,31.522069,74.355512,7,20.0,5445.02,8,2019,4,4
4,House,21000000,High,Allama Iqbal Town,Lahore,Punjab,31.506483,74.286017,5,11.0,2994.76,6,2019,4,4


In [8]:
# Encode categorical data
# Define inflation rates for each year (from 2019 to 2024)
inflation_rates = [0.1058, 0.0974, 0.0950, 0.1987, 0.2918]  # Replace with actual inflation rates for each year
# Adjust prices in the dataset for inflation
for i, rate in enumerate(inflation_rates):
    data['price'] = data['price'] * (1 + rate)
for column in ['city', 'location', 'province_name', 'property_type', 'price_bin']:
    data[column] = encode_data(data[column], f'{column}_mapping.json')


In [9]:
# Selecting relevant features
features = ['price', 'area_marla', 'baths', 'bedrooms', 'area_sqft', 'location', 'city']  # Include 'city' in features
data_copy = data[features]
data_copy.head()


Unnamed: 0,price,area_marla,baths,bedrooms,area_sqft,location,city
0,452672500.0,120.0,0,0,32670.12,980,3
1,82304090.0,20.0,5,5,5445.02,1002,3
2,19547220.0,9.0,0,3,2450.26,430,3
3,257200300.0,20.0,7,8,5445.02,574,3
4,43209650.0,11.0,5,6,2994.76,120,3


In [10]:
data_copy.loc[:, 'price'] = data['price'].astype(int)
data_copy.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_copy.loc[:, 'price'] = data['price'].astype(int)
  data_copy.loc[:, 'price'] = data['price'].astype(int)


Unnamed: 0,price,area_marla,baths,bedrooms,area_sqft,location,city
0,452672515,120.0,0,0,32670.12,980,3
1,82304093,20.0,5,5,5445.02,1002,3
2,19547222,9.0,0,3,2450.26,430,3
3,257200293,20.0,7,8,5445.02,574,3
4,43209649,11.0,5,6,2994.76,120,3


In [11]:
# Split the data into features and target variable
X = data_copy[['baths','bedrooms','area_sqft','city','location']]  # Features
y = data_copy["price"]  # Target variable


In [12]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [13]:
# Train the model (using Random Forest Regressor)
model = RandomForestRegressor(n_estimators=100, random_state=42, verbose=1)
model.fit(X_train, y_train)
#train_score = model.score(X_train, y_train)
#test_score = model.score(X_test, y_test)
#model.evaluate(X_test,y_test)

# print("Training R^2 score:", train_score)
# print("Testing R^2 score:", test_score)
#Make predictions
# y_pred = model.predict(X_test)

# # Calculate Mean Absolute Error
# mae = mean_absolute_error(y_test, y_pred)
# print("Mean Absolute Error:", mae)



[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:   11.3s


In [14]:
# Save the trained model
joblib.dump(model, "housing_model.pkl")  # Save the model as "housing_model.pkl"


['housing_model.pkl']

In [15]:
res=model.predict(X_test)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.3s


In [16]:
print(res)

[9.13301035e+06 8.20434737e+06 1.25432179e+07 ... 2.21455230e+08
 5.24817193e+07 1.46301017e+07]


In [17]:
from sklearn.metrics import mean_absolute_error

In [18]:
print('Mean Absolute Error for Training Set:', mean_absolute_error(y_train, model.predict(X_train)))

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.9s


Mean Absolute Error for Training Set: 7783102.762339554


In [19]:
city_mapping_file='city_mapping.json'
location_mapping_file='location_mapping.json'

def encode_location(location_name, mapping_file):
    with open(mapping_file, 'r') as file:
        mapping = json.load(file)
    for key, value in mapping.items():
        
        if value == location_name:
            return int(key)
    return None

def encode_city(city_name, mapping_file):
    with open(mapping_file, 'r') as file:
        mapping = json.load(file)
    for key, value in mapping.items():
        
        if value == city_name:
            return int(key)
    return None
    
# Creating a simple chatbot interface
def predict_house_prices(baths, bedrooms, area_sqft, city_name, location_name, budget, 
                         city_mapping_file, location_mapping_file):
    # Encode city and location names
    
    encoded_city=encode_city(city_name,city_mapping_file)
    encoded_location=encode_location(location_name,location_mapping_file)
    print(encoded_city)
    print(encoded_location)
    
    # Check if city and location were successfully encoded
    if encoded_city == -1 or encoded_location == -1:
        return "Error: City or Location not found in mapping"
    
    # Load the trained model
    model = joblib.load("housing_model.pkl")
    
    # Prepare input data
    input_data = pd.DataFrame({
        'baths': [baths],
        'bedrooms': [bedrooms],
        'area_sqft': [area_sqft],
        'city': [encoded_city],
        'location': [encoded_location]
    })
    
    # Predict the house price
    predicted_price = model.predict(input_data)[0]
   
    return predicted_price


In [20]:
def recommend_properties(budget, baths, bedrooms,city):
    # Filter properties based on conditions
    suitable_properties = data_copy[(data_copy["price"] <= budget) & (data_copy["baths"] >= baths) & (data_copy["bedrooms"] >= bedrooms)]

    # Get unique locations and cities for recommended properties
    unique_locations = suitable_properties['location'].unique()
    unique_cities = suitable_properties['city'].unique()

    return unique_locations, unique_cities



In [21]:
# Example usage of the chatbot
input_str = input("Please input baths, bedrooms, area_sqft, city, location and budget (in rupees), separated by commas (e.g., '5,5,2000,Lahore,location, 20000000'): ")
baths, bedrooms, area_sqft,city, location, budget = input_str.split(',')
baths = int(baths)
bedrooms = int(bedrooms)
area_sqft = float(area_sqft)
budget = int(budget)

predicted_price = predict_house_prices(baths, bedrooms, area_sqft, city,location, budget, city_mapping_file,location_mapping_file)
print(f"Predicted price for the property: {predicted_price}")

# recommended_locations, recommended_cities = recommend_properties(budget, baths, bedrooms,city)
# print("\nRecommended Locations:")
# print(recommended_locations)
# print("\nRecommended Cities:")
# print(recommended_cities)

3
762
Predicted price for the property: 33669003.88916883


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


In [22]:
mapping_file = 'location_mapping.json'  # Path to your mapping file
def decode_locations(encoded_locations, mapping_file):
    with open(mapping_file, 'r') as file:
        mapping = json.load(file)
    decoded_locations = [mapping[str(encoded)] for encoded in encoded_locations]
    return decoded_locations

def suggest_preferred_locations(budget, baths, bedrooms, city_name, location_name,
                                city_mapping_file, location_mapping_file, data_copy):
    # Encode city and location names
    encoded_city = encode_city(city_name, city_mapping_file)
    # print(encoded_city)
    encoded_location = encode_location(location_name, location_mapping_file)
    
    # Check if city and location were successfully encoded
    if encoded_city is None or encoded_location is None:
        return "Error: City or Location not found in mapping"
    
    # Filter properties based on conditions
    suitable_properties = data_copy[(data_copy["price"] < budget) & 
                                    (data_copy["baths"] >= baths) &
                                    (data_copy["bedrooms"] >= bedrooms) & 
                                    (data_copy["city"] == encoded_city) &
                                    (data_copy["area_sqft"] >= area_sqft)]


    # Get unique locations for recommended properties
    unique_locations = suitable_properties['location'].unique()
    unique_locations = unique_locations[:10]
    decoded_locations = decode_locations(unique_locations, location_mapping_file)
    return decoded_locations

# Example usage of the function
budget = 20000000
baths = 5
bedrooms = 5
city_name = "Lahore"  # Example city name
location_name = "Gulberg"  # Example location name
city_mapping_file = 'city_mapping.json'  # Path to your city mapping file
location_mapping_file = 'location_mapping.json'  # Path to your location mapping file

preferred_locations = suggest_preferred_locations(budget, baths, bedrooms, city_name, location_name,
                                                  city_mapping_file, location_mapping_file, data_copy)

print("Preferred Locations:")
print(preferred_locations)

            price  area_marla  baths  bedrooms  area_sqft  location  city
4361     15843538        10.0      7         5    2722.51       876     3
6591      7685144        10.0      6         5    2722.51       876     3
7772     10288011        10.0      6         5    2722.51      1466     3
8776     15637777        10.0      6         5    2722.51       355     3
10014           8        20.0      6         5    5445.02      1501     3
10077    17489619         8.0     10        11    2178.01       194     3
65115    19547222        10.0      7         7    2722.51      1337     3
72863 -2147483648       260.0      9         8   70785.26       574     3
81536    18518421        10.0      5         5    2722.51       871     3
85092    19547222         8.0      7         5    2178.01       204     3
86164    19547222         8.0      7         5    2178.01       204     3
86165    19547222         8.0      7         5    2178.01       204     3
Preferred Locations:
['Lake City', 'To