In [199]:
import pandas as pd
import numpy as np

In [200]:
df = pd.read_csv('data.csv')

In [201]:
def convert_to_numeric(value):
    try:
        return pd.to_numeric(value)
    except:
        return None
df['Beds'] = df['Beds'].apply(convert_to_numeric)
df['Baths'] = df['Baths'].apply(convert_to_numeric)
df.dropna(inplace=True)
df['Beds'] = df['Beds'].astype('int64')
df['Baths'] = df['Baths'].astype('int64')

In [202]:
pattern = r'PKR([\d.]+) (\w+)'
extracted_data = df['Price'].str.extract(pattern)
extracted_data.columns = ['Numeric', 'Unit']

def convert_value(row):
    numeric = float(row['Numeric'])
    unit = row['Unit']
    if unit == 'Crore':
        return numeric * 10000000  # Crore to actual value
    elif unit == 'Lakh':
        return numeric * 100000    # Lakh to actual value
    elif unit == 'Arab':
        return numeric * 1000000000 
    else:
        return numeric

extracted_data['Converted'] = extracted_data.apply(convert_value, axis=1)

df['Price'] = extracted_data['Converted']
df['Price'] = df['Price'].astype('int64')

In [203]:
df[['Area Number', 'Area Unit']] = df['Area'].str.extract(r'(\d+) (\w+)')
df['Area Number'] = df['Area Number'].astype('int64')

In [204]:
df = df.drop(columns='Area')
df = df.reset_index(drop=True)

In [205]:
df['Rooms'] = df['Baths'] + df['Beds'] + df['Dining Room'] + df['Laundry Room'] + df['Store Rooms'] + df['Kitchens'] + df['Drawing Room'] + df['Gym'] + df['Powder Room'] + df['Steam Room'] + df['No additional rooms'] + df['Prayer Rooms'] + df['Lounge or Sitting Room']
df = df.drop(columns=['Baths','Beds','Dining Room','Laundry Room','Store Rooms','Kitchens','Drawing Room','Gym','Powder Room','Steam Room','No additional rooms','Prayer Rooms','Lounge or Sitting Room'])

In [206]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Area Unit'] = le.fit_transform(df['Area Unit'])

In [207]:
df['Location_ID'] = pd.factorize(df['Location'])[0]
locations = pd.DataFrame({'Location': df['Location'], 'Location_ID': df['Location_ID']})

In [208]:
X = df.iloc[:,2:].values
y = df.iloc[:,1].values

In [209]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [210]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=10)
regressor.fit(X_train,y_train)

y_pred = regressor.predict(X_test)

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)  # RMSE is the square root of MSE
r2 = r2_score(y_test, y_pred)

# Print the metrics
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)

Mean Absolute Error: 16474259.103304736
Mean Squared Error: 2267748017463384.0
Root Mean Squared Error: 47620877.957712874
R-squared: 0.8087457055330998


In [211]:
def predict_house_price(data):
    return regressor.predict(data)
predict_house_price([[1,0,11,2]])

array([92185609.55998024])

In [212]:
import pickle
pickle.dump(regressor,open('Regressor.pkl','wb'))
pickle.dump(locations,open('Location.pkl','wb'))