# Import necessary libraries


In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import preprocessing
import joblib

# Load the dataset and clean the data

In [2]:
# Read the raw listings. The raw frame keeps its own name so that re-running
# this cell always starts from the untouched file contents.
raw_df = pd.read_csv('housePrice.csv')

# Index labels of rows removed by hand.
# NOTE(review): presumably records with corrupt/outlier values - confirm why
# these four were chosen.
ROWS_TO_DROP = [709, 1604, 570, 2802]

# 'Area' arrives as text: keep the first whitespace-separated token and strip
# thousands separators before converting. The vectorised .str pipeline lets
# missing or empty values become NaN (removed by dropna() below) instead of
# raising inside a per-row lambda.
area_text = (
    raw_df['Area']
    .astype(str)
    .str.split()
    .str[0]
    .str.replace(',', '', regex=False)
)

df = (
    raw_df
    .assign(**{
        # Round the 'Price(USD)' column to the nearest integer
        'Price(USD)': raw_df['Price(USD)'].round(0),
        # astype(float) keeps the column float64 even if every value is whole
        'Area': pd.to_numeric(area_text).astype(float),
    })
    .drop(ROWS_TO_DROP)  # Drop specific rows
    .dropna()            # Discard any row that still has a missing value
)
df.head()

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
0,63.0,1,True,True,True,Shahran,1850000000,61667.0
1,60.0,1,True,True,True,Shahran,1850000000,61667.0
2,79.0,2,True,True,True,Pardis,550000000,18333.0
3,95.0,2,True,True,True,Shahrake Qods,902500000,30083.0
4,123.0,2,True,True,True,Shahrake Gharb,7000000000,233333.0


# Encode categorical features and build the feature matrix

In [3]:
# Encode the address text as an integer code, keeping the readable value in
# 'Original_Address' so the code -> name mapping can be inspected later.
label_encoder = LabelEncoder()
if 'Original_Address' not in df.columns:
    # Guarded so that re-running this cell does not overwrite the saved names
    # with the integer codes produced by a previous run.
    df['Original_Address'] = df['Address']  # Save original 'Address' values
df['Address'] = label_encoder.fit_transform(df['Original_Address'])

# Convert boolean columns to int64
df = df.astype({'Parking': 'int64', 'Warehouse': 'int64', 'Elevator': 'int64'})

# Features and target
# NOTE(review): 'Address' enters X as the label-encoded integer, not as the
# one-hot columns built below; df_encoded is not used by the visible cells.
X = df[['Area', 'Room', 'Parking', 'Warehouse', 'Elevator', 'Address']].values
y = df[['Price(USD)']].values

# One-hot encode the 'Address' column
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and later removed the
# old keyword; try the new name first and fall back for older installations.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False, drop='first')
address_encoded = onehot_encoder.fit_transform(df[['Address']])
feature_names = onehot_encoder.get_feature_names_out(['Address'])

# Reuse df's index for the dummy columns: df may no longer have a contiguous
# 0..n-1 index after earlier row removal, and concat(axis=1) aligns on index
# labels, so a default RangeIndex would misalign rows and add NaN-filled ones.
address_dummies = pd.DataFrame(address_encoded, columns=feature_names, index=df.index)
df_encoded = pd.concat([df, address_dummies], axis=1)

# Drop the original 'Address' column
df_encoded = df_encoded.drop(columns=['Address'])



In [4]:
# Fit a StandardScaler on the feature matrix, replace X with its scaled
# version, and persist the fitted scaler to 'scaler.joblib'.
# NOTE(review): the scaler is fitted on every row of X, before the train/test
# split performed in a later cell - confirm this is intended.
scaler = preprocessing.StandardScaler()
scaler.fit(X)
X = scaler.transform(X.astype(float))
joblib.dump(scaler, 'scaler.joblib')


['scaler.joblib']

# Training/testing process

In [5]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=4
)

# Fit a K-nearest-neighbours regressor on the training portion.
k = 3
knnr = KNeighborsRegressor(n_neighbors=k)
knnr.fit(X_train, y_train)

# Predict the target for the held-out rows.
yhat = knnr.predict(X_test)

# Score the predictions. Both metrics take (y_true, y_pred); MSE is symmetric
# in its two arguments, so the value does not depend on their order.
r2 = r2_score(y_test, yhat)
mse = mean_squared_error(y_test, yhat)
acc = r2 * 100  # R2 expressed as a percentage; it is what gets printed as "Accuracy"
print(f'R2 Score: {r2}')
print(f'Mean Squared Error: {mse}')
print(f'Accuracy: {acc:.2f}%')

# Persist the fitted model to 'knnr_model.joblib'.
joblib.dump(knnr, 'knnr_model.joblib')

R2 Score: 0.7433699252165692
Mean Squared Error: 23521117013.477245
Accuracy: 74.34%


['knnr_model.joblib']

In [6]:
# Show each row's original address text next to the integer code it was
# replaced with.
address_columns = ['Original_Address', 'Address']
df_address_values = df.loc[:, address_columns]
print(df_address_values)

        Original_Address  Address
0                Shahran      156
1                Shahran      156
2                 Pardis      117
3          Shahrake Qods      152
4         Shahrake Gharb      150
...                  ...      ...
3474  Southern Janatabad      163
3475            Niavaran      105
3476              Parand      115
3477              Dorous       39
3478              Parand      115

[3452 rows x 2 columns]
