In [9]:
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
import time
import pickle
import numpy as np

In [10]:
model_file = "../../models/KNR_2"
data_file = "../../data/processed_data_2.csv"

In [11]:
data = pd.read_csv(data_file)
# Process type  ( Apartment: 0, Independent House: 1, Studio Apartment: 2, villa: 3 )
# Process status ( Ready to move: 0, under Construction: 1 )
for column in data:
    if column in ["bhk","status","house_type","new"]:
        # Get the value counts for the column
        value_counts = data[column].value_counts()
        # Find the values that occur fewer than 10 times
        to_drop = value_counts[value_counts < 10].index
        # Drop rows where the column's value is in the to_drop list
        data = data[~data[column].isin(to_drop)]
for column in data:
    if column in ["bhk","status","house_type","new"]:
        print("Column Name: ", column)
        print(data[column].unique())
        print(data[column].value_counts())

Column Name:  bhk
[2 1 3 4 5 6]
bhk
2    20154
1    16596
3     7278
4     1103
5      162
6       39
Name: count, dtype: int64
Column Name:  status
[1 0]
status
0    26904
1    18428
Name: count, dtype: int64
Column Name:  house_type
[0 2 1 3]
house_type
0    44580
2      597
3      110
1       45
Name: count, dtype: int64
Column Name:  new
[0 1]
new
1    27140
0    18192
Name: count, dtype: int64


In [12]:
def preprocessing(data:pd.DataFrame) -> pd.DataFrame:
    # data.drop(["status","new","house_type"], axis = 1, inplace = True)
    data = data.dropna()
    return data


In [13]:
processed_data = preprocessing(data)
for column in processed_data:
    print(column)

bhk
area
status
price_in_USD
house_type
new
region_Agripada
region_Airoli
region_Ambernath East
region_Ambernath West
region_Andheri East
region_Andheri West
region_Anjurdive
region_Badlapur East
region_Badlapur West
region_Bandra East
region_Bandra Kurla Complex
region_Bandra West
region_Belapur
region_Bhandup East
region_Bhandup West
region_Bhayandar East
region_Bhayandar West
region_Bhiwandi
region_Boisar
region_Borivali East
region_Borivali West
region_Byculla
region_Chembur
region_Colaba
region_Dadar East
region_Dadar West
region_Dahisar
region_Deonar
region_Diva
region_Dombivali
region_Dronagiri
region_Ghansoli
region_Ghatkopar East
region_Ghatkopar West
region_Girgaon
region_Goregaon East
region_Goregaon West
region_Jogeshwari East
region_Jogeshwari West
region_Juhu
region_Juinagar
region_Kalamboli
region_Kalwa
region_Kalyan East
region_Kalyan West
region_Kamothe
region_Kandivali East
region_Kandivali West
region_Kanjurmarg
region_Karanjade
region_Karjat
region_Kasheli
region_Kh

In [14]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X = processed_data.drop(labels="price_in_USD",axis=1)
for column in X:
    if column in ["locality","region", "status", "age","type"]:
        X[column] = le.fit_transform(X[column])

Y = processed_data["price_in_USD"]
print(X.shape)
print(Y.shape)

(45332, 114)
(45332,)


In [15]:
from sklearn.model_selection import train_test_split
X_hold, X_test, Y_hold, Y_test = train_test_split(X,Y,test_size=0.1,random_state=0)
X_train, X_val, Y_train,Y_val = train_test_split(X_hold,Y_hold,test_size=0.1,random_state=0)

In [16]:
# from sklearn.model_selection import validation_curve
# 
# parameter_range = np.arange(1, 20, 1)
# 
# # Calculate accuracy on training and test set using the
# # gamma parameter with 5-fold cross validation
# train_score, val_score = validation_curve(KNeighborsRegressor(), X_hold, Y_hold,
#                                           param_name="n_neighbors", param_range=parameter_range ,
#                                           cv=4, scoring="r2")
# mean_train_score = np.mean(train_score, axis=1)
# std_train_score = np.std(train_score, axis=1)
# 
# mean_val_score = np.mean(val_score, axis=1)
# std_val_score = np.std(val_score, axis=1)
# 
# plt.plot(parameter_range, mean_train_score,
#          label="Training Score", color='b')
# plt.plot(parameter_range, mean_val_score,
#          label="Cross Validation Score", color='g')
# # Creating the plot
# plt.title("Validation Curve with K Neighbors Regressor")
# plt.xlabel("Neighbors")
# plt.ylabel("R2 Score")
# plt.tight_layout()
# plt.legend(loc='best')
# plt.show()

In [23]:
start = time.time()
# Create the model
model = KNeighborsRegressor(n_neighbors=20, weights='distance')
print("Start training...")
model.fit(X_train, Y_train)
end = time.time()
print("Time execution : ", end - start)
pickle.dump(model, open(model_file, 'wb'))

Start training...
Time execution :  0.028338193893432617


In [24]:
loaded_model = pickle.load(open(model_file, 'rb'))

In [25]:
#Make the predictions 
pred_table = pd.DataFrame(data={"actual value":Y_test, "predicted":loaded_model.predict(X_test)})
pred_table["difference"] = pred_table["actual value"] - pred_table["predicted"]
pred_table

Unnamed: 0,actual value,predicted,difference
14948,92400.0,92400.000000,0.000000
19778,120000.0,120000.000000,0.000000
27087,45600.0,45600.000000,0.000000
14980,114000.0,116000.000000,-2000.000000
32008,226800.0,684632.081889,-457832.081889
...,...,...,...
19053,165600.0,156000.000000,9600.000000
28963,150000.0,132000.000000,18000.000000
41577,43200.0,55075.000000,-11875.000000
5062,132000.0,144000.000000,-12000.000000


In [26]:
from sklearn.metrics import r2_score
#Test the model
pred_train = loaded_model.predict(X_train)
pred_val = loaded_model.predict(X_val)
pred_test = loaded_model.predict(X_test)
train_accuracy = r2_score(pred_train, Y_train)
val_accuracy = r2_score(pred_val, Y_val)
test_accuracy = r2_score(pred_test, Y_test)
print('Training accuracy: ',train_accuracy)
print('Validation accuracy: ', val_accuracy)
print('Test accuracy: ',test_accuracy)

Training accuracy:  0.9953146443579614
Validation accuracy:  0.7790583812470407
Test accuracy:  0.7347853732503922


In [27]:
from sklearn.metrics import mean_squared_error
mse_train = mean_squared_error(Y_train, pred_train)
mse_val = mean_squared_error(Y_val, pred_val)
mse_test = mean_squared_error(Y_test, pred_test)
print('Training mean squared error: ', mse_train)
print('Validation mean squared error: ', mse_val)
print('Test mean squared error: ', mse_test)

Training mean squared error:  189667039.77541795
Validation mean squared error:  7857111420.257113
Test mean squared error:  9554414744.68697


In [28]:
from sklearn.metrics import mean_absolute_error
mae_train = mean_absolute_error(Y_train, pred_train)
mae_val = mean_absolute_error(Y_val, pred_val)
mae_test = mean_absolute_error(Y_test, pred_test)
print('Training mean absolute error: ', mae_train)
print('Validation mean absolute error: ', mae_val)
print('Test mean absolute error: ', mae_test)

Training mean absolute error:  5716.13947708507
Validation mean absolute error:  35149.05633079118
Test mean absolute error:  36393.57648801467
