In [43]:
import pandas as pd
import numpy as np
import pickle
import warnings
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


# Load the listings dataset into `trail`.
# NOTE(review): the path below is absolute and machine-specific, so this cell only runs
# on the original author's machine unless the path is adjusted.
trail = pd.read_csv(r'C:\Users\Karthick Palanivel\Documents\GitHub\Price_prediction\data\final.csv')

In [44]:
# Inspect the column names exactly as they were loaded from the CSV.
trail.columns

Index(['Type of property', 'Location', 'postal code', 'immo code', 'Price',
       'Address', 'Bedrooms', 'Energy class', 'Primary energy consumption',
       'Furnished', 'Terrace', 'Terrace surface', 'Surface of the plot',
       'Living room surface', 'Number of frontages', 'Construction year',
       'Building condition', 'Outdoor parking space', 'Bathrooms',
       'Shower rooms', 'Office', 'Toilets', 'Kitchen type', 'Heating type',
       'province'],
      dtype='object')

In [45]:
# Map the original, human-readable CSV headers to snake_case identifiers.
# Labels that are not present in the frame are ignored by `rename`, so re-running this
# cell after the columns were already renamed is harmless.
snake_case_names = {
    'Type of property': 'type_of_property',
    'Shower rooms': 'shower_rooms',
    'Office': 'office',
    'Construction year': 'construction_year',
    'Outdoor parking space': 'outdoor_parking_space',
    'Furnished': 'furnished',
    'Terrace': 'terrace',
    'Terrace surface': 'terrace_surface',
    'Price': 'price',
    'Address': 'address',
    'Primary energy consumption': 'primary_energy_consumption',
    'Location': 'location',
    'postal code': 'postal_code',
    'immo code': 'immo_code',
    'Energy class': 'energy_class',
    'Bedrooms': 'bedrooms',
    'Bathrooms': 'bathrooms',
    'Toilets': 'toilets',
    'Number of frontages': 'number_of_frontages',
    'Kitchen type': 'kitchen_type',
    'Heating type': 'heating_type',
    'Surface of the plot': 'surface_of_the_plot',
    'Living room surface': 'living_room_surface',
    'province': 'province',
    'Building condition': 'building_condition',
}

# Reassign instead of using inplace=True: the result is the same frame contents, but the
# statement reads as a plain transformation and can be chained.
trail = trail.rename(columns=snake_case_names)

In [48]:
# Check the column names again now that the rename has been applied.
trail.columns

Index(['type_of_property', 'location', 'postal_code', 'immo_code', 'price',
       'address', 'bedrooms', 'energy_class', 'primary_energy_consumption',
       'furnished', 'terrace', 'terrace_surface', 'surface_of_the_plot',
       'living_room_surface', 'number_of_frontages', 'construction_year',
       'building_condition', 'outdoor_parking_space', 'bathrooms',
       'shower_rooms', 'office', 'toilets', 'kitchen_type', 'heating_type',
       'province'],
      dtype='object')

In [52]:

# Remove price outliers, then keep only rows whose postal code has at most four characters.
price_in_range = trail['price'] < 7.1e5
trail = trail[price_in_range]
postal_code_ok = trail['postal_code'].str.len() <= 4
trail = trail[postal_code_ok]

# Drop the rows with 14 frontages and the rows with more than 9 toilets.
# (original note, typo fixed: "after removing, min = 6" -- presumably the remaining
# maximum number of frontages; TODO confirm)
frontage_or_toilet_outlier = (trail['number_of_frontages'] == 14) | (trail['toilets'] > 9)
trail = trail.drop(trail.loc[frontage_or_toilet_outlier].index)

# Bathrooms and shower rooms are treated as the same thing: sum them into 'bathrooms',
# then discard totals above 10 and totals equal to -1, both considered outliers.
trail['bathrooms'] = trail['bathrooms'].add(trail['shower_rooms'])
bathroom_outlier = (trail['bathrooms'] > 10) | (trail['bathrooms'] == -1)
trail = trail.drop(trail.loc[bathroom_outlier].index)

# Collapse the project / block variants of 'type_of_property' into the plain categories.
property_type_aliases = {
    'new-real-estate-project-apartments': 'apartment',
    'new-real-estate-project-houses': 'house',
    'apartment-block': 'apartment',
}
trail['type_of_property'] = trail['type_of_property'].replace(property_type_aliases)

# Columns that are not used as model inputs.
unused_columns = [
    'address', 'postal_code', 'location', 'furnished', 'immo_code',
    'construction_year', 'terrace', 'office', 'primary_energy_consumption',
    'terrace_surface', 'outdoor_parking_space', 'shower_rooms',
]
trail = trail.drop(columns=unused_columns)


# Categorical columns that get one-hot encoded further down.
column = [
    'type_of_property',
    'building_condition',
    'kitchen_type',
    'province',
    'energy_class',
    'heating_type',
]
#column1 = ['Surface of the plot', 'Living room surface', ] # to be scaled
#column3= ['Number of frontages','Bedrooms','Bathrooms', 'Toilets', 'Price', 'postal code', 'Furnished'] # as it is


In [53]:
# One-hot encode the categorical columns listed in `column`.
#
# `sparse=True` is deliberately not passed: a sparse result is already the encoder's
# default, and that keyword was renamed to `sparse_output` in newer scikit-learn
# releases, so spelling it out makes this cell fail there. `.toarray()` turns the
# sparse result into a dense array either way.
ohe = OneHotEncoder(handle_unknown='ignore')
one_hot_encoded_array = ohe.fit_transform(trail[column]).toarray()

# Persist the fitted encoder so the same encoding can be reused later.
# The context manager guarantees the file is flushed and closed after the dump.
with open('ohe.pickle', 'wb') as encoder_file:
    pickle.dump(ohe, encoder_file)

print(one_hot_encoded_array)

[[0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [54]:
# Show the categories the fitted encoder found, one array per encoded column.
ohe.categories_

[array(['apartment', 'bungalow', 'castle', 'chalet', 'country-cottage',
        'duplex', 'exceptional-property', 'farmhouse', 'flat-studio',
        'ground-floor', 'house', 'kot', 'loft', 'manor-house', 'mansion',
        'mixed-use-building', 'other-property', 'penthouse',
        'service-flat', 'town-house', 'triplex', 'villa'], dtype=object),
 array(['0', 'As new', 'Good', 'Just renovated', 'To be done up',
        'To renovate', 'To restore'], dtype=object),
 array(['0', 'Hyper equipped', 'Installed', 'Not installed',
        'Semi equipped', 'USA hyper equipped', 'USA installed',
        'USA semi equipped', 'USA uninstalled'], dtype=object),
 array(['Antwerp', 'Brussels Capital Region', 'East Flanders',
        'Flemish Brabant', 'Hainaut (West)', 'Limburg', 'Liège',
        'Luxembourg (shared with Eastern Hainaut)', 'Namur',
        'Walloon Brabant', 'West Flanders', nan], dtype=object),
 array(['A', 'A+', 'A++', 'B', 'C', 'C_B', 'D', 'E', 'F', 'G', 'G_F', 'NS'],
       dty

In [55]:
# Flatten the per-column category arrays into one flat array of labels; it is used in the
# next cell as the header for the encoded array.
# Encoded columns, in order: 'type_of_property', 'building_condition', 'kitchen_type',
# 'province', 'energy_class', 'heating_type'.
categories = np.concatenate(ohe.categories_)
print(categories)

['apartment' 'bungalow' 'castle' 'chalet' 'country-cottage' 'duplex'
 'exceptional-property' 'farmhouse' 'flat-studio' 'ground-floor' 'house'
 'kot' 'loft' 'manor-house' 'mansion' 'mixed-use-building'
 'other-property' 'penthouse' 'service-flat' 'town-house' 'triplex'
 'villa' '0' 'As new' 'Good' 'Just renovated' 'To be done up'
 'To renovate' 'To restore' '0' 'Hyper equipped' 'Installed'
 'Not installed' 'Semi equipped' 'USA hyper equipped' 'USA installed'
 'USA semi equipped' 'USA uninstalled' 'Antwerp' 'Brussels Capital Region'
 'East Flanders' 'Flemish Brabant' 'Hainaut (West)' 'Limburg' 'Liège'
 'Luxembourg (shared with Eastern Hainaut)' 'Namur' 'Walloon Brabant'
 'West Flanders' nan 'A' 'A+' 'A++' 'B' 'C' 'C_B' 'D' 'E' 'F' 'G' 'G_F'
 'NS' '0' 'Carbon' 'Electric' 'Fuel oil' 'Gas' 'Pellet' 'Solar' 'Wood']


In [56]:
# Create a DataFrame from the encoded array, labelling each column with its category value,
# then remove the unwanted columns.
# The label '0' occurs once for every encoded feature that contains it (see the printed
# categories), so dropping '0' removes all of those columns at once; 'NS', 'C_B' and 'G_F'
# are dropped as well.
# NOTE(review): the labels also include a NaN entry (from 'province'), which is not dropped
# and remains as a column whose name is not a string -- confirm this is intended, since
# non-string column names may be rejected by estimators downstream.
encoded_dataframe = pd.DataFrame(one_hot_encoded_array, columns=categories)
encoded_dataframe = encoded_dataframe.drop(columns=['0', 'NS', 'C_B', 'G_F'])
encoded_dataframe

Unnamed: 0,apartment,bungalow,castle,chalet,country-cottage,duplex,exceptional-property,farmhouse,flat-studio,ground-floor,...,E,F,G,Carbon,Electric,Fuel oil,Gas,Pellet,Solar,Wood
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7562,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7564,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
# The raw categorical columns (listed in `column`) are no longer needed once encoded.
# Remove them and renumber the rows from zero.
trail = trail.drop(columns=column)
trail = trail.reset_index(drop=True)


In [37]:
# Models

def LinearReg(train_x, train_y, test_x, test_y):
    """Fit a linear regression, report its scores and pickle the fitted model.

    Parameters
    ----------
    train_x, train_y
        Features and target used to fit the model.
    test_x, test_y
        Held-out features and target used for the test score.

    Returns
    -------
    None
        Results are printed; the fitted model is written to
        ``linear_pickle.pickle`` in the current working directory.
    """
    print('working in Linear Regression')

    # initialising 'linear regression' and creating a model
    linear = LinearRegression()
    linear.fit(train_x, train_y)

    # score of train (the estimator's own `score` method)
    train_score = linear.score(train_x, train_y)
    print("Train Score:", train_score)

    # score of test
    test_score = linear.score(test_x, test_y)
    print("Test Score:", test_score)

    print('\n')
    print('-----------------------------')
    print('\n')

    # saving the model; the context manager closes the file even if pickling fails
    filename = 'linear_pickle.pickle'
    with open(filename, "wb") as model_file:
        pickle.dump(linear, model_file)


def DecisionTreeReg(train_x, train_y, test_x, test_y):
    """Fit a decision-tree regressor, report its scores and pickle the fitted model.

    Parameters
    ----------
    train_x, train_y
        Features and target used to fit the model.
    test_x, test_y
        Held-out features and target used for the test score.

    Returns
    -------
    None
        Results are printed; the fitted model is written to
        ``DT_pickle.pickle`` in the current working directory.
    """
    print('working in Decision Tree Regression')

    # presumably an earlier setting that was tried:
    # max_depth=7, min_samples_leaf=4, min_samples_split=10
    DT = DecisionTreeRegressor(max_depth=10, min_samples_leaf=10, min_samples_split=10)
    DT.fit(train_x, train_y)

    # score of train (the estimator's own `score` method)
    train_score = DT.score(train_x, train_y)
    print("Train Score:", train_score)

    # score of test
    test_score = DT.score(test_x, test_y)
    print("Test Score:", test_score)

    print('\n')
    print('-----------------------------')
    print('\n')

    # saving the model; the context manager closes the file even if pickling fails
    filename = 'DT_pickle.pickle'
    with open(filename, "wb") as model_file:
        pickle.dump(DT, model_file)

def NeighborsReg(train_x, train_y, test_x, test_y):
    """Fit a 3-nearest-neighbours regressor, report its scores and pickle the model.

    Parameters
    ----------
    train_x, train_y
        Features and target used to fit the model.
    test_x, test_y
        Held-out features and target used for the test score.

    Returns
    -------
    None
        Results are printed; the fitted model is written to
        ``knn_pickle.pickle`` in the current working directory.
    """
    print('working in Neighbors Regression')
    knn = KNeighborsRegressor(n_neighbors=3)
    knn.fit(train_x, train_y)

    # score of train (the estimator's own `score` method)
    train_score = knn.score(train_x, train_y)
    print("Train Score:", train_score)

    # score of test
    test_score = knn.score(test_x, test_y)
    print("Test Score:", test_score)

    print('\n')
    print('-----------------------------')
    print('\n')

    # saving the model; the context manager closes the file even if pickling fails
    filename = 'knn_pickle.pickle'
    with open(filename, "wb") as model_file:
        pickle.dump(knn, model_file)

def RandomForestReg(train_x, train_y, test_x, test_y):
    """Fit a random-forest regressor, report its scores and pickle the fitted model.

    Parameters
    ----------
    train_x, train_y
        Features and target used to fit the model.
    test_x, test_y
        Held-out features and target used for the test score.

    Returns
    -------
    None
        Results are printed; the fitted model is written to
        ``forest_pickle.pickle`` in the current working directory.

    Notes
    -----
    The forest is created without a fixed seed, so scores can differ between runs.
    """
    print('working in Random Forest Regressor')

    # Flatten the targets to 1-D arrays and actually use them below. The original code
    # computed these but then fitted and scored with the un-raveled inputs.
    y_train = np.ravel(train_y)
    y_test = np.ravel(test_y)

    forest = RandomForestRegressor()
    forest.fit(train_x, y_train)

    # score of train (the estimator's own `score` method)
    train_score = forest.score(train_x, y_train)
    print("Train Score:", train_score)

    # score of test
    test_score = forest.score(test_x, y_test)
    print("Test Score:", test_score)

    print('\n')
    print('-----------------------------')
    print('\n')

    # saving the model; the context manager closes the file even if pickling fails
    filename = 'forest_pickle.pickle'
    with open(filename, "wb") as model_file:
        pickle.dump(forest, model_file)

"""
# load model
loaded_model = pickle.load(open(filename, "rb"))

# you can use loaded model to compute predictions
y_predicted = loaded_model.predict(X)
"""

def xgboostReg(train_x, train_y, test_x, test_y):
    """Fit an XGBoost regressor, report scores and MSE, and pickle the fitted model.

    Parameters
    ----------
    train_x, train_y
        Features and target used to fit the model.
    test_x, test_y
        Held-out features and target used for the test metrics.

    Returns
    -------
    None
        Results are printed; the fitted model is written to
        ``xgb_reg_pickle.pickle`` in the current working directory.
    """
    print('working in xgboost')
    xgb_regressor = XGBRegressor()
    xgb_regressor.fit(train_x, train_y)

    # predictions on both splits, needed for the mean squared error
    Y_pred_train = xgb_regressor.predict(train_x)
    Y_pred_test = xgb_regressor.predict(test_x)

    train_mse = mean_squared_error(train_y, Y_pred_train)
    test_mse = mean_squared_error(test_y, Y_pred_test)

    # score of train (the estimator's own `score` method)
    train_score = xgb_regressor.score(train_x, train_y)
    print("Train Score:", train_score)

    # score of test
    test_score = xgb_regressor.score(test_x, test_y)
    print("Test Score:", test_score)

    print("Train MSE:", train_mse)
    print("Test MSE:", test_mse)

    print('\n')
    print('-----------------------------')
    print('\n')

    # saving the model; the context manager closes the file even if pickling fails
    filename = 'xgb_reg_pickle.pickle'
    with open(filename, "wb") as model_file:
        pickle.dump(xgb_regressor, model_file)


In [58]:
# Merge trail (the remaining original columns) with the encoded dataframe, side by side.
# With axis=1 the rows are matched on their index: `trail` had its index reset when the
# categorical columns were dropped, and `encoded_dataframe` was built with a default index.
new = pd.concat([trail,encoded_dataframe], axis=1)
new

Unnamed: 0,price,bedrooms,surface_of_the_plot,living_room_surface,number_of_frontages,bathrooms,toilets,apartment,bungalow,castle,...,E,F,G,Carbon,Electric,Fuel oil,Gas,Pellet,Solar,Wood
0,469000.0,3,760,34,4,1,2,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,285000.0,2,0,0,2,2,2,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,285000.0,2,0,0,2,2,2,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,425000.0,2,0,40,0,1,0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,189000.0,2,360,32,2,1,0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7562,465000.0,2,1221,45,4,1,2,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7563,499000.0,3,17656,0,4,1,2,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7564,450000.0,3,0,0,2,2,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7565,699000.0,3,0,0,3,2,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
# Separate the target ('price') from the features and hold out 20% of the rows for testing.
x = new.drop(['price'], axis=1)
y = new['price']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Work on explicit copies: the column assignments below would otherwise write into frames
# derived from `x`, which is what triggers pandas' SettingWithCopyWarning.
x_train = x_train.copy()
x_test = x_test.copy()

# Normalisation / scaling of the two surface columns.
# The scaler is fitted on the training rows only and then reused, unchanged, on the test rows.
scaled_columns = ['surface_of_the_plot', 'living_room_surface']
minmax_scaler = MinMaxScaler()
x_train[scaled_columns] = minmax_scaler.fit_transform(x_train[scaled_columns])

# Persist the fitted scaler; the context manager closes the file once the dump is done.
with open('minmax_scaler.pickle', 'wb') as scaler_file:
    pickle.dump(minmax_scaler, scaler_file)

x_test[scaled_columns] = minmax_scaler.transform(x_test[scaled_columns])

display(x_train.head())

Unnamed: 0,bedrooms,surface_of_the_plot,living_room_surface,number_of_frontages,bathrooms,toilets,apartment,bungalow,castle,chalet,...,E,F,G,Carbon,Electric,Fuel oil,Gas,Pellet,Solar,Wood
4553,1,0.0,0.0,2,1,1,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1905,3,0.004588,0.125,0,1,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3333,3,0.00621,0.053571,2,2,2,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6452,2,0.0,0.110714,2,1,1,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6620,2,0.004283,0.0,3,1,1,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [41]:
# Silence every warning for the rest of the session so the model reports stay readable.
warnings.filterwarnings("ignore")

# Train, evaluate and pickle each model in turn on the same train/test split.
model_runners = (
    LinearReg,
    DecisionTreeReg,
    NeighborsReg,
    xgboostReg,
    RandomForestReg,
)
for run_model in model_runners:
    run_model(x_train, y_train, x_test, y_test)

working in Linear Regression
Train Score: 0.4025248348806967
Test Score: 0.37981106422012856


-----------------------------


working in Decision Tree Regression
Train Score: 0.47844178880045674
Test Score: 0.3145255049168646


-----------------------------


working in Neighbors Regression
Train Score: 0.6737816037608837
Test Score: 0.3162254478383787


-----------------------------


working in xgboost
Train Score: 0.8105285763559139
Test Score: 0.48622774709285577
Train MSE: 4121022263.972432
Test MSE: 11281223731.751583


-----------------------------


working in Random Forest Regressor
Train Score: 0.9130322207392907
Test Score: 0.4677305077564423


-----------------------------


