In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


# Load the raw listings and discard the columns that are not used as features.
trail = pd.read_csv('../data/final.csv').drop(
    columns=[
        'Energy class', 'Primary energy consumption', 'Heating type', 'Address',
        'Location', 'immo code', 'Construction year', 'Terrace', 'Office',
    ]
)

# Price outliers: keep listings below 710,000.
trail = trail.loc[trail['Price'] < 7.1e5]
# Postal code clean-up: keep codes of at most four characters.
trail = trail.loc[trail['postal code'].str.len() <= 4]

# Discard the rows with exactly 14 frontages and the rows with more than 9 toilets.
trail = trail.loc[~(trail['Number of frontages'] == 14)]
trail = trail.loc[~(trail['Toilets'] > 9)]

# Bathrooms and shower rooms are treated as the same thing: store their sum in
# 'Bathrooms', then discard totals above 10 and totals equal to -1 as outliers.
# NOTE(review): the sum is taken before the -1 rows are removed, so a -1 in only
# one of the two columns lowers the total instead of being filtered - confirm
# this is intended.
trail = trail.assign(Bathrooms=trail['Bathrooms'] + trail['Shower rooms'])
trail = trail.loc[~((trail['Bathrooms'] > 10) | (trail['Bathrooms'] == -1))]

# Collapse the detailed property types into the two base categories.
trail = trail.assign(**{
    'Type of property': trail['Type of property'].replace({
        'new-real-estate-project-apartments': 'apartment',
        'new-real-estate-project-houses': 'house',
        'apartment-block': 'apartment',
    })
})

# One-hot encode the categorical columns and store the postal code as an integer.
trail_with_dummies = pd.get_dummies(
    trail,
    columns=['Type of property', 'Building condition', 'Kitchen type', 'province'],
    dtype='int',
).astype({'postal code': 'int64'})


In [None]:
# Models
def LinearReg(train_x, train_y, test_x, test_y):
    """Fit a ``LinearRegression`` model and print its train and test scores.

    Parameters
    ----------
    train_x, train_y
        Features and target the model is fitted on.
    test_x, test_y
        Features and target used only for scoring.

    Returns
    -------
    None
        The scores (whatever ``LinearRegression.score`` returns; scikit-learn
        documents this as R^2) are printed, followed by a separator.
    """
    print('working in Linear Regression')

    regressor = LinearRegression()
    regressor.fit(train_x, train_y)

    # Score and report one split at a time, training split first.
    for label, features, target in (
        ("Train Score:", train_x, train_y),
        ("Test Score:", test_x, test_y),
    ):
        print(label, regressor.score(features, target))

    # Blank lines and a rule separate this report from the next model's report.
    print('\n', '-----------------------------', '\n', sep='\n')


def DecisionTreeReg(train_x, train_y, test_x, test_y):
    """Fit a ``DecisionTreeRegressor`` and print its train and test scores.

    Parameters
    ----------
    train_x, train_y
        Features and target the tree is fitted on.
    test_x, test_y
        Features and target used only for scoring.

    Returns
    -------
    None
        The values returned by ``DecisionTreeRegressor.score`` are printed,
        followed by a separator.
    """
    print('working in Decision Tree Regression')

    # Growth limits for the tree. The previous comment listed max_depth=7,
    # min_samples_leaf=4, min_samples_split=10 - presumably an earlier trial.
    tree_params = {'max_depth': 10, 'min_samples_leaf': 10, 'min_samples_split': 10}
    tree = DecisionTreeRegressor(**tree_params)
    tree.fit(train_x, train_y)

    # Score and report one split at a time, training split first.
    for label, features, target in (
        ("Train Score:", train_x, train_y),
        ("Test Score:", test_x, test_y),
    ):
        print(label, tree.score(features, target))

    # Blank lines and a rule separate this report from the next model's report.
    print('\n', '-----------------------------', '\n', sep='\n')

def NeighborsReg(train_x, train_y, test_x, test_y):
    """Fit a 3-neighbour ``KNeighborsRegressor`` and print its train and test scores.

    Parameters
    ----------
    train_x, train_y
        Features and target the model is fitted on.
    test_x, test_y
        Features and target used only for scoring.

    Returns
    -------
    None
        The values returned by ``KNeighborsRegressor.score`` are printed,
        followed by a separator.
    """
    print('working in Neighbors Regression')

    neighbours = KNeighborsRegressor(n_neighbors=3)
    neighbours.fit(train_x, train_y)

    # Score and report one split at a time, training split first.
    for label, features, target in (
        ("Train Score:", train_x, train_y),
        ("Test Score:", test_x, test_y),
    ):
        print(label, neighbours.score(features, target))

    # Blank lines and a rule separate this report from the next model's report.
    print('\n', '-----------------------------', '\n', sep='\n')

def RandomForestReg(train_x, train_y, test_x, test_y, random_state=42):
    """Fit a ``RandomForestRegressor`` and print its train and test scores.

    Parameters
    ----------
    train_x, train_y
        Features and target the forest is fitted on.
    test_x, test_y
        Features and target used only for scoring.
    random_state : int or None, default 42
        Seed forwarded to ``RandomForestRegressor`` so repeated runs print the
        same scores. Pass ``None`` to leave the forest unseeded.

    Returns
    -------
    None
        The values returned by ``RandomForestRegressor.score`` are printed,
        followed by a separator.
    """
    print('working in Random Forest Regressor')

    # Flatten the targets to 1-D and actually use them below: the flattened
    # arrays used to be computed and then ignored, so a column-shaped target
    # reached fit()/score() unchanged.
    y_train = np.ravel(train_y)
    y_test = np.ravel(test_y)

    forest = RandomForestRegressor(random_state=random_state)
    forest.fit(train_x, y_train)

    # score of train
    train_score = forest.score(train_x, y_train)
    print("Train Score:", train_score)

    # score of test
    test_score = forest.score(test_x, y_test)
    print("Test Score:", test_score)

    print('\n')
    print('-----------------------------')
    print('\n')

def xgboostReg(train_x, train_y, test_x, test_y):
    """Fit an ``XGBRegressor`` and print its scores and mean squared errors.

    Parameters
    ----------
    train_x, train_y
        Features and target the model is fitted on.
    test_x, test_y
        Features and target used only for evaluation.

    Returns
    -------
    None
        The values returned by ``XGBRegressor.score`` and the
        ``mean_squared_error`` of the predictions are printed for both splits,
        followed by a separator.
    """
    print('working in xgboost')

    booster = XGBRegressor()
    booster.fit(train_x, train_y)

    # Predict both splits first, then measure the squared error of each.
    predictions = [booster.predict(features) for features in (train_x, test_x)]
    train_mse, test_mse = [
        mean_squared_error(actual, predicted)
        for actual, predicted in zip((train_y, test_y), predictions)
    ]

    # Scores are reported before the errors, training split first.
    for label, features, target in (
        ("Train Score:", train_x, train_y),
        ("Test Score:", test_x, test_y),
    ):
        print(label, booster.score(features, target))

    print("Train MSE:", train_mse)
    print("Test MSE:", test_mse)

    # Blank lines and a rule separate this report from the next model's report.
    print('\n', '-----------------------------', '\n', sep='\n')


In [None]:
# 'Office' and 'Terrace' are not among the features: per the original note they
# have more than 5000 missing values, and they are dropped when the data is loaded.
# The target and the dummy columns for category 0 of 'Kitchen type' and
# 'Building condition' are excluded from the features as well.
x = trail_with_dummies.drop(columns=['Price', 'Kitchen type_0', 'Building condition_0'])
y = trail_with_dummies['Price']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# The split may hand back slices of `x`; take explicit copies so the scaled
# values are written into frames we own (no SettingWithCopyWarning, no risk of
# touching `x`).
x_train = x_train.copy()
x_test = x_test.copy()

# Min-max scale the surface columns. The scaler is fitted on the training split
# only and then re-used for the test split, so no test information leaks in.
# NOTE(review): the remaining numeric columns (e.g. 'postal code') stay
# unscaled, which matters for the distance-based KNN model - confirm intended.
surface_columns = ['Terrace surface', 'Surface of the plot', 'Living room surface']
minmax_scaler = MinMaxScaler()
x_train[surface_columns] = minmax_scaler.fit_transform(x_train[surface_columns])
x_test[surface_columns] = minmax_scaler.transform(x_test[surface_columns])

display(x_train.head())

Unnamed: 0,Bedrooms,Furnished,Terrace surface,Surface of the plot,Living room surface,Number of frontages,Outdoor parking space,Bathrooms,Toilets,Type of property_apartment,...,province_Brussels Capital Region,province_East Flanders,province_Flemish Brabant,province_Hainaut (West),province_Limburg,province_Liège,province_Luxembourg (shared with Eastern Hainaut),province_Namur,province_Walloon Brabant,province_West Flanders
5155,1,0,0.0,0.0,0.0,2,0,1,1,1,...,0,0,0,0,0,0,0,0,0,1
2127,3,0,0.0,0.004588,0.125,0,0,1,1,0,...,0,0,0,0,1,0,0,0,0,0
3733,3,0,0.0181,0.00621,0.053571,2,0,2,2,0,...,1,0,0,0,0,0,0,0,0,0
7305,2,0,0.0181,0.0,0.110714,2,0,1,1,1,...,1,0,0,0,0,0,0,0,0,0
7499,2,0,0.0,0.004283,0.0,3,0,1,1,0,...,0,1,0,0,0,0,0,0,0,0


In [None]:
# Train and report every model on the same train/test split, one after another.
for evaluate in (LinearReg, DecisionTreeReg, NeighborsReg, xgboostReg, RandomForestReg):
    evaluate(x_train, y_train, x_test, y_test)

working in Linear Regression
Train Score: 0.36812932688806066
Test Score: 0.3276719503750979


-----------------------------


working in Decision Tree Regression
Train Score: 0.483625769011641
Test Score: 0.3134920748513489


-----------------------------


working in Neighbors Regression
Train Score: 0.6687679553766122
Test Score: 0.3116546630336837


-----------------------------


working in xgboost
Train Score: 0.7901072238869646
Test Score: 0.44303640770420105
Train MSE: 4565188706.417354
Test MSE: 12229603408.077723


-----------------------------


working in Random Forest Regressor
Train Score: 0.9052855649318231
Test Score: 0.44301588101128253


-----------------------------


