In [416]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import normalize


In [417]:
df = pd.read_csv("./data/clean_data_with_region.csv")

In [418]:
print("Number of rows (m) is " + str(df.shape[0]))
print("Number of features (n) is " + str(df.shape[1]))

Number of rows (m) is 9393
Number of features (n) is 25


In [419]:
df.columns

Index(['Property ID', 'Locality name', 'Postal code', 'price',
       'Type of property', 'Construction year', 'Number of rooms',
       'Surface of the plot', 'Living area', 'kitchen', 'furnished',
       'Open fire', 'Terrace', 'Garden', 'Garden orientation',
       'Number of facades', 'Swimming pool', 'State of builing',
       'Energy class', 'Primary energy consumption', 'Heating type',
       'Flood zone type', 'Double glazing', 'Cadastral income', 'region'],
      dtype='object')

In [420]:
df = df.drop(columns=['Locality name', 'region'])
# Deleted columns 'Locality name' and 'region'. Those can be derived from 'Postal code'

In [421]:
df.isnull().sum(axis = 0)

Property ID                      0
Postal code                      0
price                            0
Type of property                 0
Construction year             2683
Number of rooms                 75
Surface of the plot           4247
Living area                    438
kitchen                       2000
furnished                     5472
Open fire                     9029
Terrace                       5614
Garden                        7150
Garden orientation            8572
Number of facades             2881
Swimming pool                 7079
State of builing              1601
Energy class                     0
Primary energy consumption    1516
Heating type                  2640
Flood zone type               3130
Double glazing                1920
Cadastral income              4024
dtype: int64

In [422]:
df_cleaned = df
for column in df_cleaned:
    if df_cleaned[column].isnull().sum(axis = 0) > len(df_cleaned) * 0.3:
        df_cleaned = df_cleaned.drop(columns=[column])

df_cleaned.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9393 entries, 0 to 9392
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Property ID                 9393 non-null   int64  
 1   Postal code                 9393 non-null   int64  
 2   price                       9393 non-null   int64  
 3   Type of property            9393 non-null   object 
 4   Construction year           6710 non-null   float64
 5   Number of rooms             9318 non-null   float64
 6   Living area                 8955 non-null   float64
 7   kitchen                     7393 non-null   float64
 8   State of builing            7792 non-null   object 
 9   Energy class                9393 non-null   object 
 10  Primary energy consumption  7877 non-null   float64
 11  Heating type                6753 non-null   object 
 12  Double glazing              7473 non-null   object 
dtypes: float64(5), int64(3), object(5

In [423]:
# Splitting the df in 2 dataframes. One with houses and one with apartments to see if the missing values of garden and terrace are below 30%
df_houses = df[df['Type of property'] == 'house']
df_apartment = df[df['Type of property'] == 'apartment']

In [424]:
# Column Type of property is no longer relevant
df_houses = df_houses.drop(columns=['Type of property'])
df_apartment = df_apartment.drop(columns=['Type of property'])


In [425]:
# Delete columns with more than 30% missing values
for column in df_houses:
    if df_houses[column].isnull().sum(axis = 0) > len(df_houses) * 0.3:
        df_houses = df_houses.drop(columns=[column])
print(df_houses.info())    

<class 'pandas.core.frame.DataFrame'>
Index: 5562 entries, 0 to 9392
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Property ID                 5562 non-null   int64  
 1   Postal code                 5562 non-null   int64  
 2   price                       5562 non-null   int64  
 3   Number of rooms             5514 non-null   float64
 4   Surface of the plot         5146 non-null   float64
 5   Living area                 5247 non-null   float64
 6   kitchen                     4418 non-null   float64
 7   Number of facades           4245 non-null   float64
 8   State of builing            4655 non-null   object 
 9   Energy class                5562 non-null   object 
 10  Primary energy consumption  4667 non-null   float64
 11  Heating type                4032 non-null   object 
 12  Double glazing              4412 non-null   object 
dtypes: float64(6), int64(3), object(4)
mem

In [426]:
# Delete columns with more than 30% missing values
for column in df_apartment:
    if df_apartment[column].isnull().sum(axis = 0) > len(df_apartment) * 0.3:
        df_apartment = df_apartment.drop(columns=[column])
        
print(df_apartment.info())

<class 'pandas.core.frame.DataFrame'>
Index: 3831 entries, 7 to 9391
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Property ID                 3831 non-null   int64  
 1   Postal code                 3831 non-null   int64  
 2   price                       3831 non-null   int64  
 3   Construction year           3032 non-null   float64
 4   Number of rooms             3804 non-null   float64
 5   Living area                 3708 non-null   float64
 6   kitchen                     2975 non-null   float64
 7   State of builing            3137 non-null   object 
 8   Energy class                3831 non-null   object 
 9   Primary energy consumption  3210 non-null   float64
 10  Heating type                2721 non-null   object 
 11  Double glazing              3061 non-null   object 
dtypes: float64(5), int64(3), object(4)
memory usage: 389.1+ KB
None


In [427]:
print(df_cleaned.columns)
print(df_apartment.columns)
print(df_houses.columns)

Index(['Property ID', 'Postal code', 'price', 'Type of property',
       'Construction year', 'Number of rooms', 'Living area', 'kitchen',
       'State of builing', 'Energy class', 'Primary energy consumption',
       'Heating type', 'Double glazing'],
      dtype='object')
Index(['Property ID', 'Postal code', 'price', 'Construction year',
       'Number of rooms', 'Living area', 'kitchen', 'State of builing',
       'Energy class', 'Primary energy consumption', 'Heating type',
       'Double glazing'],
      dtype='object')
Index(['Property ID', 'Postal code', 'price', 'Number of rooms',
       'Surface of the plot', 'Living area', 'kitchen', 'Number of facades',
       'State of builing', 'Energy class', 'Primary energy consumption',
       'Heating type', 'Double glazing'],
      dtype='object')


Garden and terrace still have more than 30% missing values if you split up the dataframe. So I will continue using the main dataframe (df_cleaned)

In [428]:
# Flanders and Wallonia use a different system to determine the energy class. 
# So I removed that column because the label is not the same in the different regions. 
# The label is calculated on the basis of primary energy consumption and we do keep this column.
df_cleaned = df_cleaned.drop(columns=['Energy class'])

# Property ID is not a factor that determines the price
df_cleaned = df_cleaned.drop(columns=['Property ID'])

In [429]:
df_cleaned.describe(include="all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Postal code,9393.0,,,,5208.33493,3129.362311,1000.0,2220.0,4430.0,8500.0,9992.0
price,9393.0,,,,399892.698605,359376.071038,15000.0,229000.0,320000.0,448000.0,6495000.0
Type of property,9393.0,2.0,house,5562.0,,,,,,,
Construction year,6710.0,,,,1972.405365,40.676357,1753.0,1951.0,1973.0,2007.0,2027.0
Number of rooms,9318.0,,,,2.884954,1.247544,1.0,2.0,3.0,3.0,24.0
Living area,8955.0,,,,152.949637,88.787893,18.0,94.0,134.0,185.0,1600.0
kitchen,7393.0,,,,0.95374,0.210062,0.0,1.0,1.0,1.0,1.0
State of builing,7792.0,6.0,Good,3698.0,,,,,,,
Primary energy consumption,7877.0,,,,299.347467,212.486756,1.0,152.0,249.0,388.0,1995.0
Heating type,6753.0,7.0,Gas,4952.0,,,,,,,


In [430]:
# Filling NaN
df_cleaned['Double glazing'].unique()
df_cleaned['Double glazing'].fillna('Yes', inplace=True)
df_cleaned['Double glazing'].unique()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['Double glazing'].fillna('Yes', inplace=True)


array(['Yes', 'No'], dtype=object)

In [431]:
df_cleaned['Construction year'].fillna(df_cleaned['Construction year'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['Construction year'].fillna(df_cleaned['Construction year'].mean(), inplace=True)


In [432]:
df_cleaned['Number of rooms'].fillna(df_cleaned['Number of rooms'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['Number of rooms'].fillna(df_cleaned['Number of rooms'].mode()[0], inplace=True)


In [433]:
df_cleaned['Living area'].fillna(df_cleaned['Living area'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['Living area'].fillna(df_cleaned['Living area'].mean(), inplace=True)


In [434]:
df_cleaned['kitchen'].fillna(df_cleaned['kitchen'].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['kitchen'].fillna(df_cleaned['kitchen'].mode()[0], inplace=True)


In [435]:

df_cleaned['State of builing'].fillna(df_cleaned['State of builing'].mode()[0], inplace=True)
print(df_cleaned['State of builing'].unique())

['As new' 'To renovate' 'Good' 'Just renovated' 'To be done up'
 'To restore']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['State of builing'].fillna(df_cleaned['State of builing'].mode()[0], inplace=True)


In [436]:
df_cleaned['Primary energy consumption'].unique()
df_cleaned['Primary energy consumption'].fillna(df_cleaned['Primary energy consumption'].mean(), inplace=True)
print(df_cleaned['Primary energy consumption'].mean())
print(df_cleaned['Primary energy consumption'].unique())

299.34746730988957
[4.70000000e+01 1.36000000e+02 3.34000000e+02 2.87000000e+02
 2.31000000e+02 2.99347467e+02 3.59000000e+02 1.32000000e+02
 1.07900000e+03 4.08000000e+02 1.72000000e+02 1.34000000e+02
 3.18000000e+02 3.00000000e+02 4.52000000e+02 1.47000000e+02
 3.16000000e+02 2.27000000e+02 2.70000000e+02 5.04000000e+02
 1.97000000e+02 9.22000000e+02 9.80000000e+01 1.33000000e+02
 7.22000000e+02 2.30000000e+02 5.60000000e+02 2.41000000e+02
 2.91000000e+02 1.08000000e+02 5.31000000e+02 3.99000000e+02
 4.47000000e+02 3.31000000e+02 2.74000000e+02 1.85000000e+02
 3.92000000e+02 3.62000000e+02 1.89000000e+02 3.89000000e+02
 2.23000000e+02 1.49000000e+02 2.33000000e+02 1.07000000e+02
 2.03000000e+02 3.87000000e+02 1.91000000e+02 1.21000000e+02
 9.70000000e+01 5.89000000e+02 2.72000000e+02 7.95000000e+02
 3.97000000e+02 2.40000000e+02 1.64000000e+02 1.45000000e+02
 1.78000000e+02 3.65000000e+02 5.30000000e+01 1.60000000e+01
 8.60000000e+01 5.55000000e+02 7.55000000e+02 4.33000000e+02
 1.13

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['Primary energy consumption'].fillna(df_cleaned['Primary energy consumption'].mean(), inplace=True)


In [437]:
df_cleaned['Heating type'].unique()
df_cleaned['Heating type'].fillna('Gas', inplace=True)
df_cleaned['Heating type'].unique()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['Heating type'].fillna('Gas', inplace=True)


array(['Gas', 'Fuel oil', 'Pellet', 'Electric', 'Wood', 'Solar', 'Carbon'],
      dtype=object)

In [438]:
df_cleaned.describe(include="all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Postal code,9393.0,,,,5208.33493,3129.362311,1000.0,2220.0,4430.0,8500.0,9992.0
price,9393.0,,,,399892.698605,359376.071038,15000.0,229000.0,320000.0,448000.0,6495000.0
Type of property,9393.0,2.0,house,5562.0,,,,,,,
Construction year,9393.0,,,,1972.405365,34.378894,1753.0,1962.0,1972.405365,1993.0,2027.0
Number of rooms,9393.0,,,,2.885872,1.242595,1.0,2.0,3.0,3.0,24.0
Living area,9393.0,,,,152.949637,86.692845,18.0,95.0,140.0,182.0,1600.0
kitchen,9393.0,,,,0.96359,0.187318,0.0,1.0,1.0,1.0,1.0
State of builing,9393.0,6.0,Good,5299.0,,,,,,,
Primary energy consumption,9393.0,,,,299.347467,194.583351,1.0,171.0,294.0,351.0,1995.0
Heating type,9393.0,7.0,Gas,7592.0,,,,,,,


In [439]:

# create encoder object
enc = OneHotEncoder(sparse_output=False).set_output(transform="pandas")

# apply fit method to the data frame

output = enc.fit_transform(df_cleaned[['Double glazing']])

In [440]:
output['Double glazing_Yes'].unique()

array([1., 0.])

In [441]:
df_new = df_cleaned.drop('Double glazing', axis=1).join(output.drop('Double glazing_No', axis=1))

In [442]:
df_new.describe(include="all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Postal code,9393.0,,,,5208.33493,3129.362311,1000.0,2220.0,4430.0,8500.0,9992.0
price,9393.0,,,,399892.698605,359376.071038,15000.0,229000.0,320000.0,448000.0,6495000.0
Type of property,9393.0,2.0,house,5562.0,,,,,,,
Construction year,9393.0,,,,1972.405365,34.378894,1753.0,1962.0,1972.405365,1993.0,2027.0
Number of rooms,9393.0,,,,2.885872,1.242595,1.0,2.0,3.0,3.0,24.0
Living area,9393.0,,,,152.949637,86.692845,18.0,95.0,140.0,182.0,1600.0
kitchen,9393.0,,,,0.96359,0.187318,0.0,1.0,1.0,1.0,1.0
State of builing,9393.0,6.0,Good,5299.0,,,,,,,
Primary energy consumption,9393.0,,,,299.347467,194.583351,1.0,171.0,294.0,351.0,1995.0
Heating type,9393.0,7.0,Gas,7592.0,,,,,,,


In [443]:
df_new['Double glazing_Yes'].unique()

array([1., 0.])

In [444]:
type_of_property_map = {'house': 1, 'apartment':0}

In [445]:
df_new['house_1'] = df_new['Type of property'].map(type_of_property_map)
df_new = df_new.drop(columns=['Type of property'])

In [446]:
df_new.head()

Unnamed: 0,Postal code,price,Construction year,Number of rooms,Living area,kitchen,State of builing,Primary energy consumption,Heating type,Double glazing_Yes,house_1
0,9940,549000,2020.0,4.0,199.0,1.0,As new,47.0,Gas,1.0,1
1,9160,629000,1988.0,3.0,352.0,1.0,As new,136.0,Gas,1.0,1
2,7711,179000,1972.405365,3.0,147.0,0.0,To renovate,334.0,Gas,1.0,1
3,7390,195000,1972.405365,3.0,173.0,1.0,Good,287.0,Fuel oil,1.0,1
4,4606,350000,2000.0,2.0,145.0,1.0,Just renovated,231.0,Pellet,1.0,1


In [447]:
# create encoder object
#enc = OneHotEncoder(sparse_output=False).set_output(transform="pandas")

# apply fit method to the data frame

#output = enc.fit_transform(df_new[['Heating type']])

In [448]:
#print(output)

In [449]:
#output = output.drop(columns=['Heating type_Carbon'])

In [450]:
#df_new = df_new.drop(columns=['Heating type'])

In [451]:
#df_new = df_new.join(output)

In [452]:
#print(df_new)

In [453]:
categorie = [['To restore', 'To renovate', 'To be done up', 'Good', 'Just renovated', 'As new']]

encoder = OrdinalEncoder(categories=categorie)
df_new['State_encoded'] = encoder.fit_transform(df_new[['State of builing']])

df_new['State_encoded']
df_new = df_new.drop(columns=['State of builing'])

In [454]:
print(df_new)

      Postal code   price  Construction year  Number of rooms  Living area  \
0            9940  549000        2020.000000              4.0        199.0   
1            9160  629000        1988.000000              3.0        352.0   
2            7711  179000        1972.405365              3.0        147.0   
3            7390  195000        1972.405365              3.0        173.0   
4            4606  350000        2000.000000              2.0        145.0   
...           ...     ...                ...              ...          ...   
9388         2650  595000        1972.405365              4.0        154.0   
9389         2650  595000        1972.405365              4.0        154.0   
9390         8820  189000        1972.405365              3.0        118.0   
9391         8370  549000        1972.405365              2.0        103.0   
9392         7370  299500        1972.405365              3.0        326.0   

      kitchen  Primary energy consumption Heating type  Double 

In [455]:
df_new = df_new.drop(columns=['Heating type'])

In [456]:
df_new.columns

Index(['Postal code', 'price', 'Construction year', 'Number of rooms',
       'Living area', 'kitchen', 'Primary energy consumption',
       'Double glazing_Yes', 'house_1', 'State_encoded'],
      dtype='object')

In [457]:
from sklearn.preprocessing import MinMaxScaler

scaleMinMax = MinMaxScaler(feature_range=(0,1))
df_new = scaleMinMax.fit_transform(df_new)

In [458]:
df_new = pd.DataFrame(df_new, columns=['Postal code', 'price', 'Construction year', 'Number of rooms',
       'Living area', 'kitchen', 'Primary energy consumption',
       'Double glazing_Yes', 'house_1', 'State_encoded'])

In [467]:
df_new.head()

Unnamed: 0,Postal code,price,Construction year,Number of rooms,Living area,kitchen,Primary energy consumption,Double glazing_Yes,house_1,State_encoded
0,0.994217,0.082407,0.974453,0.130435,0.114412,1.0,0.023069,1.0,1.0,1.0
1,0.907473,0.094753,0.857664,0.086957,0.211125,1.0,0.067703,1.0,1.0,1.0
2,0.74633,0.025309,0.80075,0.086957,0.081542,0.0,0.167001,1.0,1.0,0.2
3,0.710632,0.027778,0.80075,0.086957,0.097977,1.0,0.14343,1.0,1.0,0.6
4,0.401023,0.051698,0.90146,0.043478,0.080278,1.0,0.115346,1.0,1.0,0.8


In [468]:
df_new.describe()

Unnamed: 0,Postal code,price,Construction year,Number of rooms,Living area,kitchen,Primary energy consumption,Double glazing_Yes,house_1,State_encoded
count,9393.0,9393.0,9393.0,9393.0,9393.0,9393.0,9393.0,9393.0,9393.0,9393.0
mean,0.468009,0.059397,0.80075,0.081994,0.085303,0.96359,0.149623,0.967423,0.592143,0.61697
std,0.348016,0.055459,0.12547,0.054026,0.0548,0.187318,0.097584,0.177537,0.491462,0.230134
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.135676,0.033025,0.762774,0.043478,0.048673,1.0,0.085256,1.0,0.0,0.6
50%,0.38145,0.047068,0.80075,0.086957,0.077118,1.0,0.146941,1.0,1.0,0.6
75%,0.834075,0.066821,0.875912,0.086957,0.103666,1.0,0.175527,1.0,1.0,0.6
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [459]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9393 entries, 0 to 9392
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Postal code                 9393 non-null   float64
 1   price                       9393 non-null   float64
 2   Construction year           9393 non-null   float64
 3   Number of rooms             9393 non-null   float64
 4   Living area                 9393 non-null   float64
 5   kitchen                     9393 non-null   float64
 6   Primary energy consumption  9393 non-null   float64
 7   Double glazing_Yes          9393 non-null   float64
 8   house_1                     9393 non-null   float64
 9   State_encoded               9393 non-null   float64
dtypes: float64(10)
memory usage: 734.0 KB


In [460]:
df_new.to_csv('df_new.csv', index=False)

In [461]:

df_new.corr()

Unnamed: 0,Postal code,price,Construction year,Number of rooms,Living area,kitchen,Primary energy consumption,Double glazing_Yes,house_1,State_encoded
Postal code,1.0,0.018357,0.076442,0.04741,0.023756,0.049428,0.044745,-0.063626,0.146866,-0.037772
price,0.018357,1.0,0.088211,0.293451,0.382877,0.076853,-0.160849,-0.012085,0.030376,0.196372
Construction year,0.076442,0.088211,1.0,-0.160128,-0.13336,0.132301,-0.317101,0.000738,-0.271963,0.28235
Number of rooms,0.04741,0.293451,-0.160128,1.0,0.658652,-0.016482,0.111065,-0.029404,0.525975,-0.083629
Living area,0.023756,0.382877,-0.13336,0.658652,1.0,0.002482,0.09892,-0.030103,0.493273,-0.053932
kitchen,0.049428,0.076853,0.132301,-0.016482,0.002482,1.0,-0.148828,-0.019663,-0.041043,0.207976
Primary energy consumption,0.044745,-0.160849,-0.317101,0.111065,0.09892,-0.148828,1.0,-0.103146,0.356906,-0.46668
Double glazing_Yes,-0.063626,-0.012085,0.000738,-0.029404,-0.030103,-0.019663,-0.103146,1.0,-0.048573,0.032296
house_1,0.146866,0.030376,-0.271963,0.525975,0.493273,-0.041043,0.356906,-0.048573,1.0,-0.196363
State_encoded,-0.037772,0.196372,0.28235,-0.083629,-0.053932,0.207976,-0.46668,0.032296,-0.196363,1.0


In [462]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

X = df_new.drop(['price'], axis=1)
y = df_new['price']

# Split the training set into 
# training and validation set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0)

In [463]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [464]:
regressor.score(X_train, y_train)

0.23577331335449936

In [466]:
regressor.score(X_test, y_test)

0.2363194470036457

In [469]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [471]:
y_pred = regressor.predict(X_test)

In [473]:
print(mean_absolute_error(y_test, y_pred))

0.022757529081065694


In [474]:
mean_squared_error(y_test, y_pred)

np.float64(0.0024926950871092366)

In [475]:
r2_score(y_test, y_pred)

0.2363194470036457

In [476]:
regressor.coef_

array([ 0.00441885,  0.01779537,  0.16308274,  0.38132388,  0.00886203,
       -0.03786762, -0.00592468, -0.01969193,  0.03559993])