In [668]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [669]:
df = pd.read_csv("./data/clean_data_with_region.csv")

In [670]:
nan_indices = df[df['Type of property'].isna()].index
print(nan_indices)

Index([], dtype='int64')


In [671]:
y = df[['price']]
X = df.drop(columns=['price'])
X.shape

(9393, 24)

In [672]:
# delete all columns with more than 30% missing values
for column in X:
    if X[column].isnull().sum(axis = 0) > len(X) * 0.3:
        X = X.drop(columns=[column])

In [673]:
X.columns
X.shape

(9393, 14)

In [674]:
X.shape

(9393, 14)

In [675]:
X = X.drop(columns=['Property ID', 'Locality name', 'Energy class', 'region'])

In [676]:
X.columns

Index(['Postal code', 'Type of property', 'Construction year',
       'Number of rooms', 'Living area', 'kitchen', 'State of builing',
       'Primary energy consumption', 'Heating type', 'Double glazing'],
      dtype='object')

In [677]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9393 entries, 0 to 9392
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Postal code                 9393 non-null   int64  
 1   Type of property            9393 non-null   object 
 2   Construction year           6710 non-null   float64
 3   Number of rooms             9318 non-null   float64
 4   Living area                 8955 non-null   float64
 5   kitchen                     7393 non-null   float64
 6   State of builing            7792 non-null   object 
 7   Primary energy consumption  7877 non-null   float64
 8   Heating type                6753 non-null   object 
 9   Double glazing              7473 non-null   float64
dtypes: float64(6), int64(1), object(3)
memory usage: 734.0+ KB


In [678]:
imputer_most_frequent = SimpleImputer(strategy='most_frequent')
columns_to_impute_most_frequent = ['Number of rooms', 'kitchen', 'State of builing', 'Heating type', 'Double glazing']
imputer_mean = SimpleImputer(strategy='mean')
columns_to_impute_mean = ['Construction year', 'Living area', 'Primary energy consumption']

In [679]:
X[columns_to_impute_most_frequent] = imputer_most_frequent.fit_transform(X[columns_to_impute_most_frequent])
X[columns_to_impute_mean] = imputer_mean.fit_transform(X[columns_to_impute_mean])

In [680]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9393 entries, 0 to 9392
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Postal code                 9393 non-null   int64  
 1   Type of property            9393 non-null   object 
 2   Construction year           9393 non-null   float64
 3   Number of rooms             9393 non-null   object 
 4   Living area                 9393 non-null   float64
 5   kitchen                     9393 non-null   object 
 6   State of builing            9393 non-null   object 
 7   Primary energy consumption  9393 non-null   float64
 8   Heating type                9393 non-null   object 
 9   Double glazing              9393 non-null   object 
dtypes: float64(3), int64(1), object(6)
memory usage: 734.0+ KB


In [681]:
categorie = [['To restore', 'To renovate', 'To be done up', 'Good', 'Just renovated', 'As new']]

encoder = OrdinalEncoder(categories=categorie)
X['State_encoded'] = encoder.fit_transform(X[['State of builing']])

print(X['State_encoded'])
X = X.drop(columns=['State of builing'])

0       5.0
1       5.0
2       1.0
3       3.0
4       4.0
       ... 
9388    5.0
9389    5.0
9390    5.0
9391    3.0
9392    3.0
Name: State_encoded, Length: 9393, dtype: float64


In [683]:
# create encoder object
enc = OneHotEncoder(sparse_output=False, drop='first').set_output(transform="pandas")

# apply fit method to the data frame

encoded_data = enc.fit_transform(X[['Heating type', 'Type of property']])

X = pd.concat([X.drop(columns=['Type of property', 'Type of property']).reset_index(drop=True), encoded_data.reset_index(drop=True)], axis=1)

In [684]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9393 entries, 0 to 9392
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Postal code                 9393 non-null   int64  
 1   Construction year           9393 non-null   float64
 2   Number of rooms             9393 non-null   object 
 3   Living area                 9393 non-null   float64
 4   kitchen                     9393 non-null   object 
 5   Primary energy consumption  9393 non-null   float64
 6   Heating type                9393 non-null   object 
 7   Double glazing              9393 non-null   object 
 8   State_encoded               9393 non-null   float64
 9   Heating type_Electric       9393 non-null   float64
 10  Heating type_Fuel oil       9393 non-null   float64
 11  Heating type_Gas            9393 non-null   float64
 12  Heating type_Pellet         9393 non-null   float64
 13  Heating type_Solar          9393 

In [685]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0)