In [142]:
import numpy as np
import pandas as pd

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#show every column
pd.set_option('display.max_columns', None)

## Joining the data

First of all, since we have three datasets, let's join them all into one big dataset.

In [143]:
data_houses = pd.read_csv("dataset_houses.csv")
data_apartments = pd.read_csv("dataset_apartments.csv")
data_sharehouses = pd.read_csv("dataset_sharehouses.csv")

data = pd.concat([data_houses, data_apartments, data_sharehouses])

## Exploratory analysis

We will now do a quick exploratory data analysis.

In [144]:
# Describe the numeric attributes.
data.describe()

Unnamed: 0,Rooms,Bedrooms,Number of floors,Built in year,Register number
count,7597.0,1663.0,2663.0,6622.0,890.0
mean,3.350401,3.265183,1.885468,1986.282845,8544396.0
std,2.342659,1.794841,0.902053,61.279443,17323350.0
min,1.0,1.0,1.0,1.0,16.0
25%,2.0,3.0,1.0,1969.0,1630701.0
50%,3.0,3.0,2.0,1995.0,3047403.0
75%,4.0,4.0,2.0,2021.0,9874400.0
max,47.0,33.0,19.0,2105.0,120842600.0


In [145]:
data.describe(include=[object])

Unnamed: 0,Title,Description,Link,Location,Total area,Condition,Readiness,Ownership,Energy mark,Data from realestate book,Notify about incorrect advertisement,Lisainfo,Kitchen,Sanitary arrangements,Heating and ventilation,Communications and security,This floor/Number of floors,Neighbourhood,Ground area,Kulud suvel/talvel,Cadastre no.,Additional information
count,7988,7988,7988,7816,7837,7068,1186,6993,5949,4498,3791,6330,5052,6149,6357,5373,5083,4332,2102,471,4165,709
unique,6570,2635,7988,5663,2038,8,5,7,10,1,1,4859,423,1319,614,605,168,1737,1577,242,2994,596
top,"Apartment for sale, 2 rooms, Järve 2, Kristiin...","Apartment ownership, stone house",https://www.kv.ee/muua-tapa-linnas-renoveerimi...,"59.4019286,24.7271115",120 m²,all brand-new,ready,apartment ownership,Missing,Data from realestate book,Notify about incorrect advertisement,"ventilation, parquet, lift, parking free parking",open kitchen,shower,central heating,frontdoor locked,1/2,roads paved roads,600 m²,60 € / 120 €,79301:001:0984,"underground garage, box-room, new electricity,..."
freq,31,2203,1,60,51,2499,1047,4753,1697,4498,3791,82,745,412,1983,464,477,272,24,16,36,20


In [146]:
data.columns

Index(['Title', 'Description', 'Link', 'Location', 'Rooms', 'Bedrooms',
       'Total area', 'Number of floors', 'Built in year', 'Condition',
       'Readiness', 'Ownership', 'Energy mark', 'Data from realestate book',
       'Notify about incorrect advertisement', 'Lisainfo', 'Kitchen',
       'Sanitary arrangements', 'Heating and ventilation',
       'Communications and security', 'This floor/Number of floors',
       'Neighbourhood', 'Ground area', 'Kulud suvel/talvel', 'Cadastre no.',
       'Register number', 'Additional information'],
      dtype='object')

## Cleaning the data

First, drop the columns which are useless for predicting the prices. Apart from the obvious ones, drop also the summer/winter costs because there are so few samples.

In [147]:
data.drop(columns=["Title", "Description", "Link", "Cadastre no.", "Data from realestate book", "Notify about incorrect advertisement", "Kulud suvel/talvel"], inplace=True)

data.describe(include=[object])
data.columns

Unnamed: 0,Location,Total area,Condition,Readiness,Ownership,Energy mark,Lisainfo,Kitchen,Sanitary arrangements,Heating and ventilation,Communications and security,This floor/Number of floors,Neighbourhood,Ground area,Additional information
count,7816,7837,7068,1186,6993,5949,6330,5052,6149,6357,5373,5083,4332,2102,709
unique,5663,2038,8,5,7,10,4859,423,1319,614,605,168,1737,1577,596
top,"59.4019286,24.7271115",120 m²,all brand-new,ready,apartment ownership,Missing,"ventilation, parquet, lift, parking free parking",open kitchen,shower,central heating,frontdoor locked,1/2,roads paved roads,600 m²,"underground garage, box-room, new electricity,..."
freq,60,51,2499,1047,4753,1697,82,745,412,1983,464,477,272,24,20


Index(['Location', 'Rooms', 'Bedrooms', 'Total area', 'Number of floors',
       'Built in year', 'Condition', 'Readiness', 'Ownership', 'Energy mark',
       'Lisainfo', 'Kitchen', 'Sanitary arrangements',
       'Heating and ventilation', 'Communications and security',
       'This floor/Number of floors', 'Neighbourhood', 'Ground area',
       'Register number', 'Additional information'],
      dtype='object')

## Simple data type conversions

Remove the m2 from the area field to convert them to numeric.

In [148]:
data["Total area"] = data['Total area'].astype('str').map(lambda x: x.split()[0]).astype('float64') #remove m2
data["Ground area"] = data['Ground area'].astype('str').map(lambda x: x.split()[0]).astype('float64') #remove m2

Convert the "Energy mark" into a number.

In [149]:
def remap_column(data, column, mapping):
    data[column] = data[column].map(mapping)

In [150]:
data["Energy mark"].unique()
# data["Energy mark"] = data["Energy mark"].map({'C': 2, 'B': 1, np.nan: 0, '-': 0, 'Missing': 0, 'A': 0, 'H': 7, 'E': 4, 'D': 3, 'G': 6, 'F': 5})
remap_column(data, "Energy mark", {'C': 2, 'B': 1, np.nan: 0, '-': 0, 'Missing': 0, 'A': 0, 'H': 7, 'E': 4, 'D': 3, 'G': 6, 'F': 5})
data["Energy mark"].unique()

array(['C', 'B', nan, '-', 'Missing', 'A', 'H', 'E', 'D', 'G', 'F'],
      dtype=object)

array([2, 1, 0, 7, 4, 3, 6, 5], dtype=int64)

In [151]:
data["Condition"].unique()
remap_column(data, "Condition", {'all brand-new': 6, 'Good condition': 5, 'ready': 5, 'needs renovating': 0, 'satisfactory': 1, np.nan: 1, 'sanitary renovation needed': 2, 'renovated': 4,'sanitary renovation done': 3})
data["Condition"].unique()

array(['all brand-new', 'Good condition', 'ready', 'needs renovating',
       'satisfactory', nan, 'sanitary renovation needed', 'renovated',
       'sanitary renovation done'], dtype=object)

array([6, 5, 0, 1, 2, 4, 3], dtype=int64)

In [152]:
data["Readiness"].unique()
remap_column(data, "Readiness", {'ready': 4, 'roofed box': 2, np.nan: 4, 'box with doors and windows': 3, 'roofless box': 1, 'foundation': 0})
data["Readiness"].unique()

array(['ready', 'roofed box', nan, 'box with doors and windows',
       'roofless box', 'foundation'], dtype=object)

array([4, 2, 3, 1, 0], dtype=int64)

In [153]:
data["Ownership"].unique()

array(['private property', nan, 'apartment association',
       'joint ownership', 'apartment ownership', 'movable',
       'building lease', 'logical part'], dtype=object)

## Separating features

First of all, try to find which columns contain many different features.

Before converting columns to multiple features, it might be useful to assign reasonable values to NaNs in those columns!

In [154]:
# Create a function to assist in removing invalid values.
def fill_na_with_mode(data, column):
    return data[column].fillna(value=data[column].mode().values[0], inplace=True)

def fill_na_with_mean(data, column):
    return data[column].fillna(value=data[column].mean(), inplace=True)

In [155]:
fill_na_with_mode(data, "Ownership")
data = pd.get_dummies(data, columns=["Ownership"])

Now, some more complex feature extraction.

First, what are the unique feature values in the column "Lisainfo".

In [156]:
def find_unique_features(dataset, column):
    unique_features = set()

    for entry in dataset[column].unique():
        if not pd.isna(entry):
            features = entry.strip().split(", ")
            #print(features)
            unique_features.update(features)

    return unique_features

unique_features = find_unique_features(data, "Lisainfo")
unique_features

{'3*380V',
 'TV-set',
 'balcony',
 'balcony ',
 'balcony 0.5 m²',
 'balcony 0.9 m²',
 'balcony 1 m²',
 'balcony 1.4 m²',
 'balcony 1.5 m²',
 'balcony 1.6 m²',
 'balcony 1.8 m²',
 'balcony 10 m²',
 'balcony 10.0 m²',
 'balcony 10.1 m²',
 'balcony 10.2 m²',
 'balcony 10.3 m²',
 'balcony 10.4 m²',
 'balcony 10.5 m²',
 'balcony 10.6 m²',
 'balcony 10.7 m²',
 'balcony 10.8 m²',
 'balcony 10.9 m²',
 'balcony 11 m²',
 'balcony 11.2 m²',
 'balcony 11.4 m²',
 'balcony 11.5 m²',
 'balcony 11.6 m²',
 'balcony 11.7 m²',
 'balcony 11.8 m²',
 'balcony 11.9 m²',
 'balcony 12 m²',
 'balcony 12.1 m²',
 'balcony 12.2 m²',
 'balcony 12.3 m²',
 'balcony 12.4 m²',
 'balcony 12.5 m²',
 'balcony 12.6 m²',
 'balcony 12.7 m²',
 'balcony 12.8 m²',
 'balcony 12.9 m²',
 'balcony 13 m²',
 'balcony 13.1 m²',
 'balcony 13.2 m²',
 'balcony 13.3 m²',
 'balcony 13.4 m²',
 'balcony 13.6 m²',
 'balcony 13.8 m²',
 'balcony 13.9 m²',
 'balcony 14 m²',
 'balcony 14.1 m²',
 'balcony 14.2 m²',
 'balcony 14.4 m²',
 'balcony 14

Extract numeric features from "Lisainfo"

In [157]:
data_balcony = []
data_balcony_size = []

for i in data["Lisainfo"].values:
    balcony = 0 #Base values
    balcony_size = 0.0
    
    if not pd.isna(i):
        features = i.strip().split(", ")
        
        for feature in features:
            if feature.find("balcony") == 0:
                balcony = 1
                balcony_el = feature.strip().split()
                if len(balcony_el) > 1:
                    balcony_size = float(balcony_el[1])
                    
    data_balcony.append(balcony)
    data_balcony_size.append(balcony_size)

data["Balcony"] = data_balcony
data["Balcony size"] = data_balcony_size

In [158]:
#Probably don't care about that feature
data_dist_from_Tallinn = []

for i in data["Lisainfo"].values:
    dist_from_Tallinn = 0.0
    
    if not pd.isna(i):
        features = i.strip().split(", ")
        
        for feature in features:
            if feature.find("distance from Tallinn") != -1:
                dist_from_Tallinn = float(feature.strip().split()[3])
    data_dist_from_Tallinn.append(dist_from_Tallinn)

data["Distance from Tallinn"] = data_dist_from_Tallinn

Extract categorical features from "Lisainfo".

In [159]:
data_addition = []

for i in data["Lisainfo"].values:
    current = np.nan

    if not pd.isna(i):
        features = i.strip().split(", ")

        for feature in features:
            if feature.find("parking") != -1:
                current = ' '.join(feature.strip().split()[1:-1])
    data_addition.append(current)

data["Parking"] = data_addition

In [160]:
data_addition = []

for i in data["Lisainfo"].values:
    current = np.nan

    if not pd.isna(i):
        features = i.strip().split(", ")

        for feature in features:
            if feature.find("wall") != -1:
                current = ' '.join(feature.strip().split()[1:])
    data_addition.append(current)

data["Wall"] = data_addition

In [161]:
data_addition = []

for i in data["Lisainfo"].values:
    current = np.nan

    if not pd.isna(i):
        features = i.strip().split(", ")

        for feature in features:
            if feature.find("roof") != -1:
                current = ' '.join(feature.strip().split()[1:-1])
    data_addition.append(current)

data["Roof"] = data_addition

Remove the now separated features from 'unique features' "Lisainfo"

In [162]:
filtered_unique_features = [ x for x in unique_features if "balcony" not in x and 
                                                            "distance from Tallinn" not in x and 
                                                            "parking" not in x and
                                                            "wall" not in x and 
                                                            "roof" not in x ]

Separate the remaining features

In [163]:
def separate(dataset, uniques, column):
    for unique_feature in uniques:
        data_addition = []

        for i in data[column].values:
            current = False

            if not pd.isna(i):
                features = i.strip().split(", ")

                for feature in features:
                    if feature.find(unique_feature) != -1:
                        current = True
            data_addition.append(current)

        dataset[unique_feature] = data_addition
        
separate(data, filtered_unique_features, "Lisainfo")
data.drop(columns=["Lisainfo"], inplace=True)

Find unique features in "Kitchen"

In [164]:
unique_features = find_unique_features(data, "Kitchen")
unique_features

{'ceramic shove',
 'electric stove',
 'gas stove',
 'induktsioon pliit',
 'kitchen ',
 'kitchen 0.0 m²',
 'kitchen 1 m²',
 'kitchen 10 m²',
 'kitchen 10.0 m²',
 'kitchen 10.2 m²',
 'kitchen 10.4 m²',
 'kitchen 10.6 m²',
 'kitchen 10.7 m²',
 'kitchen 10.8 m²',
 'kitchen 10.9 m²',
 'kitchen 11 m²',
 'kitchen 11.3 m²',
 'kitchen 11.5 m²',
 'kitchen 11.7 m²',
 'kitchen 11.8 m²',
 'kitchen 12 m²',
 'kitchen 12.2 m²',
 'kitchen 12.3 m²',
 'kitchen 12.5 m²',
 'kitchen 12.7 m²',
 'kitchen 12.8 m²',
 'kitchen 12.9 m²',
 'kitchen 13 m²',
 'kitchen 13.4 m²',
 'kitchen 13.7 m²',
 'kitchen 137 m²',
 'kitchen 14 m²',
 'kitchen 14.1 m²',
 'kitchen 14.5 m²',
 'kitchen 14.6 m²',
 'kitchen 141.4 m²',
 'kitchen 15 m²',
 'kitchen 15.1 m²',
 'kitchen 15.5 m²',
 'kitchen 15.6 m²',
 'kitchen 15.8 m²',
 'kitchen 16 m²',
 'kitchen 16.0 m²',
 'kitchen 16.3 m²',
 'kitchen 16.7 m²',
 'kitchen 17 m²',
 'kitchen 17.7 m²',
 'kitchen 17.8 m²',
 'kitchen 18 m²',
 'kitchen 18.1 m²',
 'kitchen 18.5 m²',
 'kitchen 19 m²'

Extract numeric features from "Kitchen"

In [165]:
#Probably don't care about that feature, too few entries
data_kitchen_size = []

for i in data["Kitchen"].values:
    kitchen_size = np.nan
    
    if not pd.isna(i):
        features = i.strip().split(", ")
        
        for feature in features:
            if feature.find("m²") != -1:
                kitchen_size = float(feature.strip().split()[1])
    data_kitchen_size.append(kitchen_size)

data["Kitchen size"] = data_kitchen_size

Extract categorical features from "Kitchen"

In [166]:
data_addition = []

for i in data["Kitchen"].values:
    current = np.nan

    if not pd.isna(i):
        features = i.strip().split(", ")

        for feature in features:
            if feature.find("heated with firewood") != -1:
                current = ' '.join(feature.strip().split()[1:])
            elif feature.find("shove") != -1 or feature.find("stove") != -1 or feature.find("pliit") != -1:
                current = feature.strip().split()[0]
            
    data_addition.append(current)

data["Stove type"] = data_addition

Separate the remaining features

In [167]:
separate(data, {'kitchen furniture','open kitchen','refridgerator'}, "Kitchen")
data.drop(columns=["Kitchen"], inplace=True)

Find unique features in "Sanitary arrangements"

In [168]:
unique_features = find_unique_features(data, "Sanitary arrangements")
unique_features

{'bath',
 'local water',
 'new sewerage',
 'pool',
 'sauna',
 'sewerage',
 'shower',
 'toilet room and bathroom separate',
 'urban water',
 'washing machine',
 'water boiler'}

Extract features from "Sanitary arrangements"

In [169]:
separate(data, unique_features, "Sanitary arrangements")
data.drop(columns=["Sanitary arrangements"], inplace=True)

Find unique features in "Heating and ventilation"

In [170]:
unique_features = find_unique_features(data, "Heating and ventilation")
unique_features

{'central heating',
 'combined heating',
 'conditioner',
 'electric heating',
 'fireplace',
 'gas',
 'gas heating',
 'geothermic heating',
 'heated floors',
 'liquid fuel',
 'solid fuel',
 'stove heating',
 'õhksoojuspump'}

Extract categorical features from "Heating and ventilation"

In [171]:
data_addition = []

for i in data["Heating and ventilation"].values:
    current = np.nan

    if not pd.isna(i):
        features = i.strip().split(", ")

        for feature in features:
            if feature.find("heating") != -1:
                current = feature.strip().split()[0]
            elif feature.find("õhksoojuspump") != -1:
                current = "Air source heat pump"
    data_addition.append(current)

data["Heating type"] = data_addition

Remove the now separated features from 'unique features'

In [172]:
filtered_unique_features = [ x for x in unique_features if "heating" not in x and 
                                                            "õhksoojuspump" not in x ]

Separate the remaining features

In [173]:
separate(data, filtered_unique_features, "Heating and ventilation")
data.drop(columns=["Heating and ventilation"], inplace=True)

Find unique features in "Communications and security"

In [174]:
unique_features = find_unique_features(data, "Communications and security")
unique_features

{'Internet',
 'cable TV',
 'fenced with garden',
 'frontdoor locked',
 'neighbourhood watch',
 'phone',
 'secure guard',
 'security system',
 'steel door',
 'video cameras'}

Extract features from "Communications and security"

In [175]:
separate(data, unique_features, "Communications and security")
data.drop(columns=["Communications and security"], inplace=True)

Find unique features in "Neighbourhood"

In [176]:
unique_features = find_unique_features(data, "Neighbourhood")
unique_features

{'200 sea',
 '3 km sea',
 'Abja paisjärv lake ',
 'Abja paisjärv lake 1,000 m',
 'Abja paisjärv lake 300 m',
 'Ahja lake 600 m',
 'Ahja river ',
 'Alesti järv lake 11 km',
 'Alesti järv lake 8 km',
 'Alesti lake 8 km',
 'Amme jõgi river 250 m',
 'Amme jõgi river 350 m',
 'Amme river 300 m',
 'Arbi ja Verevi järv lake 400 m',
 'Arbi järv lake 1,000 m',
 'Arbi järv lake 700 m',
 'Arbi lake 1,000 m',
 'Aseri sea 5 km',
 'Audru river 1.4 km',
 'Audru river 400 m',
 'Avijõgi river',
 'Avijõgi river 300 m',
 'Balti meri sea 2 m',
 'Balti meri sea 200 m',
 'Balti meri sea 450 m',
 'Baltika sea ',
 'Elva jõgi',
 'Elva jõgi river 1,000 m',
 'Emajõgi',
 'Emajõgi Emajõe river 1 m',
 'Emajõgi river',
 'Emajõgi river ',
 'Emajõgi river 1 m',
 'Emajõgi river 1,000 m',
 'Emajõgi river 1.2 km',
 'Emajõgi river 100 m',
 'Emajõgi river 140 m',
 'Emajõgi river 180 m',
 'Emajõgi river 20 m',
 'Emajõgi river 200 m',
 'Emajõgi river 300 m',
 'Emajõgi river 350 m',
 'Emajõgi river 50 m',
 'Emajõgi river 500 

In [177]:
filtered_unique_features = [ x for x in unique_features if "sea" not in x and 
                                                            "meri" not in x and
                                                            "river" not in x and
                                                            "lake" not in x and
                                                            "tiik" not in x and
                                                            "Tiik" not in x and
                                                            "Oja" not in x and
                                                            "oja" not in x and
                                                            "karjäär" not in x and
                                                            "карьер" not in x and
                                                            "water body" not in x and
                                                            "beach length" not in x and
                                                            "laht" not in x and
                                                            "rand" not in x and
                                                            "jõgi" not in x and
                                                            "järv" not in x and
                                                            "море" not in x ]
filtered_unique_features

['surrounding buildings private houses',
 'roads in satisfactory condition roads',
 'location outside settlement location',
 'surrounding buildings no buildings',
 'location in suburb location',
 'neighbours next to neighbours',
 'surrounding buildings warehouses and production buildings',
 'neighbours at one side neighbours',
 'roads gravel roads',
 'surrounding buildings private houses and apartment buildings',
 'neighbours farther neighbours',
 'neighbours around neighbours',
 'roads paved roads',
 'in the city location',
 'roads in good condition roads',
 'surrounding buildings apartment buildings',
 'near forest',
 'surrounding buildings commercial buildings',
 'roads in bad condition roads',
 'location in the center location']

Extract features from "Neighbourhood

In [178]:
data_addition = []

for i in data["Neighbourhood"].values:
    current = np.nan

    if not pd.isna(i):
        features = i.strip().split(", ")

        for feature in features:
            if feature.find("condition") != -1:
                current = feature.strip().split()[2]
    data_addition.append(current)

data["Road condition"] = data_addition

In [179]:
data_addition = []

for i in data["Neighbourhood"].values:
    current = np.nan

    if not pd.isna(i):
        features = i.strip().split(", ")

        for feature in features:
            if feature.find("condition") == -1:
                if feature.find("roads") != -1:
                    current = feature.strip().split()[1]
    data_addition.append(current)

data["Road type"] = data_addition

In [180]:
data_addition = []

for i in data["Neighbourhood"].values:
    current = np.nan

    if not pd.isna(i):
        features = i.strip().split(", ")

        for feature in features:
            if feature.find("surrounding") != -1:
                current = ' '.join(feature.strip().split()[2:])
    data_addition.append(current)

data["Surrounding buildings"] = data_addition

In [181]:
data_addition = []

for i in data["Neighbourhood"].values:
    current = np.nan

    if not pd.isna(i):
        features = i.strip().split(", ")

        for feature in features:
            if feature.find("location") != -1:
                if feature.find("city") != -1:
                    current = ' '.join(feature.strip().split()[0:-1])
                else:
                    current = ' '.join(feature.strip().split()[1:-1])
    data_addition.append(current)

data["Location type"] = data_addition

In [186]:
data_addition = []

for i in data["Neighbourhood"].values:
    current = np.nan

    if not pd.isna(i):
        features = i.strip().split(", ")

        for feature in features:
            if feature.find("neighbours") != -1:
                current = ' '.join(feature.strip().split()[1:])
                
    data_addition.append(current)

data["Nearby neighbours"] = data_addition

In [187]:
def separate_synonyms(dataset, uniques, column):
    for unique_features in uniques:
        data_addition = []

        for i in data[column].values:
            current = False

            if not pd.isna(i):
                features = i.strip().split(", ")

                for feature in features:
                    for unique_feature in unique_features:
                        if feature.find(unique_feature) != -1:
                            current = True
            data_addition.append(current)
        dataset[unique_features[0]] = data_addition
    

separate_synonyms(data, 
                  [['forest'], ['sea', 'meri', 'laht', 'море', 'rand'], 
                   ['river', 'jõgi'], ['lake', 'järv'], 
                   ['stream', 'oja', 'Oja'], 
                   ['pond', 'tiik', 'Tiik']], 
                  "Neighbourhood")

data.drop(columns=["Neighbourhood"], inplace=True)

In [188]:
data

Unnamed: 0,Location,Rooms,Bedrooms,Total area,Number of floors,Built in year,Condition,Readiness,Energy mark,This floor/Number of floors,Ground area,Register number,Additional information,Ownership_apartment association,Ownership_apartment ownership,Ownership_building lease,Ownership_joint ownership,Ownership_logical part,Ownership_movable,Ownership_private property,Balcony,Balcony size,Distance from Tallinn,Parking,Wall,Roof,basement,show case windows,dressing room,frontdoor locked,lift,trestle,garret,liquid fuel,TV-set,well,underground garage,gas,water,parquet,urban water,organisation through the few floors,deep well,local water,fenced with garden,sewerage,furniture,box-room,farm building,goods lift,electricity,cloak room,separate entryway,closed courtyard,new electricity,open kitchen,separate rooms,furnishing possibility,3*380V,public transport,street entrance,high ceilings,packet windows,near forest,pool,ventilation,new sewerage,garage,terrace,kitchen,Kitchen size,Stove type,refridgerator,kitchen furniture,sauna,toilet room and bathroom separate,shower,bath,washing machine,water boiler,Heating type,fireplace,heated floors,conditioner,solid fuel,secure guard,phone,Internet,steel door,video cameras,neighbourhood watch,cable TV,security system,Road condition,Road type,Surrounding buildings,Location type,forest,sea,river,lake,stream,pond,Nearby neighbours
0,"59.3677577,24.5949667",7.0,6.0,209.1,2.0,2020.0,6,4,2,,,,,0,0,0,0,0,0,1,0,0.0,0.0,free,cupboard,,False,False,True,False,False,False,False,False,True,False,False,False,False,True,True,False,False,False,True,False,True,True,False,False,True,False,False,False,False,False,False,True,False,True,False,False,True,False,False,False,False,True,True,False,,electric,True,True,False,False,True,True,False,False,geothermic,False,False,False,False,False,True,True,False,False,True,True,False,,,,,False,False,False,False,False,False,
1,"59.4689483,24.9601375",4.0,3.0,171.2,,2022.0,6,2,1,1/1,,,,0,0,0,0,0,0,1,0,0.0,0.0,free,,,False,False,True,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,True,False,,,False,False,True,True,True,True,False,True,Air source heat pump,True,True,True,False,False,False,True,True,False,False,False,False,good,,private houses and apartment buildings,,False,False,False,False,False,False,
2,"58.3754678,26.7311085",12.0,7.0,257.2,2.0,2017.0,5,4,0,,703.0,,,0,0,0,0,0,0,1,0,0.0,0.0,,cupboard,,False,False,True,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,True,True,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,,,True,True,True,True,True,True,True,True,Air source heat pump,False,True,False,False,False,True,True,False,False,False,False,False,good,,private houses and apartment buildings,in the city,False,False,True,False,False,False,
3,"58.3754678,26.7311085",6.0,3.0,121.1,2.0,2019.0,5,4,0,,,,,0,0,0,0,0,0,1,0,0.0,0.0,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,False,False,False,False,False,False,False,False,,False,False,False,False,False,False,False,False,False,False,False,False,,,,,False,False,False,False,False,False,
4,"58.3674254,24.5676331",4.0,3.0,158.9,2.0,2022.0,6,4,0,,1336.0,,,0,0,0,0,0,0,1,1,31.8,0.0,free,,,False,False,True,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,True,True,False,True,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,True,False,,,False,False,True,True,True,True,False,False,geothermic,False,True,False,False,False,False,True,False,False,True,True,True,good,,private houses,in suburb,True,False,True,False,False,False,next to neighbours
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
504,"58.3821146,24.5199999",10.0,,160.9,1.0,,4,4,0,,,,,1,0,0,0,0,0,0,0,0.0,0.0,,,tin,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,False,False,False,False,False,False,False,False,stove,False,False,False,False,False,False,False,False,False,False,False,False,,,,,False,False,False,False,False,False,
505,"58.3709809,26.7223744",4.0,3.0,114.4,2.0,2021.0,6,4,0,,,,,0,1,0,0,0,0,0,0,0.0,0.0,free,,,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,True,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,,,False,False,True,False,True,True,False,False,Air source heat pump,False,True,False,False,False,True,True,False,False,False,True,False,good,,,in the center,False,False,False,False,False,False,around neighbours
506,"57.8597455,26.9709845",6.0,,111.0,2.0,1959.0,1,4,0,,3186.0,,,0,1,0,0,0,0,0,0,0.0,0.0,free,cupboard,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,True,True,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,,,False,False,False,False,False,False,False,False,stove,False,False,False,False,False,False,False,False,False,False,False,False,good,,private houses,in suburb,False,False,False,False,False,False,
507,"59.2799412,24.6694508",5.0,4.0,159.5,1.0,2021.0,6,4,0,,,,,0,0,0,0,0,0,1,0,0.0,0.0,free,,,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,True,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,True,True,False,False,True,False,False,False,False,False,True,False,,,False,False,True,True,True,True,False,True,geothermic,False,True,False,False,False,False,False,False,False,False,False,False,,,private houses,,True,False,False,False,False,False,farther neighbours


## Dealing with NaN values

In [23]:
# data.dtypes

data["Ownership"].mode().values[0]

for column in data.columns:
    dtype = data[column].dtype

    if dtype == np.object_:
        print(data[column].mode())
        data[column].fillna(value=data[column].mode().values[0], inplace=True)
    else:
        data[column].fillna(value=data[column].mean(), inplace=True)

KeyError: 'Ownership'

In [None]:
data.describe(include=[np.object])

In [81]:
data.isna()

Unnamed: 0,Location,Rooms,Bedrooms,Total area,Number of floors,Built in year,Condition,Readiness,Ownership,Energy mark,...,Kitchen,Sanitary arrangements,Heating and ventilation,Communications and security,This floor/Number of floors,Neighbourhood,Ground area,Kulud suvel/talvel,Register number,Additional information
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
504,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
505,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
506,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
507,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
