In [14]:
import sqlite3
import pandas as pd
import seaborn as sns
import matplotlib as plt
import sklearn

# Data Exploration

In [8]:
# Read sqlite query results into a pandas DataFrame
con = sqlite3.connect(r"C:\Users\maxso\PycharmProjects\ApartmentScraper\booli\scrapy_apartments.db")
df = pd.read_sql_query("SELECT * from apartment", con)

# Verify that result of SQL query is stored in the dataframe
print(df.head().T)

                                 0                 1  \
id                 /annons/3848254    /bostad/280424   
type                      Lägenhet          Lägenhet   
city                       Uppsala       Helsingborg   
street                   Torngatan  Lilla Bergaliden   
street_num                      24                 1   
district              Kapellgärdet           Centrum   
postal_code                  75423             25223   
latitude                   59.8664           56.0449   
longitude                  17.6387           12.7005   
rooms                            2                 2   
square_meters                   55                74   
floor                            1                 2   
housing_society       BRF Tornet 1    BRF Minerva 16   
construction_year             2011              1939   
fee                           3126              5028   
price                    2.095e+06         2.175e+06   

                                               

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3518 entries, 0 to 3517
Data columns (total 16 columns):
id                   3518 non-null object
type                 3518 non-null object
city                 3518 non-null object
street               3425 non-null object
street_num           3425 non-null object
district             3518 non-null object
postal_code          3518 non-null int64
latitude             3518 non-null float64
longitude            3518 non-null float64
rooms                3518 non-null float64
square_meters        3518 non-null int64
floor                3518 non-null int64
housing_society      3002 non-null object
construction_year    3020 non-null float64
fee                  3130 non-null float64
price                3233 non-null float64
dtypes: float64(6), int64(3), object(7)
memory usage: 439.8+ KB


In [12]:
print(df.describe())

        postal_code     latitude    longitude        rooms  square_meters  \
count   3518.000000  3518.000000  3518.000000  3518.000000    3518.000000   
mean   36197.940875    58.433515    15.772856     2.675668      70.680785   
std    24613.566957     1.479407     2.438177     1.497836      42.634272   
min    11121.000000    55.510615    11.710321     1.000000       6.000000   
25%    12677.000000    57.699828    12.987134     2.000000      43.000000   
50%    23842.000000    59.287469    17.435687     2.000000      61.000000   
75%    58727.000000    59.357329    18.000705     3.000000      86.000000   
max    75758.000000    60.129254    18.391504    18.000000     430.000000   

             floor  construction_year           fee         price  
count  3518.000000        3020.000000   3130.000000  3.233000e+03  
mean      2.555429        1972.193377   3536.394888  3.247240e+06  
std       3.146444          39.235925   1427.639038  3.179767e+06  
min       0.000000        1722.000

In [15]:
df.isnull().sum()

id                     0
type                   0
city                   0
street                93
street_num            93
district               0
postal_code            0
latitude               0
longitude              0
rooms                  0
square_meters          0
floor                  0
housing_society      516
construction_year    498
fee                  388
price                285
dtype: int64

In [7]:
df.isnull().sum()/df.count() * 100

id                    0.000000
type                  0.000000
city                  0.000000
street                2.715328
street_num            2.715328
district              0.000000
postal_code           0.000000
latitude              0.000000
longitude             0.000000
rooms                 0.000000
square_meters         0.000000
floor                 0.000000
housing_society      17.188541
construction_year    16.490066
fee                  12.396166
price                 8.815342
dtype: float64

we can see that around 8%, or 285 rows hass missing values for the target variable. We can take these aside at a later point and, given that we are successful in creating an adequate model, use this model to predict these values. These rows can then be used to re-train a new model on. 

We can also see that almost 3% of *street* and *street_num* are missing. However, since *postal_code* technically contains roughly the same information, this will probably not be cause of any concern. We might even consider dropping these rows entirely.

Around 17% of the *housing_society* values are missing. This is what we will focus on exploring next.

## Housing Society exploration

In [23]:
df[df.housing_society.isnull()].head(50)

Unnamed: 0,id,type,city,street,street_num,district,postal_code,latitude,longitude,rooms,square_meters,floor,housing_society,construction_year,fee,price
6,/bostad/1846982,Villa,Helsingborg,Rusthållsgatan,5A,Miatorp,25231,56.015764,12.725562,5.0,112,0,,1985.0,,3295000.0
7,/bostad/2247046,Villa,Helsingborg,Grännagatan,7,Fältabacken,25250,56.039595,12.727825,4.0,95,0,,1924.0,,2995000.0
10,/bostad/2242476,Villa,Helsingborg,Alvägen,13,Laröd,25482,56.093354,12.647752,8.0,161,0,,1965.0,,7475000.0
33,/annons/3887215,Villa,Västerås,Fältspatsvägen,1,Munga,72594,59.742667,16.526091,2.0,55,0,,1973.0,,1995000.0
36,/bostad/2073765,Villa,Västerås,Malörtsvägen,16,Örtagården,72591,59.564192,16.465578,6.0,170,0,,2003.0,,5850000.0
37,/annons/3891158,Parhus,Helsingborg,Rååvägen,60B,Råå,25270,56.001603,12.737404,5.0,177,0,,2020.0,,6495000.0
51,/bostad/2815284,Villa,Helsingborg,Långgatan,17,Alla Bilder,25270,55.99796,12.739719,3.0,80,0,,,,3475000.0
52,/annons/3877132,Villa,Helsingborg,Lövstigen,1,Miatorp,25241,56.02371,12.711099,7.0,167,0,,1923.0,,3995000.0
53,/annons/3877528,Villa,Helsingborg,Färjemansgatan,32,Tågaborg,25440,56.06221,12.690828,9.0,250,0,,1923.0,,9200000.0
68,/bostad/2805939,Villa,Helsingborg,Brännerigatan,4A,Ödåkra,25474,56.105326,12.744431,10.0,297,0,,1929.0,,4495000.0


It looks like it's mostly different kinds of houses such as villas and terrace houses that are missing *house_society*, which is not very surprising. It is mainly *Lägenheter*/apartments that are tied to housing societies.

In [24]:
df.groupby("type").count()

Unnamed: 0_level_0,id,city,street,street_num,district,postal_code,latitude,longitude,rooms,square_meters,floor,housing_society,construction_year,fee,price
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Fritidshus,67,67,65,65,67,67,67,67,67,67,67,1,50,1,67
Gård,6,6,5,5,6,6,6,6,6,6,6,0,6,0,5
Kedjehus,13,13,12,12,13,13,13,13,13,13,13,2,13,3,12
Lägenhet,3081,3081,3015,3015,3081,3081,3081,3081,3081,3081,3081,2949,2639,3057,2811
Parhus,40,40,36,36,40,40,40,40,40,40,40,18,34,24,37
Radhus,81,81,77,77,81,81,81,81,81,81,81,31,76,43,80
Villa,230,230,215,215,230,230,230,230,230,230,230,1,202,2,221


In [39]:
#num of null-values per type
df.housing_society.isnull().groupby(df["type"]).sum()

type
Fritidshus     66.0
Gård            6.0
Kedjehus       11.0
Lägenhet      132.0
Parhus         22.0
Radhus         50.0
Villa         229.0
Name: housing_society, dtype: float64

In [46]:
#percentage of null-values per type
df.housing_society.isnull().groupby(df["type"]).mean() * 100

type
Fritidshus     98.507463
Gård          100.000000
Kedjehus       84.615385
Lägenhet        4.284323
Parhus         55.000000
Radhus         61.728395
Villa          99.565217
Name: housing_society, dtype: float64

In [79]:
df[(df["type"] == 'Lägenhet') & (df["housing_society"].isnull())]

Unnamed: 0,id,type,city,street,street_num,district,postal_code,latitude,longitude,rooms,square_meters,floor,housing_society,construction_year,fee,price
93,/annons/3645502,Lägenhet,Västerås,Notuddsallén,7,Öster Mälarstrand,72358,59.605221,16.570524,3.0,88,2,,2018.0,4840.0,3144000.0
107,/annons/3782832,Lägenhet,Västerås,Poseidongatan,17,Öster Mälarstrand,72358,59.604871,16.571485,2.0,57,6,,2020.0,3699.0,2637000.0
201,/annons/3734183,Lägenhet,Västerås,Stora gatan,1D,Centrum,72212,59.611443,16.554942,4.0,128,3,,2008.0,7730.0,2795000.0
295,/annons/3893744,Lägenhet,Västerås,Välplanerad Etta - Stentorpsgatan,24A,Malmaberg,72343,59.624435,16.591758,1.0,26,1,,1956.0,1911.0,595000.0
334,/annons/3785800,Lägenhet,Uppsala,,,Kvarngärdet,75425,59.873642,17.640564,2.0,53,1,,2022.0,3350.0,1895000.0
464,/annons/3851345,Lägenhet,Uppsala,Wivalliusgatan,47,Nyby,75442,59.886590,17.656200,4.0,87,1,,,3920.0,1895000.0
474,/bostad/224045,Lägenhet,Uppsala,Petterslundsgatan,13,Fålhagen,75328,59.864015,17.654254,4.0,93,3,,1963.0,3961.0,
478,/annons/3846058,Lägenhet,Uppsala,Åskmolnsvägen,93,Storvreta,74335,59.950016,17.695329,3.0,77,1,,1992.0,5581.0,
479,/annons/3846064,Lägenhet,Uppsala,,,Kåbo,75645,59.835191,17.632985,2.0,60,4,,,3458.0,2200000.0
480,/annons/3846073,Lägenhet,Uppsala,,,Kåbo,75645,59.835191,17.632985,2.0,57,4,,,3458.0,2150000.0


We can see that all types except apartments have fairly high percentages of missing values.

In [81]:
df_apartments_without_hs = df[(df["type"] == 'Lägenhet') & (df["housing_society"].isnull())]
df_apartments_with_hs = df[(df["type"] == 'Lägenhet') & (df["housing_society"].notnull())]

result = pd.merge(df_apartments_without_hs,
                 df_apartments_with_hs[['housing_society', 'postal_code']],
                 on='postal_code')

In [7]:
con.close()