# Technical Requirements

- Python (preferably 3.9)
- Jupyter
- Anaconda
- git (just for your convenience)
- Stable internet connection

# Anaconda Walkthrough

[![anaconda](media/anaconda.png)](https://www.anaconda.com/products/distribution)

# How to Get Started?

## Problem Definition

<img src="media/student.png"/>

### How to choose the right place to rent an apartment as a student in Poland?

![learning](media/learning.png)

## The Most Important... DATA

[![kaggle](media/kaggle.png)](https://www.kaggle.com/datasets/dawidcegielski/house-prices-in-poland)

In [1]:
import os
import json

cities = ['cracow', 'warsaw', 'poznan']

# Each city's table is stored as data/<city>.json.
for city in cities:
    with open(os.path.join('data', f'{city}.json'), 'r') as city_json:
        # Bind the parsed table to a module-level variable named after the
        # city (cracow, warsaw, poznan). Assigning through globals() creates
        # the same binding as exec() did, without executing string-built code.
        globals()[city] = json.load(city_json)

In [2]:
# Inspect the column names of one city's table; the table is a dict keyed by
# column name.
columns = warsaw.keys()
columns

dict_keys(['floor', 'price', 'rooms', 'sq', 'year'])

## Data Quality 

In [3]:
# Map each city name to its table. The tables were bound to module-level
# variables named after the cities, so look them up directly instead of
# evaluating the name as code.
full_data = {city: globals()[city] for city in cities}

def apply(method):
    """Call ``method`` once per city table in ``full_data``.

    The city name is printed before ``method`` runs, so anything ``method``
    prints appears under its city. Return values of ``method`` are discarded.
    """
    for city in full_data:
        print(city)
        method(full_data[city])

In [4]:
def count(data):
    """Print the number of values held by each column of ``data``.

    ``data`` maps column names to sized collections; one tab-indented
    ``column: length`` line is printed per column.
    """
    for column in data:
        print(f'\t{column}: {len(data[column])}')
        
apply(count)

cracow
	floor: 8787
	price: 8787
	rooms: 8787
	sq: 8787
	year: 8787
warsaw
	floor: 8865
	price: 8865
	rooms: 8865
	sq: 8865
	year: 8865
poznan
	floor: 3734
	price: 3734
	rooms: 3734
	sq: 3734
	year: 3734


In [5]:
from numpy import isnan

def nans(values):
    """Return how many entries of ``values`` are NaN according to ``isnan``."""
    return sum(map(isnan, values))

def nan_raport(relative=True):
    """Print, per city and per column, how many values are NaN.

    With ``relative=True`` (the default) the NaN count is divided by the
    column length and rounded to three decimals; otherwise the raw count is
    printed.
    """

    def nan_summary(data):
        for column, values in data.items():
            missing = nans(values)
            shown = round(missing / len(values), 3) if relative else missing
            print(f'\t{column}: {shown}')

    apply(nan_summary)

In [6]:
nan_raport()

cracow
	floor: 0.0
	price: 0.095
	rooms: 0.0
	sq: 0.0
	year: 0.0
warsaw
	floor: 0.0
	price: 0.094
	rooms: 0.0
	sq: 0.0
	year: 0.0
poznan
	floor: 0.0
	price: 0.098
	rooms: 0.0
	sq: 0.0
	year: 0.0


In [7]:
def negatives(values):
    """Return how many entries of ``values`` are strictly below zero."""
    return sum(value < 0 for value in values)

def sign_raport(relative=True):
    """Print, per city and per column, how many values are negative.

    With ``relative=True`` (the default) the count is divided by the column
    length and rounded to three decimals; otherwise the raw count is printed.
    """

    def negatives_summary(data):
        for column, values in data.items():
            below_zero = negatives(values)
            shown = round(below_zero / len(values), 3) if relative else below_zero
            print(f'\t{column}: {shown}')

    apply(negatives_summary)

In [8]:
sign_raport()

cracow
	floor: 0.0
	price: 0.0
	rooms: 0.0
	sq: 0.0
	year: 0.0
warsaw
	floor: 0.0
	price: 0.0
	rooms: 0.0
	sq: 0.0
	year: 0.0
poznan
	floor: 0.0
	price: 0.0
	rooms: 0.0
	sq: 0.0
	year: 0.0


In [9]:
def time_raport():
    """Print the smallest and largest value of the 'year' column for each city."""

    def year_range(data):
        years = data['year']
        print(f"\tmin year: {min(years)}\n\tmax year: {max(years)}")

    apply(year_range)

In [10]:
time_raport()

cracow
	min year: 200
	max year: 2023
warsaw
	min year: 75
	max year: 2980
poznan
	min year: 70
	max year: 2023


## Data Processing

In [11]:
# Drop every row whose price is NaN, in all columns of every city table.
# Assumes all columns of a table have the same length (the count report
# above prints equal lengths per city).
for city in cities:
    table = full_data[city]
    # Decide once per row, before any column is modified.
    keep_row = [not isnan(price) for price in table['price']]
    for values in table.values():
        # Slice assignment filters the existing list object in place, so every
        # other reference to the same list sees the cleaned data. It also
        # avoids repeated pops from the middle of the list.
        values[:] = [value for value, keep in zip(values, keep_row) if keep]

In [12]:
nan_raport()

cracow
	floor: 0.0
	price: 0.0
	rooms: 0.0
	sq: 0.0
	year: 0.0
warsaw
	floor: 0.0
	price: 0.0
	rooms: 0.0
	sq: 0.0
	year: 0.0
poznan
	floor: 0.0
	price: 0.0
	rooms: 0.0
	sq: 0.0
	year: 0.0


In [13]:
# Latest construction year accepted as valid; presumably the year the data
# was collected — TODO confirm.
LATEST_VALID_YEAR = 2022

# Drop every row whose year lies after LATEST_VALID_YEAR, in all columns.
# NOTE(review): implausibly small years (see the min values printed by
# time_raport) are not filtered here.
for city in cities:
    table = full_data[city]
    # Decide once per row, before any column is modified.
    keep_row = [not year > LATEST_VALID_YEAR for year in table['year']]
    for values in table.values():
        # In-place filter: keeps the same list objects and avoids repeated
        # pops from the middle of the list.
        values[:] = [value for value, keep in zip(values, keep_row) if keep]

In [14]:
time_raport()

cracow
	min year: 200
	max year: 2022
warsaw
	min year: 75
	max year: 2022
poznan
	min year: 70
	max year: 2022


## Get Insights

In [15]:
import statistics

In [16]:
def get_stats(*args):
    """Print the requested statistics for every column of every city.

    Args:
        *args: names of functions in the ``statistics`` module that take a
            sequence and return a number, e.g. ``'mean'``, ``'median'``,
            ``'stdev'``.

    Raises:
        AttributeError: if a name is not an attribute of ``statistics``.
    """

    def stats(data):
        for column, values in data.items():
            print(f'\t{column}')
            for stat in args:
                # Look the function up by name and call it on the real list.
                # Building source text from the whole column and eval()-ing it
                # breaks on values whose repr is not a literal (nan, inf) and
                # executes arbitrary text passed as a statistic name.
                statistic = getattr(statistics, stat)
                print(f'\t\t{stat}:', round(statistic(values), 3))

    apply(stats)

In [17]:
get_stats('mean', 'median', 'stdev')

cracow
	floor
		mean: 2.541
		median: 2.0
		stdev: 2.171
	price
		mean: 598044.333
		median: 499025.0
		stdev: 378678.702
	rooms
		mean: 2.599
		median: 3.0
		stdev: 0.996
	sq
		mean: 185.097
		median: 52.955
		stdev: 11323.933
	year
		mean: 2003.109
		median: 2020.0
		stdev: 42.546
warsaw
	floor
		mean: 3.284
		median: 3
		stdev: 2.815
	price
		mean: 781558.535
		median: 595000.0
		stdev: 706680.291
	rooms
		mean: 2.632
		median: 3
		stdev: 1.009
	sq
		mean: 63.108
		median: 54.39
		stdev: 104.903
	year
		mean: 1995.974
		median: 2008
		stdev: 42.18
poznan
	floor
		mean: 2.316
		median: 2.0
		stdev: 1.982
	price
		mean: 466891.429
		median: 429180.05
		stdev: 193247.907
	rooms
		mean: 2.659
		median: 3.0
		stdev: 0.98
	sq
		mean: 61.785
		median: 55.05
		stdev: 140.358
	year
		mean: 2003.285
		median: 2020.0
		stdev: 65.306


In [18]:
def quantiles(column, buckets):
    """Print the cut points splitting ``column`` into ``buckets`` groups, per city.

    Each printed line shows the cumulative percentage reached at the cut
    point and the cut point itself, both rounded to two decimals.
    """

    def quantiles_summary(data):
        cut_points = statistics.quantiles(data[column], n=buckets)
        for position, cut in enumerate(cut_points, start=1):
            print(f'\t{round(position / buckets * 100, 2)}%: {round(cut, 2)}')

    apply(quantiles_summary)

In [19]:
quantiles('price', 4)

cracow
	25.0%: 406870.5
	50.0%: 499025.0
	75.0%: 669000.0
warsaw
	25.0%: 468750.0
	50.0%: 595000.0
	75.0%: 820000.0
poznan
	25.0%: 346985.5
	50.0%: 429180.05
	75.0%: 530084.85


In [20]:
quantiles('sq', 10)

cracow
	10.0%: 33.32
	20.0%: 38.03
	30.0%: 44.15
	40.0%: 48.54
	50.0%: 52.95
	60.0%: 58.02
	70.0%: 63.71
	80.0%: 71.02
	90.0%: 85.0
warsaw
	10.0%: 33.2
	20.0%: 39.29
	30.0%: 46.0
	40.0%: 50.0
	50.0%: 54.39
	60.0%: 60.0
	70.0%: 66.5
	80.0%: 77.34
	90.0%: 98.0
poznan
	10.0%: 33.08
	20.0%: 40.01
	30.0%: 46.0
	40.0%: 50.67
	50.0%: 55.05
	60.0%: 61.0
	70.0%: 66.35
	80.0%: 73.85
	90.0%: 86.96


In [21]:
def boundary_raport(variable):
    """Print the smallest and largest value of column ``variable`` for each city."""

    def extremes(data):
        column = data[variable]
        print(f"\tmin: {min(column)}\n\tmax: {max(column)}")

    apply(extremes)

In [22]:
boundary_raport('sq')

cracow
	min: 12.0
	max: 1007185.0
warsaw
	min: 11.0
	max: 9000.0
poznan
	min: 20.0
	max: 8065.0


In [23]:
boundary_raport('price')

cracow
	min: 89900.0
	max: 6400000.0
warsaw
	min: 5000.0
	max: 15000000.0
poznan
	min: 79900.0
	max: 2290000.0


In [24]:
def delete_outliers(data, percent=.1):
    """Remove rows with extreme 'sq' or 'price' values from ``data`` in place.

    The two variables are processed one after the other ('sq' first). For
    each, ``k = int(n * percent / 4)`` is computed from the current row count
    ``n``; rows whose value is below the k-th smallest or above the k-th
    largest value are removed from every column.

    Args:
        data: dict mapping column names to equally long lists; must contain
            'sq' and 'price'. The lists are modified in place.
        percent: overall share of rows to trim, split over two variables and
            two tails each.

    Returns:
        None.
    """
    # The budget is shared by two variables with two tails each.
    tail_share = percent / 4
    for variable in ['sq', 'price']:
        column = data[variable]
        tail_size = int(len(column) * tail_share)
        if tail_size == 0:
            # Too few rows (or none) to cut even one per tail. Without this
            # guard the upper bound would be ordered[-0], i.e. the minimum,
            # and almost every row would be thrown away.
            continue
        ordered = sorted(column)
        low, high = ordered[tail_size], ordered[-tail_size]
        # Decide once per row, before any column is modified.
        keep_row = [not (value > high or value < low) for value in column]
        for values in data.values():
            # In-place filter: keeps the same list objects and avoids repeated
            # pops from the middle of the list.
            values[:] = [value for value, keep in zip(values, keep_row) if keep]
    
apply(delete_outliers)

cracow
warsaw
poznan


In [25]:
boundary_raport('sq')

cracow
	min: 25.13
	max: 124.3
warsaw
	min: 25.0
	max: 152.0
poznan
	min: 28.1
	max: 120.0


In [26]:
boundary_raport('price')

cracow
	min: 279000.0
	max: 1263790.0
warsaw
	min: 320000.0
	max: 1999000.0
poznan
	min: 254000.0
	max: 870720.0


# What Next?

## Feature Engineering

In [27]:
from scipy.stats import pearsonr


def corr(variable, transform = lambda x: x):
    """Print the Pearson correlation of ``variable`` with every column, per city.

    ``transform`` is applied to each value of ``variable`` before correlating
    (identity by default). For every column, including ``variable`` itself,
    the coefficient and its p-value are printed rounded to three decimals.
    """

    def corr_summary(data):
        reference = list(map(transform, data[variable]))
        for column, values in data.items():
            print(f"\t{column}")
            outcome = pearsonr(reference, values)
            coefficient, p_value = outcome[0], outcome[1]
            print(f'\t\tcorrelation: {round(coefficient, 3)}')
            print(f'\t\tp_value: {round(p_value, 3)}')

    apply(corr_summary)

In [28]:
corr('price')

cracow
	floor
		correlation: 0.037
		p_value: 0.002
	price
		correlation: 1.0
		p_value: 0.0
	rooms
		correlation: 0.485
		p_value: 0.0
	sq
		correlation: 0.728
		p_value: 0.0
	year
		correlation: -0.062
		p_value: 0.0
warsaw
	floor
		correlation: 0.038
		p_value: 0.001
	price
		correlation: 1.0
		p_value: 0.0
	rooms
		correlation: 0.522
		p_value: 0.0
	sq
		correlation: 0.774
		p_value: 0.0
	year
		correlation: 0.038
		p_value: 0.001
poznan
	floor
		correlation: 0.008
		p_value: 0.678
	price
		correlation: 1.0
		p_value: 0.0
	rooms
		correlation: 0.66
		p_value: 0.0
	sq
		correlation: 0.834
		p_value: 0.0
	year
		correlation: 0.058
		p_value: 0.001


## Build a Model

$$y\approx\overline{y}+\beta_1\left(x_1-\overline{x_1}\right)+\beta_2\left(x_2-\overline{x_2}\right),$$
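where $\beta_i=\frac{\operatorname{cov}(y,x_i)\sigma^2_{x_j}-\operatorname{cov}(y,x_j)\operatorname{cov}(x_i,x_j)}{\sigma^2_{x_i}\sigma^2_{x_j}-\left(\operatorname{cov}(x_i,x_j)\right)^2}$ for $i, j \in \{1, 2\}$, $i \neq j$. Here $y$ is the price, $x_1$ the area (`sq`) and $x_2$ the number of rooms.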

In [29]:
from math import sqrt
from numpy import cov

def covariance(x, y):
    """Return the covariance of ``x`` and ``y``.

    This is the off-diagonal entry of the 2x2 matrix produced by ``numpy.cov``
    with its default settings.
    """
    matrix = cov(x, y)
    return matrix[1, 0]

class LinearModel:
    """Linear model of price on two features: area (``sq``) and ``rooms``.

    The slopes are obtained in closed form from variances and covariances,
    and predictions are expressed as deviations from the training means, so
    no explicit intercept is stored.
    """

    def __init__(self):
        # Slopes of the two features.
        self.b_sq = 0
        self.b_rooms = 0

        # Training means that predictions are centred on.
        self.mean_price = 0
        self.mean_sq = 0
        self.mean_rooms = 0

        # Root-mean-square error measured on the data given to fit().
        self.error = 0

    def fit(self, price, sq, rooms):
        """Estimate the means and slopes from the training sequences.

        Returns the root-mean-square error of the fitted model on the same
        data, which is also stored in ``self.error``.
        """
        self.mean_price = statistics.mean(price)
        self.mean_sq = statistics.mean(sq)
        self.mean_rooms = statistics.mean(rooms)

        # Each ingredient of the closed-form solution, computed once.
        var_sq = statistics.variance(sq)
        var_rooms = statistics.variance(rooms)
        cov_sq_rooms = covariance(sq, rooms)
        cov_price_sq = covariance(price, sq)
        cov_price_rooms = covariance(price, rooms)

        denominator = var_sq * var_rooms - cov_sq_rooms ** 2
        self.b_sq = (cov_price_sq * var_rooms - cov_price_rooms * cov_sq_rooms) / denominator
        self.b_rooms = (cov_price_rooms * var_sq - cov_price_sq * cov_sq_rooms) / denominator

        self.error = self.test(price, sq, rooms)
        return self.error

    def predict(self, sq, rooms):
        """Return the predicted price for the given area and room count."""
        sq_term = self.b_sq * (sq - self.mean_sq)
        rooms_term = self.b_rooms * (rooms - self.mean_rooms)
        return self.mean_price + sq_term + rooms_term

    def predict_interval(self, sq, rooms):
        """Return the prediction and a band of +/- ``self.error`` around it.

        The result is a dict with keys ``'mean'`` (the point prediction) and
        ``'interval'`` (a ``(lower, upper)`` tuple).
        """
        centre = self.predict(sq, rooms)
        lower, upper = centre - self.error, centre + self.error
        return {'mean': centre, 'interval': (lower, upper)}

    def test(self, price, sq, rooms):
        """Return the root-mean-square error of the predictions against ``price``."""
        squared_residuals = []
        for position, actual in enumerate(price):
            residual = actual - self.predict(sq[position], rooms[position])
            squared_residuals.append(residual ** 2)
        return sqrt(statistics.mean(squared_residuals))

## Evaluate

In [30]:
def model_summary(data):
    """Fit a fresh ``LinearModel`` on ``data`` and print its training error."""
    training_error = LinearModel().fit(data["price"], data["sq"], data["rooms"])
    print(f'\t{training_error}')
    
apply(model_summary)

cracow
	131364.02217395458
warsaw
	186861.67959449024
poznan
	69508.80303326121


In [31]:
# Fit one model per city on the cleaned table, then measure its
# root-mean-square error on that city's held-out file data/<city>_test.json.
# NOTE(review): the held-out data is used exactly as loaded; the NaN and
# outlier filtering applied to the training tables is not repeated here —
# confirm this is intended.
for city in cities:

    data = full_data[city]

    model = LinearModel()
    model.fit(data["price"], data["sq"], data["rooms"])

    # Build the path with os.path.join, as the training files are loaded, so
    # the cell also works where '\\' is not a path separator.
    with open(os.path.join('data', f'{city}_test.json'), 'r') as test:
        test_data = json.load(test)
    print(f"{city}: {model.test(test_data['price'], test_data['sq'], test_data['rooms'])}")

cracow: 241446.8669676865
warsaw: 437723.3109684522
poznan: 120849.06500859062


# Answer Questions

In [32]:
def cost(sq, rooms):
    """Print, per city, the predicted price band for the given area and rooms.

    A fresh ``LinearModel`` is fitted on each city's table and the result of
    its ``predict_interval(sq, rooms)`` is printed.
    """

    def summary(data):
        model = LinearModel()
        model.fit(data["price"], data["sq"], data["rooms"])
        estimate = model.predict_interval(sq, rooms)
        print(f'\t{estimate}')

    apply(summary)

In [33]:
cost(50, 2)

cracow
	{'mean': 532068.3765148105, 'interval': (400704.3543408559, 663432.398688765)}
warsaw
	{'mean': 614783.3889145994, 'interval': (427921.7093201092, 801645.0685090896)}
poznan
	{'mean': 401403.2667904576, 'interval': (331894.4637571964, 470912.0698237188)}
