# Technical Requirements

- Python (preferably 3.9)
- Jupyter
- Anaconda
- git (just for you convenience)
- Stable internet connection

# Anaconda Walkthrough

[![anaconda](media\anaconda.png)](https://www.anaconda.com/products/distribution)

# How to Get Started?

## Problem Definition

<img src="media\student.png"/>

### How to choose proper place to rent an apartment as a student in Poland?

## The Most Important... DATA

[![kaggle](media\kaggle.png)](https://www.kaggle.com/datasets/dawidcegielski/house-prices-in-poland)

In [1]:
import os
import json

cities = ['cracow', 'warsaw', 'poznan']

for city in cities:
    with open(os.path.join('data', f'{city}.json'), 'r') as city_json:
        exec(f'{city} = json.load(city_json)')

In [2]:
columns = warsaw.keys()
columns

dict_keys(['floor', 'price', 'rooms', 'sq', 'year'])

## Data Quality 

In [3]:
full_data = {city: eval(city) for city in cities}

def apply(method):
    
    for city, data in full_data.items():
        print(city)
        method(data)

In [4]:
def count(data):
    
    for column, values in data.items():
        print(f'\t{column}: {len(values)}')
        
apply(count)

cracow
	floor: 9764
	price: 9764
	rooms: 9764
	sq: 9764
	year: 9764
warsaw
	floor: 9851
	price: 9851
	rooms: 9851
	sq: 9851
	year: 9851
poznan
	floor: 4149
	price: 4149
	rooms: 4149
	sq: 4149
	year: 4149


In [5]:
from numpy import isnan

nans = lambda values: sum([isnan(value) for value in values])

def nan_raport(relative=True):
    
    def nan_summary(data):
        for column, values in data.items():
            if relative:
                print(f'\t{column}: {round(nans(values) / len(values), 3)}')
            else:
                print(f'\t{column}: {nans(values)}')
                
    apply(nan_summary)

In [6]:
nan_raport()

cracow
	floor: 0.0
	price: 0.095
	rooms: 0.0
	sq: 0.0
	year: 0.0
warsaw
	floor: 0.0
	price: 0.094
	rooms: 0.0
	sq: 0.0
	year: 0.0
poznan
	floor: 0.0
	price: 0.097
	rooms: 0.0
	sq: 0.0
	year: 0.0


In [7]:
negatives = lambda values: sum([value < 0 for value in values])

def sign_raport(relative=True):
    
    def negatives_summary(data):
        for column, values in data.items():
            if relative:
                print(f'\t{column}: {round(negatives(values) / len(values), 3)}')
            else:
                print(f'\t{column}: {negatives(values)}')
    
    apply(negatives_summary)

In [8]:
sign_raport()

cracow
	floor: 0.0
	price: 0.0
	rooms: 0.0
	sq: 0.0
	year: 0.0
warsaw
	floor: 0.0
	price: 0.0
	rooms: 0.0
	sq: 0.0
	year: 0.0
poznan
	floor: 0.0
	price: 0.0
	rooms: 0.0
	sq: 0.0
	year: 0.0


In [9]:
def time_raport():

    apply(lambda data: print(f"\tmin year: {min(data['year'])}\n\tmax year: {max(data['year'])}"))

In [10]:
time_raport()

cracow
	min year: 200
	max year: 2023
warsaw
	min year: 75
	max year: 2980
poznan
	min year: 70
	max year: 2023


## Data Processing

In [11]:
for city in cities:
    nan_index = [i for i, price in enumerate(full_data[city]['price']) if isnan(price)]
    for i, index in enumerate(nan_index):
        for values in full_data[city].values():
            values.pop(index - i)

In [12]:
nan_raport()

cracow
	floor: 0.0
	price: 0.0
	rooms: 0.0
	sq: 0.0
	year: 0.0
warsaw
	floor: 0.0
	price: 0.0
	rooms: 0.0
	sq: 0.0
	year: 0.0
poznan
	floor: 0.0
	price: 0.0
	rooms: 0.0
	sq: 0.0
	year: 0.0


In [13]:
for city in cities:
    future_index = [i for i, year in enumerate(full_data[city]['year']) if year > 2022]
    for i, index in enumerate(future_index):
        for values in full_data[city].values():
            values.pop(index - i)

In [14]:
time_raport()

cracow
	min year: 200
	max year: 2022
warsaw
	min year: 75
	max year: 2022
poznan
	min year: 70
	max year: 2022


## Get Insights

In [15]:
import statistics

In [16]:
def get_stats(*args):
    
    def stats(data):
        for column, values in data.items():
            print(f'\t{column}')
            for stat in args:
                print(f'\t\t{stat}:', round(eval(f'statistics.{stat}({values})'), 3))
    
    apply(stats)

In [17]:
get_stats('mean', 'median', 'stdev')

cracow
	floor
		mean: 2.543
		median: 2
		stdev: 2.178
	price
		mean: 596748.177
		median: 501381.0
		stdev: 374950.035
	rooms
		mean: 2.603
		median: 3
		stdev: 0.989
	sq
		mean: 172.293
		median: 53.0
		stdev: 10741.539
	year
		mean: 2003.584
		median: 2020
		stdev: 41.186
warsaw
	floor
		mean: 3.277
		median: 3
		stdev: 2.808
	price
		mean: 779885.905
		median: 595000.0
		stdev: 704113.959
	rooms
		mean: 2.628
		median: 3
		stdev: 1.008
	sq
		mean: 62.825
		median: 54.3
		stdev: 99.984
	year
		mean: 1995.884
		median: 2008
		stdev: 45.092
poznan
	floor
		mean: 2.326
		median: 2
		stdev: 1.975
	price
		mean: 467414.007
		median: 429000.0
		stdev: 195297.937
	rooms
		mean: 2.661
		median: 3
		stdev: 0.98
	sq
		mean: 61.565
		median: 55.21
		stdev: 133.288
	year
		mean: 2003.343
		median: 2020
		stdev: 62.796


In [18]:
def quantiles(column, buckets):
    
    def quantiles_summary(data):
        
        for i, bucket in enumerate(statistics.quantiles(data[column], n=buckets)):
            print(f'\t{round((i + 1) / buckets * 100, 2)}%: {round(bucket, 2)}')
            
    apply(quantiles_summary)

In [19]:
quantiles('price', 4)

cracow
	25.0%: 408672.0
	50.0%: 501381.0
	75.0%: 668390.0
warsaw
	25.0%: 467456.0
	50.0%: 595000.0
	75.0%: 822375.0
poznan
	25.0%: 347800.0
	50.0%: 429000.0
	75.0%: 532200.0


In [20]:
quantiles('sq', 10)

cracow
	10.0%: 33.49
	20.0%: 38.2
	30.0%: 44.21
	40.0%: 48.8
	50.0%: 53.0
	60.0%: 58.2
	70.0%: 63.71
	80.0%: 71.0
	90.0%: 84.48
warsaw
	10.0%: 33.2
	20.0%: 39.41
	30.0%: 45.85
	40.0%: 49.98
	50.0%: 54.3
	60.0%: 60.0
	70.0%: 66.3
	80.0%: 77.0
	90.0%: 97.7
poznan
	10.0%: 33.08
	20.0%: 40.01
	30.0%: 46.06
	40.0%: 50.67
	50.0%: 55.21
	60.0%: 61.05
	70.0%: 66.51
	80.0%: 73.91
	90.0%: 87.22


In [21]:
def boundary_raport(variable):

    apply(lambda data: print(f"\tmin: {min(data[variable])}\n\tmax: {max(data[variable])}"))

In [22]:
boundary_raport('sq')

cracow
	min: 12.0
	max: 1007185.0
warsaw
	min: 11.0
	max: 9000.0
poznan
	min: 13.3
	max: 8065.0


In [23]:
boundary_raport('price')

cracow
	min: 83000.0
	max: 6400000.0
warsaw
	min: 5000.0
	max: 15000000.0
poznan
	min: 29900.0
	max: 2290000.0


In [24]:
def delete_outliers(data, percent=.1):

    percent /= 4
    for variable in ['sq', 'price']:
        n = len(data[variable])
        low, high = sorted(data[variable])[int(n*percent)], sorted(data[variable])[-int(n*percent)]
        to_delete = [i for i in range(len(data[variable])) if data[variable][i] > high or data[variable][i] < low]
        for i, delete_index, in enumerate(to_delete):
            for values in data.values():
                values.pop(delete_index - i)
    
apply(delete_outliers)

cracow
warsaw
poznan


In [31]:
boundary_raport('sq')

cracow
	min: 25.23
	max: 122.7
warsaw
	min: 25.0
	max: 150.0
poznan
	min: 28.1
	max: 120.0


In [32]:
boundary_raport('price')

cracow
	min: 279202.0
	max: 1257250.0
warsaw
	min: 320000.0
	max: 1993000.0
poznan
	min: 254975.0
	max: 870000.0


# What Next?

## Feature Engineering

In [25]:
from scipy.stats import pearsonr


def corr(variable, transform = lambda x: x):
    
    def corr_summary(data):
        main_values = [transform(value) for value in data[variable]]
        for column, values in data.items():
            print(f"\t{column}")
            results = pearsonr(main_values, values)
            print(f'\t\tcorrelation: {round(results[0], 3)}')
            print(f'\t\tp_value: {round(results[1], 3)}')
    
    apply(corr_summary)

In [26]:
corr('price')

cracow
	floor
		correlation: 0.041
		p_value: 0.0
	price
		correlation: 1.0
		p_value: 0.0
	rooms
		correlation: 0.486
		p_value: 0.0
	sq
		correlation: 0.725
		p_value: 0.0
	year
		correlation: -0.058
		p_value: 0.0
warsaw
	floor
		correlation: 0.036
		p_value: 0.001
	price
		correlation: 1.0
		p_value: 0.0
	rooms
		correlation: 0.523
		p_value: 0.0
	sq
		correlation: 0.775
		p_value: 0.0
	year
		correlation: 0.031
		p_value: 0.005
poznan
	floor
		correlation: -0.002
		p_value: 0.894
	price
		correlation: 1.0
		p_value: 0.0
	rooms
		correlation: 0.662
		p_value: 0.0
	sq
		correlation: 0.838
		p_value: 0.0
	year
		correlation: 0.058
		p_value: 0.001


## Build a Model

$$y\approx\overline{y}+\beta_1\left(x_1-\overline{x_1}\right)+\beta_2\left(x_2-\overline{x_2}\right),$$
where $\beta_i=\frac{\operatorname{cov}(y,x_i)\sigma^2_{x_j}-\operatorname{cov}(y,x_j)\operatorname{cov}(x_i,x_j)}{\sigma^2_{x_i}\sigma^2_{x_j}-\left(\operatorname{cov}(x_i,x_j)\right)^2}$

In [27]:
from math import sqrt, log, exp
from numpy import cov

covariance = lambda x, y: cov(x, y)[1, 0]

class LinearModel:
    
    def __init__(self):
        
        self.b_sq = 0
        self.b_rooms = 0
        
        self.mean_price = 0
        self.mean_sq = 0
        self.mean_rooms = 0
        
        self.error = 0
    
    def fit(self, price, sq, rooms):
        
        log_price = [log(p) for p in price]
        self.mean_price = statistics.mean(log_price)
        self.mean_sq = statistics.mean(sq)
        self.mean_rooms = statistics.mean(rooms)
        
        denominator = statistics.variance(sq) * statistics.variance(rooms) - covariance(sq, rooms) ** 2
        self.b_sq = (covariance(log_price, sq) * statistics.variance(rooms) - 
                     covariance(log_price, rooms) * covariance(sq, rooms)) / denominator
        self.b_rooms = (covariance(log_price, rooms) * statistics.variance(sq) - 
                        covariance(log_price, sq) * covariance(sq, rooms)) / denominator
        
        self.error = sqrt(statistics.mean([(price[i] - self.predict(sq[i], rooms[i])) ** 2 for i in range(len(price))]))
        
        return self.error
        
    def predict(self, sq, rooms):
        
        return exp(self.mean_price + self.b_sq * (sq - self.mean_sq) + self.b_rooms * (rooms - self.mean_rooms))
        
    def predict_interval(self, sq, rooms):
        
        mean_point = self.predict(sq, rooms)
        
        return dict(mean=mean_point, interval=(mean_point - self.error, mean_point + self.error))

## Evaluate

In [28]:
def model_summary(data):
    
    model = LinearModel()
    print(f'\t{model.fit(data["price"], data["sq"], data["rooms"])}')
    
apply(model_summary)

cracow
	132951.62462744158
warsaw
	202825.78651233946
poznan
	71780.2031742544


# Answer Questions

In [29]:
def cost(sq, rooms):

    def summary(data):

        model = LinearModel()
        model.fit(data["price"], data["sq"], data["rooms"])
        print(f'\t{model.predict_interval(sq, rooms)}')
    
    apply(summary)

In [30]:
cost(50, 2)

cracow
	{'mean': 501505.65268695034, 'interval': (368554.02805950877, 634457.2773143919)}
warsaw
	{'mean': 571387.8918750541, 'interval': (368562.1053627146, 774213.6783873936)}
poznan
	{'mean': 389873.2881781801, 'interval': (318093.0850039257, 461653.4913524345)}
