# Technical Requirements

- Python (preferably 3.9)
- Jupyter
- Anaconda
- git (just for your convenience)
- Stable internet connection

# Anaconda Walkthrough

[![anaconda](media\anaconda.png)](https://www.anaconda.com/products/distribution)

# How to Get Started?

## Problem Definition

<img src="media\student.png"/>

### How to choose proper place to rent an apartment as a student in Poland?

![learning](media\learning.png)

## The Most Important... DATA

[![kaggle](media\kaggle.png)](https://www.kaggle.com/datasets/dawidcegielski/house-prices-in-poland)

In [34]:
import os
import json

city = 'warsaw'

In [35]:
with open(os.path.join('data', f'{city}.json'), 'r') as city_json:
    data = json.load(city_json)

In [36]:
columns = data.keys()
columns

dict_keys(['floor', 'price', 'rooms', 'sq', 'year'])

## Data Quality 

In [37]:
for column, values in data.items():
    print(f'{column}: {len(values)}')

floor: 8865
price: 8865
rooms: 8865
sq: 8865
year: 8865


In [38]:
from numpy import isnan

nans = lambda values: sum([isnan(value) for value in values])

def nans_summary(data):
    for column, values in data.items():
        print(f'{column}: {nans(values)}')

In [39]:
nans_summary(data)

floor: 0
price: 835
rooms: 0
sq: 0
year: 0


In [40]:
negatives = lambda values: sum([value < 0 for value in values])
  
def negatives_summary(data):
    for column, values in data.items():
        print(f'{column}: {negatives(values)}')

In [41]:
negatives_summary(data)

floor: 0
price: 0
rooms: 0
sq: 0
year: 0


In [42]:
time_summary = lambda data: print(f"min year: {min(data['year'])}\nmax year: {max(data['year'])}")

In [43]:
time_summary(data)

min year: 75
max year: 2980


## Data Processing

In [44]:
nan_index = [i for i, price in enumerate(data['price']) if isnan(price)]
for i, index in enumerate(nan_index):
    for values in data.values():
        values.pop(index - i)

In [45]:
nans_summary(data)

floor: 0
price: 0
rooms: 0
sq: 0
year: 0


In [46]:
future_index = [i for i, year in enumerate(data['year']) if year > 2022]
for i, index in enumerate(future_index):
    for values in data.values():
        values.pop(index - i)

In [47]:
time_summary(data)

min year: 75
max year: 2022


## Get Insights

In [48]:
import statistics

In [49]:
for column, values in data.items():
    print(column)
    for stat in (statistics.mean, statistics.median, statistics.stdev):
        print(f'\t{stat.__name__}:', round(stat(values), 3))

floor
	mean: 3.284
	median: 3
	stdev: 2.815
price
	mean: 781558.535
	median: 595000.0
	stdev: 706680.291
rooms
	mean: 2.632
	median: 3
	stdev: 1.009
sq
	mean: 63.108
	median: 54.39
	stdev: 104.903
year
	mean: 1995.974
	median: 2008
	stdev: 42.18


In [50]:
def quartiles(data, column):
        
    for i, bucket in enumerate(statistics.quantiles(data[column])):
        print(f'\t{round((i + 1) / 4 * 100)}%: {round(bucket, 3)}')

In [51]:
quartiles(data, 'price')

	25%: 468750.0
	50%: 595000.0
	75%: 820000.0


In [52]:
quartiles(data, 'sq')

	25%: 43.0
	50%: 54.39
	75%: 71.0


In [53]:
boundaries = lambda data, column: print(f"min: {min(data[column])}\nmax: {max(data[column])}")

In [54]:
boundaries(data, 'price')

min: 5000.0
max: 15000000.0


In [55]:
boundaries(data, 'sq')

min: 11.0
max: 9000.0


In [56]:
def delete_outliers(data, column, percent=.1):

    percent /= 2
    n = len(data[column])
    low, high = sorted(data[column])[int(n*percent)], sorted(data[column])[-int(n*percent)]
    to_delete = [i for i in range(len(data[column])) if data[column][i] > high or data[column][i] < low]
    for i, delete_index, in enumerate(to_delete):
        for values in data.values():
            values.pop(delete_index - i)

In [57]:
delete_outliers(data, 'price')

In [58]:
boundaries(data, 'price')

min: 330000.0
max: 1802400.0


In [59]:
delete_outliers(data, 'sq')

In [60]:
boundaries(data, 'sq')

min: 30.9
max: 104.0


# What Next?

## Feature Engineering

In [61]:
from scipy.stats import pearsonr

for column, values in data.items():
    print(f"{column}")
    results = pearsonr(data['price'], values)
    print(f'\tcorrelation: {round(results[0], 3)}')
    print(f'\tp_value: {round(results[1], 3)}')

floor
	correlation: 0.03
	p_value: 0.017
price
	correlation: 1.0
	p_value: 0.0
rooms
	correlation: 0.413
	p_value: 0.0
sq
	correlation: 0.716
	p_value: 0.0
year
	correlation: 0.015
	p_value: 0.239


## Build a Model

$$y\approx\overline{y}+\beta_1\left(x_1-\overline{x_1}\right)+\beta_2\left(x_2-\overline{x_2}\right),$$
where $\beta_i=\frac{\operatorname{cov}(y,x_i)\sigma^2_{x_j}-\operatorname{cov}(y,x_j)\operatorname{cov}(x_i,x_j)}{\sigma^2_{x_i}\sigma^2_{x_j}-\left(\operatorname{cov}(x_i,x_j)\right)^2}$

In [62]:
from math import sqrt
from numpy import cov

covariance = lambda x, y: cov(x, y)[1, 0]

class LinearModel:
    
    def __init__(self):
        
        self.b_sq = 0
        self.b_rooms = 0
        
        self.mean_price = 0
        self.mean_sq = 0
        self.mean_rooms = 0
        
        self.error = 0
    
    def fit(self, price, sq, rooms):
        
        self.mean_price = statistics.mean(price)
        self.mean_sq = statistics.mean(sq)
        self.mean_rooms = statistics.mean(rooms)
        
        denominator = statistics.variance(sq) * statistics.variance(rooms) - covariance(sq, rooms) ** 2
        self.b_sq = (covariance(price, sq) * statistics.variance(rooms) - 
                     covariance(price, rooms) * covariance(sq, rooms)) / denominator
        self.b_rooms = (covariance(price, rooms) * statistics.variance(sq) - 
                        covariance(price, sq) * covariance(sq, rooms)) / denominator
        
        self.error = self.test(price, sq, rooms)
        
        return self.error
        
    def predict(self, sq, rooms):
        
        return self.mean_price + self.b_sq * (sq - self.mean_sq) + self.b_rooms * (rooms - self.mean_rooms)
        
    def predict_interval(self, sq, rooms):
        
        mean_point = self.predict(sq, rooms)
        
        return dict(mean=mean_point, interval=(mean_point - self.error, mean_point + self.error))
    
    def test(self, price, sq, rooms):
        
        return sqrt(statistics.mean([(price[i] - self.predict(sq[i], rooms[i])) ** 2 for i in range(len(price))]))

## Evaluate

In [63]:
model = LinearModel()
model.fit(data["price"], data["sq"], data["rooms"])

168593.4474311219

In [64]:
with open(f'data\\{city}_test.json', 'r') as test:
    test_data = json.load(test)

model.test(test_data['price'], test_data['sq'], test_data['rooms'])

426581.59150050476

# Answer Questions

In [65]:
def cost(data, sq, rooms):

    model = LinearModel()
    model.fit(data["price"], data["sq"], data["rooms"])
    
    return model.predict_interval(sq, rooms)

In [66]:
cost(data, 50, 2)

{'mean': 613508.4555447036,
 'interval': (444915.00811358175, 782101.9029758256)}