# Technical Requirements

- Python (preferably 3.9)
- Jupyter
- Anaconda
- git (just for your convenience)
- Stable internet connection

# Anaconda Walkthrough

[![anaconda](media/anaconda.png)](https://www.anaconda.com/products/distribution)

# How to Get Started?

## Problem Definition

<img src="media/student.png" alt="student"/>

### How to choose the right place to rent an apartment as a student in Poland?

![learning](media/learning.png)

## The Most Important... DATA

[![kaggle](media/kaggle.png)](https://www.kaggle.com/datasets/dawidcegielski/house-prices-in-poland)

In [67]:
import os
import json

city = 'poznan'

In [68]:
# Read data/<city>.json for the selected city and parse it into `data`.
city_path = os.path.join('data', f'{city}.json')
with open(city_path, 'r') as city_file:
    data = json.load(city_file)

In [69]:
# Column names of the dataset, taken as a view over the keys of `data`.
columns = data.keys()
columns  # bare expression: its value is what the cell displays

dict_keys(['floor', 'price', 'rooms', 'sq', 'year'])

## Data Quality 

In [70]:
# Print the number of entries stored under each column.
for column in data:
    print(f'{column}: {len(data[column])}')

floor: 3734
price: 3734
rooms: 3734
sq: 3734
year: 3734


In [71]:
from numpy import isnan

def nans(values):
    """Count how many entries of ``values`` are NaN."""
    return sum(isnan(entry) for entry in values)


def nans_summary(data):
    """Print one ``column: count`` line with the NaN count of every column in ``data``."""
    for column in data:
        print(f'{column}: {nans(data[column])}')

In [72]:
nans_summary(data)

floor: 0
price: 366
rooms: 0
sq: 0
year: 0


In [73]:
def negatives(values):
    """Count the entries of ``values`` that are strictly below zero."""
    return sum(entry < 0 for entry in values)


def negatives_summary(data):
    """Print one ``column: count`` line with the number of negative values per column."""
    for column in data:
        print(f'{column}: {negatives(data[column])}')

In [74]:
negatives_summary(data)

floor: 0
price: 0
rooms: 0
sq: 0
year: 0


In [75]:
def time_summary(data):
    """Print the smallest and the largest value found in ``data['year']``."""
    years = data['year']
    print(f"min year: {min(years)}\nmax year: {max(years)}")

In [76]:
time_summary(data)

min year: 70
max year: 2023


## Data Processing

In [77]:
# Drop every row whose price is NaN. The columns are parallel lists, so the
# same positions have to be removed from each of them.
nan_index = [i for i, price in enumerate(data['price']) if isnan(price)]
rows_to_drop = set(nan_index)  # set gives O(1) membership tests while filtering
for values in data.values():
    # Slice assignment filters in place: `data` keeps the same list objects,
    # and no index shifting is needed as it was with repeated pop() calls.
    values[:] = [value for i, value in enumerate(values) if i not in rows_to_drop]

In [78]:
nans_summary(data)

floor: 0
price: 0
rooms: 0
sq: 0
year: 0


In [79]:
# Drop every row whose year is later than 2022. The columns are parallel
# lists, so the same positions have to be removed from each of them.
future_index = [i for i, year in enumerate(data['year']) if year > 2022]
rows_to_drop = set(future_index)  # set gives O(1) membership tests while filtering
for values in data.values():
    # Slice assignment filters in place: `data` keeps the same list objects,
    # and no index shifting is needed as it was with repeated pop() calls.
    values[:] = [value for i, value in enumerate(values) if i not in rows_to_drop]

In [80]:
time_summary(data)

min year: 70
max year: 2022


## Get Insights

In [81]:
import statistics

In [82]:
# For every column print its mean, median and sample standard deviation,
# each rounded to three decimal places.
summary_stats = (statistics.mean, statistics.median, statistics.stdev)
for column in data:
    print(column)
    for stat in summary_stats:
        print(f'\t{stat.__name__}:', round(stat(data[column]), 3))

floor
	mean: 2.316
	median: 2.0
	stdev: 1.982
price
	mean: 466891.429
	median: 429180.05
	stdev: 193247.907
rooms
	mean: 2.659
	median: 3.0
	stdev: 0.98
sq
	mean: 61.785
	median: 55.05
	stdev: 140.358
year
	mean: 2003.285
	median: 2020.0
	stdev: 65.306


In [83]:
def quartiles(data, column):
    """Print the three quartile cut points of ``data[column]``, one per line."""
    # statistics.quantiles defaults to n=4, i.e. it returns the 3 quartile cuts.
    cut_points = statistics.quantiles(data[column])
    for rank, cut in enumerate(cut_points, start=1):
        percent = round(rank / 4 * 100)
        print(f'\t{percent}%: {round(cut, 3)}')

In [84]:
quartiles(data, 'price')

	25%: 346985.5
	50%: 429180.05
	75%: 530084.85


In [85]:
quartiles(data, 'sq')

	25%: 43.0
	50%: 55.05
	75%: 69.2


In [86]:
def boundaries(data, column):
    """Print the smallest and the largest value of ``data[column]``."""
    series = data[column]
    print(f"min: {min(series)}\nmax: {max(series)}")

In [87]:
boundaries(data, 'price')

min: 79900.0
max: 2290000.0


In [88]:
boundaries(data, 'sq')

min: 20.0
max: 8065.0


In [89]:
def delete_outliers(data, column, percent=.1):
    """Remove, in place, the rows whose ``column`` value lies in either tail.

    ``data`` maps column names to parallel lists; a row is the set of entries
    sharing one index. Cut-off values are read from the sorted ``column`` at
    ``int(n * percent / 2)`` positions from each end, and every row whose
    value is strictly outside ``[low, high]`` is dropped from all columns.

    Args:
        data: dict of column name -> list of values; mutated in place.
        column: name of the column that decides which rows are outliers.
        percent: total fraction of rows to trim, split evenly between tails.

    Returns:
        None. When the tail size rounds down to zero (including empty
        input) nothing is removed.
    """
    percent /= 2
    n = len(data[column])
    tail = int(n * percent)
    if tail == 0:
        # Without this guard `ordered[-0]` would be the minimum, so every row
        # above the minimum would be deleted (and empty input would raise).
        return

    ordered = sorted(data[column])  # sort once and reuse for both cut-offs
    # NOTE: the upper cut-off is ordered[n - tail], one position higher than a
    # symmetric trim would use; kept as is so existing results do not change.
    low, high = ordered[tail], ordered[-tail]

    to_delete = {i for i, value in enumerate(data[column]) if value > high or value < low}
    for values in data.values():
        # Slice assignment keeps the same list objects while filtering them.
        values[:] = [value for i, value in enumerate(values) if i not in to_delete]

In [90]:
delete_outliers(data, 'price')

In [91]:
boundaries(data, 'price')

min: 260026.0
max: 821105.0


In [92]:
delete_outliers(data, 'sq')

In [93]:
boundaries(data, 'sq')

min: 32.68
max: 90.99


# What Next?

## Feature Engineering

In [94]:
from scipy.stats import pearsonr

# Pearson correlation (and its p-value) between the price and every column,
# including the price itself.
for column in data:
    print(f"{column}")
    correlation, p_value = pearsonr(data['price'], data[column])
    print(f'\tcorrelation: {round(correlation, 3)}')
    print(f'\tp_value: {round(p_value, 3)}')

floor
	correlation: 0.021
	p_value: 0.262
price
	correlation: 1.0
	p_value: 0.0
rooms
	correlation: 0.586
	p_value: 0.0
sq
	correlation: 0.793
	p_value: 0.0
year
	correlation: 0.09
	p_value: 0.0


## Build a Model

$$y\approx\overline{y}+\beta_1\left(x_1-\overline{x_1}\right)+\beta_2\left(x_2-\overline{x_2}\right),$$
where, for $i, j\in\{1,2\}$ with $j\neq i$, $\beta_i=\frac{\operatorname{cov}(y,x_i)\sigma^2_{x_j}-\operatorname{cov}(y,x_j)\operatorname{cov}(x_i,x_j)}{\sigma^2_{x_i}\sigma^2_{x_j}-\left(\operatorname{cov}(x_i,x_j)\right)^2}$

In [95]:
from math import sqrt
from numpy import cov

def covariance(x, y):
    """Return the covariance between the series ``x`` and ``y``.

    ``numpy.cov`` returns the 2x2 covariance matrix of the two series; the
    off-diagonal entry ``[1, 0]`` is the covariance between them.
    """
    return cov(x, y)[1, 0]


class LinearModel:
    """Linear regression of ``price`` on two features, ``sq`` and ``rooms``.

    The prediction is
    ``mean_price + b_sq * (sq - mean_sq) + b_rooms * (rooms - mean_rooms)``,
    with the coefficients computed in closed form from the variances and
    covariances of the training series.
    """

    def __init__(self):
        # Regression coefficients of the two features.
        self.b_sq = 0
        self.b_rooms = 0

        # Means of the training series; predictions are centred on them.
        self.mean_price = 0
        self.mean_sq = 0
        self.mean_rooms = 0

        # Root-mean-square error on the training data, set by fit().
        self.error = 0

    def fit(self, price, sq, rooms):
        """Estimate the coefficients from parallel training series.

        Args:
            price: target values.
            sq: first feature, one value per entry of ``price``.
            rooms: second feature, one value per entry of ``price``.

        Returns:
            The root-mean-square error of the fitted model on the same data
            (also stored in ``self.error``).
        """
        self.mean_price = statistics.mean(price)
        self.mean_sq = statistics.mean(sq)
        self.mean_rooms = statistics.mean(rooms)

        # Every moment below is used two or three times; compute each once.
        var_sq = statistics.variance(sq)
        var_rooms = statistics.variance(rooms)
        cov_sq_rooms = covariance(sq, rooms)
        cov_price_sq = covariance(price, sq)
        cov_price_rooms = covariance(price, rooms)

        denominator = var_sq * var_rooms - cov_sq_rooms ** 2
        self.b_sq = (cov_price_sq * var_rooms - cov_price_rooms * cov_sq_rooms) / denominator
        self.b_rooms = (cov_price_rooms * var_sq - cov_price_sq * cov_sq_rooms) / denominator

        self.error = self.test(price, sq, rooms)

        return self.error

    def predict(self, sq, rooms):
        """Return the predicted price for a single ``sq`` / ``rooms`` pair."""
        return (self.mean_price
                + self.b_sq * (sq - self.mean_sq)
                + self.b_rooms * (rooms - self.mean_rooms))

    def predict_interval(self, sq, rooms):
        """Return the prediction together with a band of +/- ``self.error``.

        Returns:
            ``{'mean': prediction, 'interval': (prediction - error, prediction + error)}``
            where ``error`` is the value stored by the last ``fit`` call.
        """
        mean_point = self.predict(sq, rooms)

        return dict(mean=mean_point, interval=(mean_point - self.error, mean_point + self.error))

    def test(self, price, sq, rooms):
        """Return the root-mean-square error of the predictions against ``price``."""
        squared_errors = [
            (actual - self.predict(sq_value, rooms_value)) ** 2
            for actual, sq_value, rooms_value in zip(price, sq, rooms)
        ]
        return sqrt(statistics.mean(squared_errors))

## Evaluate

In [96]:
# Fit the model on the cleaned columns. fit() returns the root-mean-square
# error on that same data; as the last expression it is the cell's output.
model = LinearModel()
model.fit(data["price"], data["sq"], data["rooms"])

67127.35491032449

In [97]:
# Load data/<city>_test.json and score the fitted model on it.
# os.path.join builds the path with the platform's separator; a literal
# backslash only works on Windows.
with open(os.path.join('data', f'{city}_test.json'), 'r') as test:
    test_data = json.load(test)

model.test(test_data['price'], test_data['sq'], test_data['rooms'])

118530.4871050619

# Answer Questions

In [98]:
def cost(data, sq, rooms):
    """Fit a fresh LinearModel on ``data`` and estimate the price for ``sq`` / ``rooms``.

    Returns whatever ``LinearModel.predict_interval`` returns for the given
    ``sq`` and ``rooms`` after fitting on the ``price``, ``sq`` and ``rooms``
    columns of ``data``.
    """
    fitted = LinearModel()
    training_columns = (data["price"], data["sq"], data["rooms"])
    fitted.fit(*training_columns)

    return fitted.predict_interval(sq, rooms)

In [99]:
cost(data, 50, 2)

{'mean': 402527.50700638077,
 'interval': (335400.15209605626, 469654.86191670527)}