# Assignment 9.2

> Replace all TODOs with your code. Do not change any other code.

In [9]:
# Do not edit this cell

from typing import List
import math

## Descriptive statistics

In this assignment, we will write the functions to calculate the basic statistics from scratch, not using numpy.

### Task 1

Let's start simple: write a function `mean` that calculates the average of the list.

$$\mu = \frac{{\sum_{i=1}^n x_i}}{{n}}$$

In [2]:
def mean(li: List[float]) -> float:
    return sum(li) / len(li)


assert mean([1., 2., 3.]) == 2.
assert mean([1., 1., 2., 0.]) == 1.

### Task 2

Now let's calculate variance (dispersion). You may use the `mean` function implemented before.

$$V = \frac{{\sum_{i=1}^n (x_i - \mu)^2}}{{n}}$$

In [None]:
def variance(li: List[float]) -> float:
    m = mean(li)
    return sum((x - m) ** 2 for x in li) / len(li)


assert variance([1., 1., 1.]) == 0.
assert variance([1., 2., 3., 4.]) == 1.25

### Task 3

The standard deviation is easy once you get the variance:

$$\sigma = \sqrt{V}$$

In [6]:
def std(li: List[float]) -> float:
    return math.sqrt(variance(li))


assert std([1., 1., 1.]) == 0.
assert std([1., 2., 3., 4.]) == 1.25**0.5

### Task 4

**Median**

The median is the middle value in a sorted dataset. If the dataset has an odd number of values, the median is the value at the center. If the dataset has an even number of values, the median is the average of the two middle values.

In [7]:
def median(li: List[float]) -> float:
    li_sorted = sorted(li)
    
    n = len(li_sorted)
    
    if n % 2 == 1:
        return li_sorted[n // 2]

    else:
        return (li_sorted[n // 2 - 1] + li_sorted[n // 2]) / 2


assert median([1., 1., 1.]) == 1.
assert median([1., 4., 3., 2.]) == 2.5

## Measure performance

Sometimes, apart from theoretical, algorithmic complexity, it's a good idea to compare the runtime of two algorithms empirically, i.e., run the code many times and time it.

In Python's standard library, we have [timeit](https://docs.python.org/3/library/timeit.html) module that does exactly that.

Let's compare the runtime of your implementations and numpy. Use the provided setup code:

In [20]:
import timeit

setup = '''
import random
import numpy as np

arr = np.random.rand(10_000) * 100
li = [random.random() * 100 for _ in range(10_000)]
'''

def numpy_mean(arr):
    return np.mean(arr)

def numpy_variance(arr):
    return np.var(arr)

def numpy_std(arr):
    return np.std(arr)

def numpy_median(arr):
    return np.median(arr)

funcs = {
    'mean': mean,
    'variance': variance,
    'std': std,
    'median': median,
}

for name, func in funcs.items():
    time_taken = timeit.timeit(f"{name}(li)", setup=setup, globals=globals(), number=10)
    print(f"Your {name} function took {time_taken:.6f} seconds.")

numpy_funcs = {
    'numpy_mean': numpy_mean,
    'numpy_variance': numpy_variance,
    'numpy_std': numpy_std,
    'numpy_median': numpy_median,
}

for name, func in numpy_funcs.items():
    time_taken = timeit.timeit(f"{name}(arr)", setup=setup, globals=globals(), number=10)
    print(f"NumPy {name} function took {time_taken:.6f} seconds.")

Your mean function took 0.000542 seconds.
Your variance function took 0.012235 seconds.
Your std function took 0.008914 seconds.
Your median function took 0.009138 seconds.
NumPy numpy_mean function took 0.000078 seconds.
NumPy numpy_variance function took 0.000174 seconds.
NumPy numpy_std function took 0.000190 seconds.
NumPy numpy_median function took 0.000517 seconds.


### Task 5

Complete Python statements to compare your functions to numpy. Use `li` for your function and `arr` for numpy functions.

In [21]:
stmt_mean_custom = 'mean(li)'
stmt_mean_np = 'np.mean(arr)'

stmt_var_custom = 'variance(li)'
stmt_var_np = 'np.var(arr)'

stmt_std_custom = 'std(li)'
stmt_std_np = 'np.std(arr)'

stmt_median_custom = 'median(li)'
stmt_median_np = 'np.median(arr)'

### Task 6

Measure average exec time of your statements with `timeit` module. As your submission, fill out the table with results (rounded to 2 decimal places)

In [34]:
import timeit

mean_custom_time = timeit.timeit(stmt=stmt_mean_custom, setup=setup, globals=globals(), number=10_000)
mean_np_time = timeit.timeit(stmt=stmt_mean_np, setup=setup, globals=globals(), number=10_000)

var_custom_time = timeit.timeit(stmt=stmt_var_custom, setup=setup, globals=globals(), number=10_000)
var_np_time = timeit.timeit(stmt=stmt_var_np, setup=setup, globals=globals(), number=10_000)

std_custom_time = timeit.timeit(stmt=stmt_std_custom, setup=setup, globals=globals(), number=10_000)
std_np_time = timeit.timeit(stmt=stmt_std_np, setup=setup, globals=globals(), number=10_000)

median_custom_time = timeit.timeit(stmt=stmt_median_custom, setup=setup, globals=globals(), number=10_000)
median_np_time = timeit.timeit(stmt=stmt_median_np, setup=setup, globals=globals(), number=10_000)

print("Time per 10000 executions, secs")
print(f"{'Func':<10}{'Custom':<10}  {'Numpy'}")
print(f"{'mean':<10}{mean_custom_time/10_000:.8f} {mean_np_time/10_000:.8f}")
print(f"{'var':<10}{var_custom_time/10_000:.8f} {var_np_time/10_000:.8f}")
print(f"{'std':<10}{std_custom_time/10_000:.8f} {std_np_time/10_000:.8f}")
print(f"{'median':<10}{median_custom_time/10_000:.8f} {median_np_time/10_000:.8f}")

Time per 10000 executions, secs
Func      Custom      Numpy
mean      0.00002322 0.00000275
var       0.00054011 0.00001036
std       0.00053551 0.00001094
median    0.00055912 0.00003117


Time per 10000 executions, secs

| Func       | Custom | Numpy |
| ---------- | ------ | ----- |
| mean       |        |       |
| var        |        |       |
| std        |        |       |
| median     |        |       |