In [1]:
import numpy as np
import math

In [9]:
# Two paired samples of ten observations each, reused by the examples below.
x = [6, 12, 13, 17, 22, 25, 27, 29, 30, 32]
y = [45, 47, 39, 58, 68, 76, 75, 74, 78, 81]

### Mathematical expectation
The probability-weighted average of all possible outcomes of a random variable. It represents the long-run average result of a repeated experiment and is calculated as the sum (discrete case) or integral (continuous case) of each outcome multiplied by its probability (or probability density): $E[X] = \sum_i x_i \, p_i$ or $E[X] = \int x \, f(x) \, dx$.

It is crucial for predicting behavior in games, finance, and complex systems.

In [6]:
# 1 - Plain list of values: every value is treated as equally likely,
#     so the expectation reduces to the arithmetic mean.
values = [2, 4, 7, 9, 15]
expected_value = np.mean(values)
print(expected_value)

7.4


In [7]:
# 2 - Values with associated probabilities: E[X] = sum(x_i * p_i)
values = [2, 4, 7, 9, 15]
probabilities = [0.1, 0.2, 0.3, 0.25, 0.15]  # must sum to 1

# zip() silently truncates to the shorter list, so check the pairing explicitly.
assert len(values) == len(probabilities), "each value needs exactly one probability"
# Compare with a tolerance: float rounding can keep the sum from being exactly 1.0.
assert math.isclose(sum(probabilities), 1.0), "probabilities must sum to 1"

expected_value = sum(v * p for v, p in zip(values, probabilities))
print(expected_value)

7.6000000000000005


In [None]:
# 3 - Same computation with NumPy: E[X] is the dot product of values and probabilities
values = np.array([2, 4, 7, 9, 15])
probabilities = np.array([0.1, 0.2, 0.3, 0.25, 0.15])

# Compare with a tolerance: float rounding can keep the sum from being exactly 1.0.
assert math.isclose(probabilities.sum(), 1.0), "probabilities must sum to 1"

expected_value = np.dot(values, probabilities)
print(expected_value)

7.6


### Sample mean
The arithmetic average of the observations in a sample (a set of observations drawn from a population): $\bar{x} = \frac{1}{n} \sum_{i=1}^{n} x_i$.

In [8]:
def get_mean(sample):
  """Return the arithmetic mean of ``sample``.

  The sum of the elements is divided by the number of elements, so an empty
  sample raises ``ZeroDivisionError``.
  """
  total = sum(sample)
  count = len(sample)
  return total / count

In [10]:
# Means of both example samples, computed with the hand-written helper.
print(get_mean(x))
print(get_mean(y))

21.3
64.1


In [11]:
# Cross-check against NumPy's np.mean.
np.mean(x), np.mean(y)

(np.float64(21.3), np.float64(64.1))

### Median
The middle value of a dataset when the values are arranged in order.

- If there is an **odd** number of values → the median is the **middle value**.
- If there is an **even** number of values → the median is the **average of the two middle values**.

***Why it is important:***

- Not affected by extreme values
- Represents the 50th percentile
- Works well for skewed distributions
- Easy to interpret

In [12]:
def get_median(sample):
  """Return the median of ``sample``.

  The values are sorted into a new list first. With an odd number of values
  the middle element is returned unchanged; with an even number of values the
  result is the average of the two middle elements.
  """
  ordered = sorted(sample)
  middle, has_single_middle = divmod(len(ordered), 2)

  if has_single_middle:
    return ordered[middle]

  lower, upper = ordered[middle - 1], ordered[middle]
  return (lower + upper) / 2

In [15]:
# x and y have an even number of values (average of the two middle values);
# the five-element list exercises the odd-length branch.
print(get_median(x))
print(get_median(y))
print(get_median([8,4,2,6,10]))

23.5
71.0
6


In [14]:
# Cross-check against NumPy's np.median.
np.median(x), np.median(y)

(np.float64(23.5), np.float64(71.0))

### Variance
Measures how much the values of a dataset **spread out from the mean**

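Variance is the **average of the squared differences from the mean**: dividing the sum of squared differences by $n$ gives the population variance, while dividing by $n - 1$ gives the unbiased sample variance.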

***Why it is important:***

- Shows variability of data
    - High variance → data points are widely spread out
    - Low variance → data points are close to the mean
- Helps understand risk and uncertainty
- Foundation for many statistical methods
- Identifies outliers

In [16]:
def get_variance(sample, ddof=0):
  """Return the variance of ``sample``.

  Args:
    sample: Sequence of numbers.
    ddof: Delta degrees of freedom. The sum of squared deviations from the
      mean is divided by ``len(sample) - ddof``, so 0 gives the population
      variance and 1 the unbiased sample variance (the ``np.var`` convention).

  Returns:
    The sum of squared deviations from the mean divided by
    ``len(sample) - ddof``.

  Raises:
    ValueError: If ``ddof`` is larger than the number of observations.
    ZeroDivisionError: If ``sample`` is empty or ``ddof == len(sample)``.
  """
  sample_mean = get_mean(sample)
  # Subtract ddof itself; treating it as a flag would make every ddof >= 2
  # behave like ddof=1.
  divisor = len(sample) - ddof
  if divisor < 0:
    raise ValueError("ddof must not exceed the number of observations")
  squared_deviations = [(s - sample_mean)**2 for s in sample]
  return sum(squared_deviations)/divisor

In [18]:
# ddof=0 (the default) divides by n; ddof=1 divides by n - 1.
print(get_variance(x))
print(get_variance(y))
print(get_variance(x, ddof=1))
print(get_variance(y, ddof=1))

70.41
217.69
78.23333333333333
241.8777777777778


In [19]:
# Cross-check the x results against NumPy's np.var.
np.var(x), np.var(x, ddof=1)

(np.float64(70.41), np.float64(78.23333333333333))

### Standard deviation
- It is the **square root of the variance**.
- Measures how **spread out the data** is around the mean.
- Intuition: the larger the standard deviation, the more dispersed the values are.

***Why it is important***

- Measures variability in the same units as the data
    - Unlike variance (squared units), standard deviation is in the same unit as the original data, making it easier to interpret.

In [20]:
def get_std(sample, ddof=0):
  """Return the standard deviation of ``sample``.

  This is the square root of ``get_variance(sample, ddof=ddof)``; ``ddof`` is
  forwarded to ``get_variance`` unchanged.
  """
  variance = get_variance(sample, ddof=ddof)
  return math.sqrt(variance)

In [22]:
# Standard deviation of x with both divisors (n and n - 1).
get_std(x, ddof=0), get_std(x, ddof=1)

(8.391066678319271, 8.844960900610772)

In [23]:
# Cross-check against NumPy's np.std.
np.std(x), np.std(x, ddof=1)

(np.float64(8.391066678319271), np.float64(8.844960900610772))

### 3-sigma rule
**Definition:**

It describes how data is distributed in a **normal (Gaussian) distribution**:

1. About **68%** of values lie within **±1σ** from the mean
2. About **95%** of values lie within **±2σ** from the mean
3. About **99.7%** of values lie within **±3σ** from the mean

Where **σ (sigma)** = standard deviation, and the **mean (μ)** is the center of the distribution.

***Why it’s important:***

- Helps identify **outliers**
    - Values beyond ±3σ are very rare (≈0.3%)
- Used in **quality control** and **process monitoring**
- Provides a quick way to **understand variability** in normally distributed data

### Covariance
A measure of how two variables change together.

**Simple explanation:**

- If two variables **increase or decrease together**, covariance is **positive**.
- If one variable **increases while the other decreases**, covariance is **negative**.
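- If the variables are independent, their covariance is **0** (a covariance estimated from a sample will be close to 0). The converse does not hold: zero covariance does not imply independence.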

In [26]:
def get_covariance(sample_1, sample_2, ddof=0):
  """Return the covariance between two equally long samples.

  Args:
    sample_1: Sequence of numbers.
    sample_2: Sequence of numbers with the same length as ``sample_1``;
      elements are paired by position.
    ddof: Delta degrees of freedom. The sum of the products of deviations is
      divided by ``len(sample_1) - ddof``, so 0 gives the population
      covariance and 1 the unbiased sample covariance.

  Returns:
    The sum of ``(a - mean_1) * (b - mean_2)`` over the paired elements,
    divided by ``len(sample_1) - ddof``.

  Raises:
    ValueError: If the samples differ in length, or if ``ddof`` is larger
      than the number of observations.
    ZeroDivisionError: If the samples are empty or ``ddof == len(sample_1)``.
  """
  # Unequal lengths would pair only part of the data while the means still
  # use all of it, so reject them up front.
  if len(sample_1) != len(sample_2):
    raise ValueError("sample_1 and sample_2 must have the same length")
  sample_1_mean = get_mean(sample_1)
  sample_2_mean = get_mean(sample_2)
  # Subtract ddof itself; treating it as a flag would make every ddof >= 2
  # behave like ddof=1.
  divisor = len(sample_1) - ddof
  if divisor < 0:
    raise ValueError("ddof must not exceed the number of observations")
  deviation_products = [
      (a - sample_1_mean) * (b - sample_2_mean)
      for a, b in zip(sample_1, sample_2)
  ]
  return sum(deviation_products)/divisor

In [27]:
# np.cov returns the full 2x2 covariance matrix: the diagonal holds the
# variances of x and y, the off-diagonal entries hold cov(x, y).
get_covariance(x,y,ddof=1), np.cov(x,y,ddof=1)

(130.3,
 array([[ 78.23333333, 130.3       ],
        [130.3       , 241.87777778]]))

### Correlation
Shows how two variables change together. It is a normalized version of covariance, with values ranging from –1 to +1.

#### Pearson’s correlation coefficient (r)
- The most common type of correlation, also called **Pearson’s r**.
- Measures **linear relationship** between two variables.

In [28]:
def get_corrcoef(sample_1, sample_2, ddof=0):
  """Return Pearson's correlation coefficient of two samples.

  The covariance of the samples is divided by the product of their standard
  deviations. ``ddof`` is forwarded unchanged to ``get_covariance`` and to
  both ``get_std`` calls.
  """
  covariance = get_covariance(sample_1, sample_2, ddof=ddof)
  spread_product = get_std(sample_1, ddof=ddof) * get_std(sample_2, ddof=ddof)
  return covariance / spread_product

In [29]:
# np.corrcoef returns the 2x2 correlation matrix: ones on the diagonal,
# Pearson's r between x and y in the off-diagonal entries.
get_corrcoef(x,y), np.corrcoef(x,y)

(0.9472192452662753,
 array([[1.        , 0.94721925],
        [0.94721925, 1.        ]]))