In [100]:
import numpy as np
from scipy import stats
from lib.intervals import get_interval_borders

In [101]:
from lib.numpy_moments import cent_moment_k

# **ЛР 4.2**

In [102]:
ages = []
with open('moscow_2021.txt', 'r') as f:
    for l in f.readlines():
        ages.append(int(l.strip()))

## ***Функции***

### _Дисперсии_

In [103]:
def get_interval_means(
        nums_values: np.ndarray,
        nums_freqs: np.ndarray,
        borders: list
    ) -> tuple:
    '''`nums_values` should be sorted'''

    inter_means = []
    inter_freqs = []
    
    ind_bord = 0
    curr_total=0
    curr_freq=0
    for v, f in zip(nums_values, nums_freqs):
        if borders[ind_bord+1]<=v and (ind_bord+1)<len(borders)-1:
            inter_means.append(0 if curr_freq==0 else curr_total/curr_freq)
            inter_freqs.append(curr_freq)
            curr_total=0
            curr_freq=0
            ind_bord+=1
        
        curr_total += v*f
        curr_freq += f
    
    inter_means.append(0 if curr_freq==0 else curr_total/curr_freq)
    inter_freqs.append(curr_freq)

    return np.array(inter_means), np.array(inter_freqs)

In [104]:
def get_interval_disperse(
        nums_values: np.ndarray,
        nums_freqs: np.ndarray,
        borders: list,
        nums_inter_means: np.ndarray,
        nums_inter_freqs: np.ndarray
    ) -> np.ndarray:
    '''`nums_values` should be sorted'''
    
    inter_disper = []
    
    curr_cent_total = 0
    ind_bord = 0
    for v, f in zip(nums_values, nums_freqs):
        if borders[ind_bord+1]<=v and (ind_bord+1)<len(borders)-1:
            inter_disper.append(0 if nums_inter_freqs[ind_bord]==0 else curr_cent_total/nums_inter_freqs[ind_bord])
            curr_cent_total=0
            ind_bord+=1
        
        curr_cent_total += (v-nums_inter_means[ind_bord])**2*f
    
    inter_disper.append(0 if nums_inter_freqs[ind_bord]==0 else curr_cent_total/nums_inter_freqs[ind_bord])

    return np.array(inter_disper)

In [105]:
def get_interval_disperse_both(
        nums_values: np.ndarray,
        nums_freqs: np.ndarray,
        borders: list
) -> tuple:
    '''nums_values should be sorted
    returns tuple(ingroup, betwgroup)'''

    nums_inter_means, nums_inter_freqs = get_interval_means(
        nums_values,
        nums_freqs,
        borders
    )

    nums_inter_disperse = get_interval_disperse(
        nums_values,
        nums_freqs,
        borders,
        nums_inter_means,
        nums_inter_freqs 
    )

    nums_size = np.sum(nums_inter_freqs)

    nums_mean = np.sum(nums_inter_means*nums_inter_freqs)/nums_size

    disperse_ingroup = np.sum(nums_inter_disperse*nums_inter_freqs)/nums_size
    disperse_betwgroup = np.sum((nums_inter_means - nums_mean)**2 *nums_inter_freqs)/nums_size
    return disperse_ingroup, disperse_betwgroup

### _Корреляционное отношение_

In [106]:
def corr_relation(nums_values: np.ndarray, nums_freqs: np.ndarray, bords: list):
    disperse_ingroup, disperse_betwgroup = get_interval_disperse_both(
        nums_values,
        nums_freqs,
        bords
    )

    return (disperse_betwgroup/(disperse_betwgroup+disperse_ingroup))**0.5


### _Коэффициент ранговой корреляции Спирмена_

In [107]:
def coef_spearman(nums_values: np.ndarray, nums_freqs: np.ndarray) -> float:
    nums_values_ranked = stats.rankdata(nums_values)
    nums_freqs_ranked = stats.rankdata(nums_freqs)
    rank_diff = nums_values_ranked-nums_freqs_ranked

    n = nums_values.size
    return 1-6*np.sum(rank_diff**2)/(n*(n**2 - 1))

## ***Ход работы***

In [108]:
delta_ages = 9
age_bords = get_interval_borders(min(ages), max(ages), delta_ages)
ages_size = len(ages)

ages_array = np.array(ages)
age_values, age_freqs = np.unique(ages_array, return_counts=True)


age_disperse_ingroup, age_disperse_betwgroup = get_interval_disperse_both(age_values, age_freqs, age_bords)
age_disperse = age_disperse_ingroup+age_disperse_betwgroup

print(f"D_in = {age_disperse_ingroup:.3f}")
print(f"D_betw = {age_disperse_betwgroup:.3f}")
print(f"D_in + D_betw = {age_disperse:.3f}")
print(f"D(ages) = {cent_moment_k(ages_array, 2):.3f}")

D_in = 6.140
D_betw = 138.777
D_in + D_betw = 144.917
D(ages) = 144.917


In [109]:
ages_corr_relation = corr_relation(age_values, age_freqs, age_bords)

print(f"eta = {ages_corr_relation:.3f}")

eta = 0.979


In [110]:
ages_spearman = coef_spearman(age_values, age_freqs)

print(f"R = {ages_spearman:.3f}")

R = -0.525
