# Производим импорт необходимых библиотек и считывание из них данных

In [2]:
import warnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', 100)

DATA_PATH = './data/'

df = pd.read_csv(DATA_PATH + 'df.csv', index_col="Unnamed: 0")
weighted_df = pd.read_csv(DATA_PATH + 'weighted_df.csv', index_col="Unnamed: 0")

print("df shape:", df.shape)
print("weighted df shape:", weighted_df.shape)

df shape: (2000, 74)
weighted df shape: (701174, 74)


# 1. Проверим, равна ли средняя зарплата по выборке статистически равна средней номинальной начисленной зарплате (815.25)

Рассмотрев невзвешенную и взвешенной выборки, средние значения зарплат не равны 815.25 (уровень значимости теста p-value = 0). То есть можно отклонить нулевую гипотезу.

Проверим результаты теста:
    по невзвешенной выборке среднее значение зарплаты = 446 рубля
    по взвешенной выборке среднее значение зарплаты = 465 рубля

In [3]:
df_nonzero_salary = df[df["ppinc_1"] != 0]
weighted_df_nonzero_salary = weighted_df[weighted_df["ppinc_1"] != 0]

print("Non-weighted df, t-test на равенство среднего 815.25 рублям:", stats.ttest_1samp(
    df_nonzero_salary["ppinc_1"],
    815.25)
)
print("ppinc_1 mean (non-weighted):", df_nonzero_salary["ppinc_1"].mean())


print("\nWeighted df, t-test на равенство среднего 815.25 рублям:",stats.ttest_1samp(
    weighted_df_nonzero_salary["ppinc_1"],
    815.25)
)
print("ppinc_1 mean (weighted):", weighted_df_nonzero_salary["ppinc_1"].mean())

Non-weighted df, t-test на равенство среднего 815.25 рублям: Ttest_1sampResult(statistic=-47.43298848103542, pvalue=3.2931947277523885e-276)
ppinc_1 mean (non-weighted): 446.6225421229997

Weighted df, t-test на равенство среднего 815.25 рублям: Ttest_1sampResult(statistic=-784.0938921412557, pvalue=0.0)
ppinc_1 mean (weighted): 465.24533324448015


# 2. Проверка двухвыборочных критериев

Для этого необходимо провести двухвыборочный t-test и levene-тест для выборок мужчин и женщин с ненулевой зарплатой
Для всех критериев p-value = 0
Проверим результаты тестов:

    по невзвешенной выборке среднее значение зарплат у мужчин = 503 рубля
    по невзвешенной выборке среднее значение зарплат у женщин = 394 рублей

    по невзвешенной выборке стандартное отклонение зарплат мужчин = 287
    по невзвешенной выборке стандартное отклонение зарплат женщин = 237

    по взвешенной выборке среднее значение зарплаты мужчин = 488 рублей
    по взвешенной выборке среднее значение зарплаты мужчин = 443 рублей

    по взвешенной выборке стандартное отклонение зарплат мужчин = 295
    по взвешенной выборке стандартное отклонение зарплат женщин = 292

Заметим, что в нашей выборке нет равенства средних и дисперсий относительно мужчин и женщин.

In [5]:
print("t-test на равенство средних:", stats.ttest_ind(
    df_nonzero_salary[df_nonzero_salary["sex"] == "Male"]["ppinc_1"],
    df_nonzero_salary[df_nonzero_salary["sex"] == "Female"]["ppinc_1"])
)

print("levene критерий на равенство дисперсий:", stats.levene(
    df_nonzero_salary[df_nonzero_salary["sex"] == "Male"]["ppinc_1"],
    df_nonzero_salary[df_nonzero_salary["sex"] == "Female"]["ppinc_1"])
)

print("\n\tMale ppinc_1 mean (non-weighted):", df_nonzero_salary[df_nonzero_salary["sex"] == "Male"]["ppinc_1"].mean())
print("\tFemale ppinc_1 mean (non-weighted):", df_nonzero_salary[df_nonzero_salary["sex"] == "Female"]["ppinc_1"].mean())
print("\n\tMale ppinc_1 std (non-weighted):", df_nonzero_salary[df_nonzero_salary["sex"] == "Male"]["ppinc_1"].std())
print("\tFemale ppinc_1 std (non-weighted):", df_nonzero_salary[df_nonzero_salary["sex"] == "Female"]["ppinc_1"].std())

print("\nWeighted df:")
print("t-test на равенство средних:", stats.ttest_ind(
    weighted_df_nonzero_salary[weighted_df_nonzero_salary["sex"] == "Male"]["ppinc_1"],
    weighted_df_nonzero_salary[weighted_df_nonzero_salary["sex"] == "Female"]["ppinc_1"])
)

print("levene критерий на равенство дисперсий:", stats.levene(
    weighted_df_nonzero_salary[weighted_df_nonzero_salary["sex"] == "Male"]["ppinc_1"],
    weighted_df_nonzero_salary[weighted_df_nonzero_salary["sex"] == "Female"]["ppinc_1"])
)

print("\n\tMale ppinc_1 mean (weighted):", weighted_df_nonzero_salary[weighted_df_nonzero_salary["sex"] == "Male"]["ppinc_1"].mean())
print("\tFemale ppinc_1 mean (weighted):", weighted_df_nonzero_salary[weighted_df_nonzero_salary["sex"] == "Female"]["ppinc_1"].mean())
print("\n\tMale ppinc_1 std (weighted):", weighted_df_nonzero_salary[weighted_df_nonzero_salary["sex"] == "Male"]["ppinc_1"].std())
print("\tFemale ppinc_1 std (weighted):", weighted_df_nonzero_salary[weighted_df_nonzero_salary["sex"] == "Female"]["ppinc_1"].std())

t-test на равенство средних: Ttest_indResult(statistic=7.12281798837139, pvalue=1.830580794908817e-12)
levene критерий на равенство дисперсий: LeveneResult(statistic=13.677870457776738, pvalue=0.00022693345705938122)

	Male ppinc_1 mean (non-weighted): 503.5222138643062
	Female ppinc_1 mean (non-weighted): 394.9371489817789

	Male ppinc_1 std (non-weighted): 287.0626571431568
	Female ppinc_1 std (non-weighted): 237.6020438449815

Weighted df:
t-test на равенство средних: Ttest_indResult(statistic=50.10301978139058, pvalue=0.0)
levene критерий на равенство дисперсий: LeveneResult(statistic=1404.1225478080203, pvalue=8.287542945535823e-307)

	Male ppinc_1 mean (weighted): 488.2484420552434
	Female ppinc_1 mean (weighted): 443.6254048944067

	Male ppinc_1 std (weighted): 295.29740708501106
	Female ppinc_1 std (weighted): 291.86595472901007


# 3. Создание категориальных переменных

In [8]:
df["age_categorical"] = df["age"].apply(lambda x : "18-24" if (x >= 18 and x <= 24)
                                                else "25-34" if (x >= 25 and x <= 34)
                                                else "35-44" if (x >= 35 and x <= 44)
                                                else "45-54" if (x >= 45 and x <= 54)
                                                else "55-64" if (x >= 55 and x <= 64)
                                                else pd.NA)
df["salary_categorical"] = df["ppinc_1"].apply(lambda x : pd.NA if (x == 0)
                                                      else "0-400" if (x > 0 and x < 400)
                                                      else "400-500" if (x >= 400 and x < 500)
                                                      else "500-700" if (x >= 500 and x < 700)
                                                      else "700-1000" if (x >= 700 and x < 1000)
                                                      else ">= 1000")
df

Unnamed: 0,n_uhc,n_upc,year,yweight,resid,region,age,sex,ppinc_1,nummonth,educat,healthev,weight,height,sport,smoker,htype,hsize,ch0_5,ch6_12,ch13_17,elder,inc_1,inc_2,inc_3,inc_4,inc_5,inc_6,inc_7,inc_8,inc_9,inc_10,inc_11,cashinc,inkind,privlg,totalinc,exp_1,exp_2,exp_3,exp_4,exp_5,exp_6,exp_6_1,exp_6_2,exp_7,exp_8,exp_9,exp_10,exp_11,exp_12,exp_13,exp_14,exp_15,exp_16,exp_17,exp_17_1,exp_18,exp_19,exp_20,exp_21,exp_22,exp_23,exp_24,exp_25,exp_26,totalexp,hh_blint,hh_int1,hh_int2,hh_int3,hh_int4,bmi,savings,age_categorical,salary_categorical
9460,5251.0,1.0,2017.0,535,Rural,Minsk oblast,42.0,Male,759.468333,12.0,Secondary education,Good,83.0,178.0,No,Yes,2 and more adults with children,3.0,1.0,0.0,0.0,0.0,791.135000,3.250000,0.000,0.000000,268.578333,0.000000,0.0,18.621667,6.916667,0.0,0.000000,1088.501667,57.516407,0.000000,1146.018074,374.127917,41.539583,15.251667,6.382500,0.000000,53.702500,0.0,53.702500,18.576667,135.416667,11.240000,0.491667,0.000000,0.000000,37.480833,9.710833,0.000000,0.000000,20.497917,20.497917,36.109167,1.395833,1.379167,264.333333,0.00,0.000000,7.380000,0.000000,0.0,1035.016250,1.0,1.0,1.0,1.0,1.0,26.196187,111.001824,35-44,700-1000
5545,3031.0,1.0,2017.0,544,Small city,Grodno oblast,73.0,Female,0.000000,0.0,"Higher education, After Higher education",Bad,70.0,168.0,No,No,Lone adult older working age,1.0,0.0,0.0,0.0,1.0,0.000000,0.000000,0.000,416.653333,0.000000,59.708333,0.0,58.333333,0.000000,0.0,0.000000,534.695000,30.134244,5.556667,570.385910,248.321667,3.656250,15.914167,12.666667,0.000000,41.196667,0.0,26.853333,32.473333,0.000000,18.674167,2.166667,0.000000,0.000000,10.165000,25.096667,0.000000,0.000000,64.166667,64.166667,4.390000,100.500000,0.000000,0.000000,0.00,0.000000,0.000000,46.166667,0.0,625.554583,1.0,1.0,1.0,1.0,1.0,24.801587,-55.168673,,
297,162.0,1.0,2017.0,463,Large city,Brest oblast,56.0,Male,396.034167,12.0,Secondary education,"Not very good, but not bad",83.0,164.0,No,No,Household without children,2.0,0.0,0.0,0.0,1.0,627.658333,0.000000,5.655,326.636667,0.000000,0.000000,0.0,25.000000,0.000000,0.0,0.000000,984.950000,0.000000,1.183333,986.133333,332.752500,0.000000,18.015833,12.509167,0.000000,43.336667,0.0,42.525833,28.556667,0.000000,35.863333,60.962500,0.000000,0.000000,46.181667,13.168333,0.000000,0.000000,0.000000,0.000000,12.888333,7.904167,0.000000,0.000000,0.00,0.000000,0.000000,320.250000,0.0,932.389167,1.0,1.0,1.0,1.0,1.0,30.859607,53.744167,55-64,0-400
6638,3640.0,3.0,2017.0,2056,Minsk city,Minsk city,23.0,Male,795.587500,12.0,"Higher education, After Higher education",Good,75.0,181.0,"Yes, in free time",No,Household without children,3.0,0.0,0.0,0.0,1.0,1646.670833,0.000000,0.000,317.960000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,1964.630833,15.771167,52.630833,2033.032833,399.434583,52.211250,54.430833,64.367500,1.943333,57.534167,0.0,52.117500,46.004167,5.550000,21.086667,92.106667,0.000000,0.000000,69.598333,95.849167,0.000000,0.000000,83.307500,49.974167,26.252500,11.021667,0.000000,787.916667,19.50,0.000000,0.000000,26.000000,0.0,1914.115000,1.0,1.0,1.0,1.0,1.0,22.893074,118.917833,18-24,700-1000
10072,5581.0,5.0,2017.0,3008,Large city,Mogilev oblast,73.0,Female,0.000000,0.0,Vocational school,"Not very good, but not bad",110.0,165.0,No,No,2 and more adults with children,5.0,1.0,1.0,0.0,1.0,869.436667,3.833333,0.000,317.911667,89.593333,0.000000,0.0,45.833333,0.000000,0.0,0.000000,1326.608333,93.067562,24.137500,1443.813395,514.836250,68.897917,42.862500,22.541667,0.000000,126.276667,0.0,122.186667,32.430000,11.930000,57.556667,34.632500,101.816667,0.000000,26.518333,126.631667,8.048333,30.601667,4.647500,0.000000,15.283333,53.905833,19.675833,0.000000,0.00,0.000000,62.500000,6.950000,0.0,1368.543333,1.0,1.0,1.0,1.0,1.0,40.404040,75.270062,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1922,1046.0,2.0,2017.0,530,Large city,Vitebsk oblast,53.0,Female,353.595833,12.0,Secondary specialized education,Good,64.0,164.0,"Yes, in free time",No,Household without children,2.0,0.0,0.0,0.0,0.0,703.632500,0.000000,0.000,310.117500,0.000000,0.000000,0.0,0.000000,0.000000,250.0,0.000000,1263.750000,45.675733,12.730000,1322.155733,358.382917,31.527500,104.885000,40.980000,0.000000,71.711667,0.0,34.693333,26.896667,0.000000,15.490000,31.801667,1.733333,0.000000,14.663333,2.176667,0.000000,0.000000,0.000000,0.000000,17.775833,8.841667,7.386667,260.166667,41.80,7.198333,68.199167,48.333333,0.0,1159.950417,1.0,1.0,1.0,1.0,1.0,23.795360,162.205317,45-54,0-400
2741,1528.0,2.0,2017.0,410,Rural,Vitebsk oblast,56.0,Female,395.384167,12.0,Secondary specialized education,"Not very good, but not bad",66.0,158.0,"Yes, in free time",No,All adults older working age,2.0,0.0,0.0,0.0,2.0,686.232500,0.000000,0.000,505.500000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,1191.732500,83.862867,0.000000,1275.595367,474.447917,0.000000,108.329167,31.125000,0.000000,63.041667,0.0,56.473333,108.741667,0.000000,22.397500,0.000000,108.675000,0.000000,49.302500,43.135000,0.000000,0.000000,0.000000,0.000000,15.187500,13.000000,4.416667,0.000000,0.00,0.000000,0.000000,120.833333,0.0,1162.632917,1.0,1.0,1.0,1.0,1.0,26.438071,112.962450,55-64,0-400
10673,5922.0,1.0,2017.0,419,Small city,Mogilev oblast,62.0,Male,0.000000,0.0,Vocational school,"Not very good, but not bad",70.0,175.0,No,Yes,All adults older working age,2.0,0.0,0.0,0.0,2.0,0.000000,0.000000,0.000,521.255000,0.000000,0.000000,0.0,126.000000,0.000000,0.0,15.093333,662.348333,45.930052,1.750000,710.028385,202.039167,75.395833,29.447500,19.577500,0.000000,77.950833,0.0,54.615000,22.245833,54.166667,71.114167,4.640000,1.250000,0.000000,11.619167,9.350000,0.000000,0.000000,0.000000,0.000000,7.135833,14.004167,6.669167,50.000000,23.75,0.000000,0.000000,55.000000,0.0,735.355833,1.0,1.0,1.0,1.0,1.0,22.857143,-25.327448,55-64,
2426,1343.0,1.0,2017.0,566,Large city,Vitebsk oblast,68.0,Female,0.000000,0.0,"Higher education, After Higher education","Not very good, but not bad",120.0,168.0,"Yes, in free time",No,Lone adult older working age,1.0,0.0,0.0,0.0,1.0,0.000000,0.000000,0.000,291.340833,0.000000,0.000000,0.0,25.000000,0.000000,0.0,0.000000,316.340833,20.445983,0.000000,336.786817,114.715417,5.952917,7.416667,0.000000,0.000000,71.629167,0.0,67.129167,2.283333,0.000000,21.850000,0.000000,0.000000,0.000000,14.412500,15.226667,0.000000,0.000000,0.000000,0.000000,1.425000,3.250000,1.750000,0.000000,0.00,6.966667,0.000000,12.500000,0.0,279.378333,1.0,1.0,1.0,1.0,1.0,42.517007,57.408483,,


# 4. Построение таблиц сопряженности

## Пол и интервалы зарплат

In [9]:
sex_salary_crosstab = pd.crosstab(df["sex"], df["salary_categorical"])
sex_salary_crosstab

salary_categorical,0-400,400-500,500-700,700-1000,>= 1000
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Female,352,116,101,41,12
Male,229,92,134,81,29


## Возраст и интервалы зарплат

In [13]:
age_salary_crosstab = pd.crosstab(df["age_categorical"], df["salary_categorical"])
age_salary_crosstab

salary_categorical,0-400,400-500,500-700,700-1000,>= 1000
age_categorical,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
18-24,26,7,9,2,1
25-34,108,39,38,29,11
35-44,109,46,68,39,13
45-54,167,75,65,28,11
55-64,144,34,53,22,5


## Уровень образования и интервалы зарплат 

In [11]:
education_salary_crosstab = pd.crosstab(df["educat"], df["salary_categorical"])
education_salary_crosstab

salary_categorical,0-400,400-500,500-700,700-1000,>= 1000
educat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Basic education,6,1,1,1,0
"Higher education, After Higher education",108,52,77,61,30
Secondary education,137,39,41,10,4
Secondary specialized education,223,82,80,40,4
Vocational school,107,34,36,10,3


##  Уровень образования и частота занятий спортом

In [14]:
education_sport_crosstab = pd.crosstab(df["educat"], df["sport"])
education_sport_crosstab

sport,No,"Yes, in establishment of education","Yes, in free time","Yes, on professional basis"
educat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Basic education,47,1,3,0
"Higher education, After Higher education",334,0,189,6
"Primary education,Dont Educat",21,0,1,0
Secondary education,317,6,53,2
Secondary specialized education,506,1,161,2
Vocational school,253,0,40,0


## Двухмерная таблица для пола (sex) и бинарной переменной, характеризующей наличие привычки курения (smoker)

In [15]:
sex_smoker_crosstab = pd.crosstab(df["sex"], df["smoker"])
sex_smoker_crosstab

smoker,No,Yes
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,1052,90
Male,440,361


# 5. Проверка на равенство распределния частот для категориальных признаков с помощью критерия $\chi^2$

In [16]:
sex_salary_chi2 = stats.chi2_contingency(sex_salary_crosstab)
print("Hypothesis: Frequences of salaries are the same distributed among sexes")
print("Chi-2 statistic:", sex_salary_chi2[0], "p-value:", sex_salary_chi2[1])
sex_salary_crosstab

Hypothesis: Frequences of salaries are the same distributed among sexes
Chi-2 statistic: 50.986814951753914 p-value: 2.2464234677278504e-10


salary_categorical,0-400,400-500,500-700,700-1000,>= 1000
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Female,352,116,101,41,12
Male,229,92,134,81,29


In [17]:
sex_education_crosstab = pd.crosstab(df["sex"], df["educat"])
sex_education_chi2 = stats.chi2_contingency(sex_education_crosstab)
print("Hypothesis: Frequences of education lever are the same distributed among sexes")
print("Chi-2 statistic:", sex_education_chi2[0], "p-value:", sex_education_chi2[1])
sex_education_crosstab

Hypothesis: Frequences of education lever are the same distributed among sexes
Chi-2 statistic: 57.58408891170687 p-value: 3.832390361710209e-11


educat,Basic education,"Higher education, After Higher education","Primary education,Dont Educat",Secondary education,Secondary specialized education,Vocational school
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Female,32,355,18,205,437,128
Male,20,187,5,183,256,173


In [18]:
sex_smoker_chi2 = stats.chi2_contingency(sex_smoker_crosstab)
print("Hypothesis: Frequences of smokers are the same distributed among sexes")
print("Chi-2 statistic:", sex_smoker_chi2[0], "p-value:", sex_smoker_chi2[1])
sex_smoker_crosstab

Hypothesis: Frequences of smokers are the same distributed among sexes
Chi-2 statistic: 363.19660736842536 p-value: 5.669442070807175e-81


smoker,No,Yes
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,1052,90
Male,440,361


In [19]:
sex_smoker_crosstab_higher_educat = pd.crosstab(df[df["educat"] == "Higher education, After Higher education"]["sex"], df[df["educat"] == "Higher education, After Higher education"]["smoker"])
sex_smoker_chi2_higher_educat = stats.chi2_contingency(sex_smoker_crosstab_higher_educat)
print("Hypothesis: Frequences of smokers are the same distributed among sexes for highly educated people")
print("Chi-2 statistic:", sex_smoker_chi2_higher_educat[0], "p-value:", sex_smoker_chi2_higher_educat[1])
sex_smoker_crosstab_higher_educat

Hypothesis: Frequences of smokers are the same distributed among sexes for highly educated people
Chi-2 statistic: 63.70382615013224 p-value: 1.4460345803607862e-15


smoker,No,Yes
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,329,16
Male,129,55


In [20]:
secondary_education = ["Secondary education", "Secondary specialized education", "Vocational school"]
sex_smoker_crosstab_secondary_educat = pd.crosstab(df[df["educat"].isin(secondary_education)]["sex"], df[df["educat"].isin(secondary_education)]["smoker"])
sex_smoker_chi2_secondary_educat = stats.chi2_contingency(sex_smoker_crosstab_secondary_educat)
print("Hypothesis: Frequences of smokers are the same distributed among sexes for people with secondary")
print("Chi-2 statistic:", sex_smoker_chi2_secondary_educat[0], "p-value:", sex_smoker_chi2_secondary_educat[1])
sex_smoker_crosstab_secondary_educat

Hypothesis: Frequences of smokers are the same distributed among sexes for people with secondary
Chi-2 statistic: 268.5820871037936 p-value: 2.3117461753279572e-60


smoker,No,Yes
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,674,74
Male,294,299


In [21]:
poor_education = ["Basic education", "Primary education,Dont Educat"]
sex_smoker_crosstab_poor_educat = pd.crosstab(df[df["educat"].isin(secondary_education)]["sex"], df[df["educat"].isin(secondary_education)]["smoker"])
sex_smoker_chi2_poor_educat = stats.chi2_contingency(sex_smoker_crosstab_poor_educat)
print("Hypothesis: Frequences of smokers are the same distributed among sexes for people with secondary")
print("Chi-2 statistic:", sex_smoker_chi2_poor_educat[0], "p-value:", sex_smoker_chi2_poor_educat[1])
sex_smoker_crosstab_poor_educat

Hypothesis: Frequences of smokers are the same distributed among sexes for people with secondary
Chi-2 statistic: 268.5820871037936 p-value: 2.3117461753279572e-60


smoker,No,Yes
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,674,74
Male,294,299


# 6. Однофакторный дисперсионный анализ (one-way ANOVA)

Нулевая гипотеза: средние зарплаты по регионам (включая Минск) равны.
p-value = 0, следовательно, мы отклоняем нулевую гипотезу

In [22]:
stats.f_oneway(*[df[df["region"] == x]["ppinc_1"] for x in np.unique(df["region"])])

F_onewayResult(statistic=5.326538676139645, pvalue=1.832132202097321e-05)

Нулевая гипотеза: средние зарплаты по регионам (включая Минск) равны.
p-value = 0.99, следовательно, мы не можем отклонить нулевую гипотезу

In [23]:
stats.f_oneway(*[df[df["region"] == x]["ppinc_1"] for x in np.unique(df["region"]) if x != "Minsk city"])

F_onewayResult(statistic=0.0808037516321363, pvalue=0.995209587002555)

# 7. Многомерная Т-статистика Хотеллинга¶

In [24]:
def t2_test_2samp(X, Y):
    nx, p1 = X.shape
    ny, p2 = Y.shape 
    if (p1 != p2):
        raise ValueError(f"X and Y must have same number of columns, but X has {p1} columns and Y has {p2} columns") 
    p = p1
    delta = np.mean(X, axis=0) - np.mean(Y, axis=0)
    Sx = np.cov(X, rowvar=False)
    Sy = np.cov(Y, rowvar=False)
    S_pooled = ((nx-1)*Sx + (ny-1)*Sy)/(nx+ny-2)
    t_squared = (nx*ny)/(nx+ny) * np.matmul(np.matmul(delta.transpose(), np.linalg.inv(S_pooled)), delta)
    t2_stat = t_squared * (nx+ny-p-1)/(p*(nx+ny-2))
    F = stats.f(p, nx+ny-p-1)
    p_value = 1 - F.cdf(t2_stat)
    return t2_stat, p_value

Нулевая гипотеза: домохозяйства из двух произвольных регионов (Минск и Минская область) имеют одинаковые средние значений переменных cashinc, inkind, privlg
p-value = 0, следовательно, мы отклоняем нулевую гипотезу

In [25]:
t2_stat, p = t2_test_2samp(df[df["region"] == "Minsk city"][["cashinc", "inkind", "privlg"]].dropna(), df[df["region"] == "Minsk oblast"][["cashinc", "inkind", "privlg"]].dropna())
print(f"Test statistic: {t2_stat}\np-value: {p}")

Test statistic: 53.46756829200893
p-value: 1.1102230246251565e-16


Нулевая гипотеза: домохозяйства из двух произвольных регионов (Минская и Минская области) имеют одинаковые средние значений переменных cashinc, inkind, privlg
p-value = 0, следовательно, мы отклоняем нулевую гипотезу

In [26]:
t2_stat, p = t2_test_2samp(df[df["region"] == "Minsk city"][["cashinc", "inkind", "privlg"]].dropna(), df[df["region"] == "Minsk oblast"][["cashinc", "inkind", "privlg"]].dropna())
print(f"Test statistic: {t2_stat}\np-value: {p}")

Test statistic: 53.46756829200893
p-value: 1.1102230246251565e-16
