In [79]:
import numpy as np
import pandas as pd
from scipy.stats import norm
from scipy.stats import f


In [81]:
# Read the wage sample and replace the `wage` column with its natural log,
# so everything derived from `data['wage']` is on the log scale.
data = pd.read_excel('wage.xlsx').assign(wage=lambda frame: np.log(frame['wage']))

# Split the log wages on the `male` indicator: rows where it equals 0 form the
# female sample; rows where it differs from 0 form the male sample.
female_data = data.loc[data['male'].eq(0), 'wage'].values
male_data = data.loc[data['male'].ne(0), 'wage'].values

### Calculate the basic statistics

In [84]:
# Per-group summary statistics of the log-wage samples.

# Number of observations in each group.
female_n, male_n = female_data.size, male_data.size

# Sample means.
female_mean, male_mean = female_data.mean(), male_data.mean()

# Unbiased sample variances (ddof=1 divides by n - 1).
female_var = female_data.var(ddof=1)
male_var = male_data.var(ddof=1)

### Robust test

In [87]:
# Pooled kurtosis estimate, with each group centred on its own sample mean:
#   kappa = N * sum((x - group_mean)^4) / (sum((x - group_mean)^2))^2,
# where the sums run over both groups and N = female_n + male_n.
part_1 = (np.sum((female_data - female_mean) ** 4) + np.sum((male_data - male_mean) ** 4)) * (female_n + male_n)
part_2 = (np.sum((female_data - female_mean) ** 2) + np.sum((male_data - male_mean) ** 2)) ** 2
kappa = part_1 / part_2

# Scaled difference of the log sample variances; it is positive when the
# female sample variance exceeds the male sample variance.
T = np.sqrt(female_n * male_n / (male_n + female_n)) * (np.log(female_var) - np.log(male_var))

# Studentise by sqrt(kappa - 1) and compare with the standard normal.
statistics = T / np.sqrt(kappa - 1)

# One-sided (upper-tail) p-value. `norm.sf` is the survival function 1 - cdf,
# evaluated directly so that it keeps precision for large statistics, where
# `1 - norm.cdf(x)` would round to exactly 0.
p_value = norm.sf(statistics)

print('The estimated kappa is:', kappa)
print('The robust statistics is:', statistics)
print('The robust p-value is:', p_value)


The estimated kappa is: 7.3781207090510135
The robust statistics is: 0.9353004940278131
The robust p-value is: 0.17481673039385381



### F test

In [90]:
# Variance-ratio statistic: female sample variance over male sample variance.
F_statistics = female_var / male_var

# Upper 5% critical value of the F distribution whose numerator and
# denominator degrees of freedom are each group's sample size minus one.
F_critical_value = f.ppf(
    1 - 0.05,
    dfn=female_n - 1,
    dfd=male_n - 1,
)

print('The F statistics is:', F_statistics)
print('The F critical value is:', F_critical_value)

The F statistics is: 1.0858707722186227
The F critical value is: 1.0844642019945046
