In [2]:
from zipfile import ZipFile
import os

# Define the path to the zip file and the extraction directory
zip_file_path = './dataverse_files.zip'
extraction_dir = './dataverse_files/'

# Create a directory to extract to
os.makedirs(extraction_dir, exist_ok=True)

# Extract the zip file
with ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extraction_dir)

# List the contents of the directory to see what was extracted
extracted_files = os.listdir(extraction_dir)
extracted_files


['galton.dta', 'galton-stata11.dta', 'readme.txt']

In [3]:
import pandas as pd

# Load the dataset from the .dta file
df_path = extraction_dir + 'galton.dta'
df = pd.read_stata(df_path)

# Display the first few rows of the DataFrame to understand its structure
df

Unnamed: 0,family,father,mother,gender,height,kids,male,female
0,1,78.5,67.0,M,73.199997,4,1.0,0.0
1,1,78.5,67.0,F,69.199997,4,0.0,1.0
2,1,78.5,67.0,F,69.000000,4,0.0,1.0
3,1,78.5,67.0,F,69.000000,4,0.0,1.0
4,2,75.5,66.5,M,73.500000,4,1.0,0.0
...,...,...,...,...,...,...,...,...
893,136A,68.5,65.0,M,68.500000,8,1.0,0.0
894,136A,68.5,65.0,M,67.699997,8,1.0,0.0
895,136A,68.5,65.0,F,64.000000,8,0.0,1.0
896,136A,68.5,65.0,F,63.500000,8,0.0,1.0


In [4]:
from sklearn.linear_model import LinearRegression

# Prepare the data
df['Gender'] = df['male'].apply(lambda x: 0 if x == 1 else 1)  # Convert 'male' column to 'Gender' with 0 for male, 1 for female
df.rename(columns={'height': 'Height'}, inplace=True)  # Ensure column names are consistent with the task description

# Separate the dataset by gender
df_male = df[df['Gender'] == 0]
df_female = df[df['Gender'] == 1]

In [5]:

# Task 1
# Fit linear regression models
lr_male = LinearRegression().fit(df_male[['father', 'mother']], df_male['Height'])
lr_female = LinearRegression().fit(df_female[['father', 'mother']], df_female['Height'])

# Output the coefficients and intercepts for each model
male_coefficients, male_intercept = lr_male.coef_, lr_male.intercept_
female_coefficients, female_intercept = lr_female.coef_, lr_female.intercept_

male_coefficients, male_intercept, female_coefficients, female_intercept


(array([0.4117501 , 0.33355007], dtype=float32),
 19.399895,
 array([0.40071717, 0.30745602], dtype=float32),
 16.610233)

In [6]:
# Task 2: Implementation
def counterfactual_inference(gender, f_prime, m_prime):
    return gender.predict([[f_prime, m_prime]])[0]

# Select the first family in dataset
f_prime, m_prime, height, gender = df.loc[0]['father'], df.loc[0]['mother'], df.loc[0]['Height'], df.loc[0]
h_prime = counterfactual_inference(lr_male, f_prime, m_prime)
h_cf = counterfactual_inference(lr_male, 76, 68)
h_prime, h_cf




(74.07013, 73.37430739402771)

In [7]:
# Task 3 (1)
print(df.loc[0], '\n', df.loc[577])
h_cf1 = counterfactual_inference(lr_male, 69, 64) # male is 0
h_cf2 = counterfactual_inference(lr_male, 69, 64) # male is 0
print(f'{h_cf1} {h_cf2}')
print(f'Potential outcome is {'same' if h_cf1 == h_cf2 else 'different'}.')

family            1
father         78.5
mother         67.0
gender            M
Height    73.199997
kids              4
male            1.0
female          0.0
Gender            0
Name: 0, dtype: object 
 family     134
father    68.0
mother    65.0
gender       M
Height    72.0
kids         4
male       1.0
female     0.0
Gender       0
Name: 577, dtype: object
69.15785637497902 69.15785637497902
Potential outcome is same.




In [8]:
# Task 3 (2)
h_cf1 = counterfactual_inference(lr_male, 69, 64) # male is 0
h_cf2 = counterfactual_inference(lr_male, 69, 64) # male is 0
print(f'Height prediction: {h_cf2}')

Height prediction: 69.15785637497902




Prediction is same as potential outcome. Because of modularity, 'gender', 'father', and 'mother' are the only causes of 'height', given 'gender', counterfactual outcome (do operation) of 'father' and 'mother' can be transformed to the prediction (or posterior).

$P(X|do(Y=y)) = P(X|Y=y)$