In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

In [75]:
# Load the first sample from a CSV file located relative to the notebook's working directory.
sample1 = pd.read_csv("first_sample.csv")
# Last expression of the cell: the number of rows loaded is displayed as the cell output.
len(sample1)

8984

In [78]:
def preprocess_data(data: pd.DataFrame, start_year: int, t_limit, t_col: str = "HOURS_PER_WEEK_COMPUTER") -> pd.DataFrame:
    """
    Remove unusable rows and build the binary treatment indicator for a trial.

    Steps:
      1. Keep only rows in which *every* column is strictly positive
         (presumably non-positive values encode missing answers in the
         survey -- confirm against the data dictionary; note that genuine
         zeros are dropped as well).
      2. Keep only rows whose baseline value ``{t_col}_{start_year}`` is
         below ``t_limit`` (i.e. not yet treated in the baseline year).
      3. Replace the follow-up column ``{t_col}_{start_year + 1}`` with 1
         when its value is >= ``t_limit`` and 0 otherwise.

    :data: complete dataframe; it is NOT modified, a new frame is returned
    :start_year: baseline year used to build the column names
    :t_limit: threshold at or above which a unit counts as treated
    :t_col: column-name prefix of the treatment variable
    :return: filtered copy of ``data`` with the follow-up column binarized (int64)
    :raises KeyError: if either treatment column is missing from ``data``
    """
    baseline_col = f"{t_col}_{start_year}"
    followup_col = f"{t_col}_{start_year + 1}"

    keep = (data > 0).all(axis=1) & (data[baseline_col] < t_limit)

    # Take an explicit copy: assigning a column onto a filtered slice is
    # chained assignment and triggers pandas' SettingWithCopyWarning.
    result = data.loc[keep].copy()

    # Vectorised equivalent of apply(lambda x: int(x >= t_limit)).
    result[followup_col] = (result[followup_col] >= t_limit).astype("int64")
    return result

In [82]:
# Preprocess the sample with 2010 as the baseline year and 6 as the treatment threshold.
processed_sample1 = preprocess_data(sample1, 2010, 6)
# NOTE(review): this bare expression is not the last line of the cell, so its value is not displayed.
len(processed_sample1)
# Displayed output: how many remaining rows are untreated (0) vs. treated (1) in 2011.
processed_sample1["HOURS_PER_WEEK_COMPUTER_2011"].value_counts()

0    395
1    128
Name: HOURS_PER_WEEK_COMPUTER_2011, dtype: int64

In [90]:
def transform_data(data: pd.DataFrame, t_cols, y_cols, t_limit):
    """
    Estimate propensity scores and split the sample by treatment status.

    All columns that are neither treatment nor outcome columns are used as
    covariates: they are one-hot encoded, min-max scaled to [0, 1] and fed to
    a logistic regression that predicts the treatment column ``t_cols[1]``.
    The predicted probability of the second class is stored in a new
    ``'propensity'`` column.

    :data: complete dataframe; it is NOT modified, the returned frames are
           built from a copy
    :t_cols: columns of treatment in order, for example
             (COMPUTER_USAGE_2009, COMPUTER_USAGE_2010); ``t_cols[1]`` must
             already be a binary 0/1 indicator
    :y_cols: columns of result in order, for example
             (HOURS_SLEPT_2009, HOURS_SLEPT_2010)
    :t_limit: number at which we put the bound for having received treatment
              or not; currently unused here because the treatment column is
              expected to be binarized beforehand
    :return: tuple ``(data_0, data_1)`` -- the rows with treatment 0 and the
             rows with treatment 1, each including the ``'propensity'`` column
    """
    # Work on a copy so the caller's frame does not silently gain a
    # 'propensity' column (and no SettingWithCopyWarning is raised when the
    # input is itself a filtered slice).
    data = data.copy()

    # Exclude a previously computed 'propensity' column as well; otherwise
    # re-running on an already transformed frame would feed the old score
    # back into the model as a covariate and change the result.
    cols = [
        c for c in data.columns
        if c not in t_cols and c not in y_cols and c != 'propensity'
    ]
    X = pd.get_dummies(data[cols])
    X_normalized = MinMaxScaler().fit_transform(X)

    t = data[t_cols[1]]

    clf = LogisticRegression(random_state=0).fit(X_normalized, t)
    # Column 1 of predict_proba belongs to clf.classes_[1], i.e. the treated
    # class when the treatment is coded 0/1.
    data['propensity'] = clf.predict_proba(X_normalized)[:, 1]

    data_1 = data[data[t_cols[1]] == 1]
    data_0 = data[data[t_cols[1]] == 0]
    return data_0, data_1

In [91]:
# Compute propensity scores and split the preprocessed sample into
# untreated (sample1_0) and treated (sample1_1) rows for 2011.
sample1_0, sample1_1 = transform_data(processed_sample1, 
                                     ["HOURS_PER_WEEK_COMPUTER_2010", "HOURS_PER_WEEK_COMPUTER_2011"],
                                     ["HOURS_PER_NIGHT_SLEEP_2010", "HOURS_PER_NIGHT_SLEEP_2011"],
                                     6)
# NOTE(review): this displays both complete frames as a tuple; a .head() or .shape summary would be lighter.
sample1_0, sample1_1

(      PUBID  GENDER  ENROLL_STAT_2010  HH_SIZE_2010  HIGHEST_DEGREE_2010  \
 4         5       1                 3             3                    2   
 21       22       1                 4             5                    2   
 50       51       1                 4             2                    2   
 51       52       1                 9             7                    2   
 60       61       2                 5             5                    3   
 ...     ...     ...               ...           ...                  ...   
 8861   8899       2                 4             4                    2   
 8875   8913       1                 3             3                    2   
 8878   8916       2                 3             3                    2   
 8916   8954       2                 2             5                    1   
 8963   9002       2                 4             6                    2   
 
       MARITAL_STAT_2010  BIO_CHILD_HH_2010  URBAN-RURAL_2010  INCOME_2010

In [None]:
def calc_IPW_ATT(data_0, data_1, y_col):
    """
    Inverse-propensity-weighted estimate of the effect on the treated (ATT).

    Each row of ``data_0`` is weighted by ``p / (1 - p)`` where ``p`` is its
    ``'propensity'`` value; the weighted average of ``y_col`` over ``data_0``
    is subtracted from the plain mean of ``y_col`` over ``data_1``.

    :data_0: frame with columns ``y_col`` and ``'propensity'``
    :data_1: frame with column ``y_col``
    :y_col: name of the outcome column
    :return: mean outcome in ``data_1`` minus the weighted mean outcome in ``data_0``
    """
    outcome_0 = data_0[y_col]
    score_0 = data_0['propensity']
    complement_0 = 1 - score_0

    # Numerator and denominator of the weighted control mean.
    weighted_outcome_total = sum(outcome_0 * score_0 / complement_0)
    weight_total = sum(score_0 / complement_0)

    treated_mean = data_1[y_col].mean()
    return treated_mean - weighted_outcome_total / weight_total