In [None]:
import numpy as np
import pandas as pd
import itertools
import matplotlib.pyplot as plt
import json
from collections import Counter
from scipy import stats
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from itertools import chain
import seaborn as sns
import ray
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from indirect_utils import (
    DispatchEstimator,
    fullspace,
    generate_x_y,
    get_delta,
    logodds,
    stratified_permute,
    tologodds,
    trimmed,
    identity,
)
import random
import math
from tqdm import tqdm

%matplotlib inline

In [None]:
# Per-residence metadata table. read_data() joins its "residence"/"elevation"
# columns onto each dataset, and the prediction grid `full` joins all of its columns.
residence_info = pd.read_csv("data/residence_info.csv")


def read_data(filename):
    """
    Load one dataset CSV and attach the elevation of each row's residence.

    == Params ==
    - filename: path of the CSV to read; it must provide the columns
      "residence", "year_of_birth", "type" and "sex" used below

    == Returns ==
    DataFrame left-joined with residence_info[["residence", "elevation"]] on
    "residence", sorted by year_of_birth / type / sex, with a fresh 0..n-1 index.
    """
    raw = pd.read_csv(filename)
    elevation_by_residence = residence_info[["residence", "elevation"]]
    # Left join keeps every row of the dataset even if its residence is
    # missing from residence_info (elevation is then NaN).
    with_elevation = raw.merge(elevation_by_residence, on="residence", how="left")
    ordered = with_elevation.sort_values(["year_of_birth", "type", "sex"])
    return ordered.reset_index(drop=True)


# Load the two datasets (each gets an "elevation" column via read_data).
# The Cyrillic column "русский" is renamed to "russian", which is the target
# name that russian_to_target[True] refers to.
data_ITM = read_data("data/ITM.csv")
data_russian = read_data("data/russian.csv").rename(columns={"русский": "russian"})

In [None]:
# Prediction grid: fullspace() over type / sex / residence / year_of_birth of the
# ITM data (presumably every combination of the observed values -- confirm in
# indirect_utils), joined with all residence_info columns and then sorted.
full = (
    fullspace(data_ITM, ["type", "sex", "residence", "year_of_birth"])
    .merge(residence_info, on="residence", how="left")
    .sort_values(["year_of_birth", "type", "sex", "residence"])
)

In [None]:
# Maps the boolean `russian` flag used throughout the notebook to the name of
# the target column: True -> "russian", False -> "number of lang" (the ITM target).
russian_to_target = {True: "russian", False: "number of lang"}

In [None]:
# Start (or reuse) the local Ray runtime. ignore_reinit_error makes this cell
# safe to re-run on a live kernel instead of raising because Ray is already
# initialised.
ray.init(ignore_reinit_error=True)

In [None]:
def bootstrap_data(data):
    """
    Draw a bootstrap resample of `data`, stratified by the "type" column.

    Every "type" group is resampled with replacement to its original size, so
    the number of rows per type is preserved.

    == Params ==
    - data: DataFrame containing at least the columns "type" and "year_of_birth"

    == Returns ==
    New DataFrame with the same columns and number of rows as `data`, sorted by
    year_of_birth / type, with a fresh 0..n-1 index.
    """
    if data.empty:
        # Nothing to resample; some pandas versions raise on GroupBy.sample
        # with no groups, so short-circuit with an empty copy.
        return data.reset_index(drop=True)

    # GroupBy.sample keeps the grouping column in the result. The previous
    # groupby(...).apply(lambda x: x.sample(...)) relied on the grouping column
    # being passed into apply, which newer pandas deprecates/removes and which
    # would drop "type" from the output.
    resampled = data.groupby("type").sample(frac=1, replace=True)
    return resampled.sort_values(["year_of_birth", "type"]).reset_index(drop=True)

In [None]:
@ray.remote
def predict_de(
    data,
    prediction_space,
    estimator,
    ct,
    russian,
    permute,
    permute_strats=6,
    delta=0,
    seed=None,
    bootstrap=False,
):
    """
    Fit a DispatchEstimator pipeline on `data` and predict on `prediction_space`.

    == Params ==
    - data: data to train on; must contain "type", the target column and the features
    - prediction_space: values to predict on; must contain "type" and every
      non-target column of `data`
    - estimator: base estimator, cloned twice to construct the DispatchEstimator
    - ct: ColumnTransformer; make sure that its first column is type
    - russian: bool: predict Russian (target "russian", returns predict_proba[:, 1]),
      otherwise ITM (target "number of lang", returns predict)
    - permute: should we permute type (via stratified_permute) before training
    - permute_strats: number of strata, passed to stratified_permute as `strats`
    - delta: simulated effect size; keep 0 if you want to simulate the null
      distribution. Rows with type == 0 get +delta/2 and rows with type == 1 get
      -delta/2 on the target. Only supported when russian is False.
    - seed: if not None, seeds the global numpy and `random` generators
      (0 is a valid seed)
    - bootstrap: make a bootstrapped sample (bootstrap_data) before training

    == Returns ==
    Predictions for the rows of `prediction_space`, as returned by the pipeline.

    == Raises ==
    AssertionError if delta != 0 together with russian, or if both bootstrap
    and permute are requested.
    """

    assert delta == 0 or not russian, "delta supported only in ITM"

    assert not bootstrap or not permute, "bootstrap and permute are mutually exclusive"

    # `is not None` rather than truthiness: seed=0 must still seed the RNGs.
    if seed is not None:
        np.random.seed(seed)
        random.seed(seed)

    target = russian_to_target[russian]

    # Reorder the prediction columns to match the training frame: "type" first,
    # then the remaining non-target columns in the order they appear in `data`.
    prediction_space_adj = prediction_space[
        ["type"] + list(data.drop(columns=[target, "type"]).columns)
    ]

    model = Pipeline(
        [
            ("ct", ct),  # make sure ct's first column is type
            ("estimator", DispatchEstimator([clone(estimator), clone(estimator)])),
        ]
    )
    if bootstrap:
        data = bootstrap_data(data)

    if permute:
        type_new = stratified_permute(data["type"], strats=permute_strats)
    else:
        type_new = data["type"]

    # Rebuild the training frame with the (possibly permuted) type as the first
    # column; both parts are re-indexed so they align positionally.
    data_permuted = pd.concat(
        [
            type_new.reset_index(drop=True),
            data.drop(columns=["type"]).reset_index(drop=True),
        ],
        axis=1,
    )

    if delta != 0:
        # Cast explicitly: adding a fractional shift to an integer column relies
        # on silent upcasting, which newer pandas deprecates.
        data_permuted[target] = data_permuted[target].astype(float)
        data_permuted.loc[data_permuted["type"] == 0, target] += delta / 2
        data_permuted.loc[data_permuted["type"] == 1, target] -= delta / 2

    model.fit(data_permuted.drop(columns=[target]), data_permuted[target])

    if russian:
        # Classifier: keep the probability of the second class.
        pred = model.predict_proba(prediction_space_adj)[:, 1]
    else:
        pred = model.predict(prediction_space_adj)

    return pred

In [None]:
def permutation_delta(
    data,
    prediction_space,
    estimator,
    number_of_permutations,
    russian,
    ct,
    statistics=(identity,),
    null_delta=0,
    groupby_columns=("year_of_birth",),
    use_logodds=False,
    iter_offset=0,
    bootstrap=False,
    seed=42,
):
    """
    Run a batch of permuted (or bootstrapped) fits on Ray and summarise the deltas.

    == Params ==
    - data, prediction_space, estimator, ct, russian: forwarded to predict_de
    - number_of_permutations: number of predict_de tasks to launch
    - statistics: callables applied to the "delta" column of get_delta's output;
      each yields a column named "delta_<callable.__name__>"
    - null_delta: forwarded to predict_de as `delta`
    - groupby_columns: columns of prediction_space to aggregate over
    - use_logodds: forwarded to get_delta
    - iter_offset: first value of the "iter" label; also shifts the seeds
    - bootstrap: if True the tasks bootstrap instead of permuting
    - seed: base seed; task i uses i + iter_offset + seed

    == Returns ==
    DataFrame with one row per (groupby_columns, iter) holding the mean of
    every statistic column.
    """
    group_cols = list(groupby_columns)
    stat_names = [f"delta_{stat.__name__}" for stat in statistics]

    # Launch all fits first, then block once on the whole batch.
    futures = []
    for i in range(number_of_permutations):
        futures.append(
            predict_de.remote(
                data,
                prediction_space,
                estimator,
                ct,
                russian,
                permute=not bootstrap,
                delta=null_delta,
                bootstrap=bootstrap,
                seed=i + iter_offset + seed,
            )
        )
    predictions = ray.get(futures)

    # One long frame: the grid columns repeated per iteration, tagged with the
    # prediction and the iteration label.
    id_columns = ["type"] + group_cols
    n_rows = prediction_space.shape[0]
    frames = []
    for it, pred in enumerate(predictions, start=iter_offset):
        frames.append(prediction_space[id_columns].assign(pred=pred, iter=[it] * n_rows))

    results = pd.concat(frames, axis=0).reset_index(drop=True)
    results.columns = list(id_columns) + ["pred", "iter"]

    # `stat=stat` binds each callable at definition time (avoids late binding).
    stat_columns = {}
    for stat_name, stat in zip(stat_names, statistics):
        stat_columns[stat_name] = lambda x, stat=stat: stat(x["delta"])

    with_stats = get_delta(results, use_logodds=use_logodds).assign(**stat_columns)
    selected = with_stats[group_cols + stat_names + ["iter"]]
    return selected.groupby(group_cols + ["iter"]).mean().reset_index()

In [None]:
def concat_wrap(
    f, number_of_permutations, *args, permutations_per_iteration=1000, **kwargs
):
    """
    Call `f` in equally sized batches and stack the resulting frames.

    == Params ==
    - f: callable accepting `number_of_permutations` and `iter_offset` keyword
      arguments and returning a DataFrame
    - number_of_permutations: total number of permutations; must be a multiple
      of permutations_per_iteration (asserted)
    - permutations_per_iteration: batch size passed to each call of `f`
    - *args, **kwargs: forwarded unchanged to every call of `f`

    == Returns ==
    Row-wise concatenation of all batch results with a fresh 0..n-1 index.
    """
    number_of_splits, leftover = divmod(
        number_of_permutations, permutations_per_iteration
    )
    assert leftover == 0

    batches = []
    for split_index in tqdm(range(number_of_splits)):
        # iter_offset keeps the iteration labels contiguous across batches.
        batch = f(
            *args,
            number_of_permutations=permutations_per_iteration,
            iter_offset=split_index * permutations_per_iteration,
            **kwargs
        )
        batches.append(batch)

    return pd.concat(batches, axis=0).reset_index(drop=True)

### ITM

In [None]:
def get_deltas_and_full_pred(
    data,
    estimator,
    data_real,
    data_cat,
    number_of_permutations,
    russian,
    null_delta=0,
    permutations_per_iteration=1000,
    bootstrap=False,
):
    """
    Compute permutation deltas, bootstrap deltas and the reference prediction.

    == Params ==
    - data: training frame; only data_real + data_cat + the target are kept
    - estimator: base estimator handed to predict_de / permutation_delta
    - data_real: list of columns passed through unchanged (put "type" first)
    - data_cat: list of columns that are one-hot encoded
    - number_of_permutations: total number of replicates per mode
    - russian: bool selecting the target via russian_to_target; also used as
      use_logodds
    - null_delta: forwarded to permutation_delta
    - permutations_per_iteration: batch size used by concat_wrap
    - bootstrap: accepted but not read in this body; both the permutation and
      the bootstrap runs are always performed

    == Returns ==
    Tuple (permutation deltas, bootstrap deltas, `full` with an added "pred"
    column from the unpermuted fit).
    """
    target_column = russian_to_target[russian]
    training = data[data_real + data_cat + [target_column]]

    transformers = [
        ("real", "passthrough", data_real),
        ("catenc", OneHotEncoder(), data_cat),
    ]
    ct = ColumnTransformer(transformers, sparse_threshold=0)

    def _reference_prediction(space):
        # Called directly through `_function` (presumably the undecorated
        # function behind the Ray remote wrapper -- confirm against Ray).
        return predict_de._function(
            training, space, estimator, ct, russian, permute=False, seed=42
        )

    full_pred = full.assign(pred=_reference_prediction)

    shared_kwargs = dict(
        number_of_permutations=number_of_permutations,
        permutations_per_iteration=permutations_per_iteration,
        data=training,
        prediction_space=full,
        estimator=estimator,
        russian=russian,
        ct=ct,
        null_delta=null_delta,
        statistics=(identity, np.abs),
        use_logodds=russian,
    )

    def _deltas(use_bootstrap):
        batch = concat_wrap(permutation_delta, bootstrap=use_bootstrap, **shared_kwargs)
        return batch.rename(columns={"delta_identity": "delta"})

    permutation_deltas = _deltas(False)
    bootstrap_deltas = _deltas(True)
    return permutation_deltas, bootstrap_deltas, full_pred

In [None]:
# Run size: total replicates per analysis and how many are dispatched per batch.
# concat_wrap asserts that the total is a multiple of the batch size.
number_of_permutations = 10000
permutations_per_iteration = 1000

In [None]:
# ITM analysis (russian=False -> target "number of lang") with a gradient-boosting
# regressor. "type" is listed first in data_real so it is the first transformed
# column, as predict_de requires.
delta_ITM_perm, delta_ITM_bootstrap, pred_ITM_full = get_deltas_and_full_pred(
    data=data_ITM,
    estimator=GradientBoostingRegressor(max_depth=4, n_estimators=100),
    data_cat=["mother tongue", "residence"],
    data_real=["type", "year_of_birth", "language population", "village population"],
    number_of_permutations=number_of_permutations,
    permutations_per_iteration=permutations_per_iteration,
    russian=False,
)

In [None]:
# Persist the ITM results (written to the current working directory, without the index).
delta_ITM_perm.to_csv("delta_itm_perm_gbr.csv", index=False)
delta_ITM_bootstrap.to_csv("delta_itm_bootstrap_gbr.csv", index=False)
pred_ITM_full.to_csv("pred_itm_full_gbr.csv", index=False)

In [None]:
# Russian analysis (russian=True -> target "russian") with a gradient-boosting
# classifier; predictions are class probabilities and deltas use log-odds.
# "type" is again the first entry of data_real.
(
    delta_russian_perm,
    delta_russian_bootstrap,
    pred_russian_full,
) = get_deltas_and_full_pred(
    data=data_russian,
    estimator=GradientBoostingClassifier(max_depth=1, n_estimators=150),
    data_cat=["mother tongue", "sex"],
    data_real=[
        "type",
        "year_of_birth",
        "language population",
        "elevation",
        "village population",
    ],
    number_of_permutations=number_of_permutations,
    permutations_per_iteration=permutations_per_iteration,
    russian=True,
)

In [None]:
# Persist the Russian results (written to the current working directory, without the index).
delta_russian_perm.to_csv("delta_russian_perm_gbr.csv", index=False)
delta_russian_bootstrap.to_csv("delta_russian_bootstrap_gbr.csv", index=False)
pred_russian_full.to_csv("pred_russian_full_gbr.csv", index=False)