## Install packages

In [1]:
# Choose a CRAN mirror (the first entry in R's mirror list) for this session.
import rpy2.robjects.packages as rpackages
utils = rpackages.importr('utils')
utils.chooseCRANmirror(ind=1)

# Install the R packages this notebook needs.
from rpy2.robjects.vectors import StrVector
# The trailing comma matters: ('devtools') is just the string 'devtools',
# not a tuple, so StrVector would not be handed a sequence of package names.
packages = ('devtools',)
utils.install_packages(StrVector(packages))

R[write to console]: Installing packages into ‘/home/nlubalo/R/x86_64-pc-linux-gnu-library/4.1’
(as ‘lib’ is unspecified)



<rpy2.rinterface_lib.sexp.NULLType object at 0x7f9784098c00> [RTYPES.NILSXP]

In [2]:
# Load the R package `devtools` through rpy2 so its functions can be called from Python.
from rpy2.robjects.packages import importr
devtools = importr('devtools')

# Keep a direct handle on devtools' GitHub installer; it is used in the next cell.
install_github = devtools.install_github

In [3]:
# Install the two shared-forest R packages from GitHub. Each argument is a
# "user/repository/sub/directory" path pointing at the package folder.
# When the remote SHA1 is unchanged since the last install, R skips the
# installation (see the console output of this cell).
repo = "LendieFollett/Multivariate-Heterogenous-Response-Prediction/SharedForestBinary-master/SharedForestBinary"
install_github(repo)
install_github("theodds/SharedForestPaper/SharedForest")

R[write to console]: Skipping install of 'SharedForestBinary' from a github remote, the SHA1 (55836d98) has not changed since last install.
  Use `force = TRUE` to force installation

R[write to console]: Skipping install of 'SharedForest' from a github remote, the SHA1 (00d91f42) has not changed since last install.
  Use `force = TRUE` to force installation



0
'SharedForest'


In [4]:
# Expose both installed R packages as Python objects.
SharedForest = importr('SharedForest')
SharedForestBinary = importr('SharedForestBinary')

In [5]:
# Turn on rpy2's global pandas conversion. This must run before any call to
# ro.conversion.py2rpy further down, which relies on the active converter.
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
pandas2ri.activate()

## Import Libraries

In [8]:
from collections import OrderedDict
from hyperopt import tpe, Trials, hp, fmin, STATUS_OK, STATUS_FAIL
import matplotlib
from sklearn.model_selection import train_test_split
from scipy.stats import norm
import pandas as pd
import numpy as np
import time
import shutil
import pickle
import os
import datetime
from rpy2.robjects.packages import importr
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri


## Import Data

In [13]:
# Survey questions used as prediction targets. Order matters: the first entry
# becomes the first binary response and the second entry the second one.
col = [
    'Q1. How should I greet you?',
    'Q7. Do you own your own house or agricultural land, either solely or jointly with someone else?',
]

# Raw survey answer code -> binary label, listed in the same order as `col`.
mapping = dict(zip(col, (
    {
        1: 1,  # Women
        2: 0,  # Men
    },
    {
        1: 0,  # Land
        2: 1,  # House
        3: 1,  # Both
        4: 0,  # Neither
    },
)))

# Disabled alternative for the second target:
#   'Q6. Thinking about the last 12 months, how much input did you have in
#    decisions about how to use total HOUSEHOLD income?'
#   coded as 1 -> 0 (Respondent), 2 -> 1 (Spouse),
#            3 -> 0 (Respondent and spouse), 4 -> 1 (Other)


def create_train_test(column_name, mapping, sample_size=40, random_state=42):
    """Load features and survey labels and return a stratified train/test split.

    Features are read from 'data/features_20191108.csv' (indexed by 'msisdn')
    and labels from the survey workbook (indexed by 'MSISDN'). Each label
    column is mapped to a binary value; rows with an unmapped or missing
    answer are dropped. The per-row labels are joined into one string so the
    split can be stratified on the label combination, then split back into
    one integer column per label.

    :param column_name: list of survey column names to use as labels
    :param mapping: dict of column name -> {raw answer code: binary label}
    :param sample_size: number of rows drawn from the merged data before
        splitting (the default keeps the previously hard-coded 40);
        pass None to use every row
    :param random_state: seed used for both the row subsample and the split
    :return: (X_train, y_train, X_test, y_test) where the X values are numpy
        arrays and the y values are DataFrames with one integer column per
        entry of ``column_name``
    """
    # load features
    features_path = 'data/features_20191108.csv'
    df = pd.read_csv(features_path, dtype={'msisdn': str})
    df = df.set_index('msisdn', drop=True)
    # load labels
    path = 'data/Gender Survey Data - All Zones 07-11-2019 V4.xlsx'
    targets = pd.read_excel(
        path, dtype={"MSISDN": str}, skiprows=[1]).set_index('MSISDN', drop=True)

    # keep only the label columns; work on a copy so the assignments below
    # never write into a view of the full survey frame
    targets = targets[column_name].copy()
    for c in column_name:
        targets['target_{}'.format(c)] = targets[c].map(mapping[c])
        targets = targets.drop(columns=c)
    # drop rows with any unmapped answer, then fuse the labels into a single
    # 'a-b' string that serves as the stratification key
    targets = targets.dropna().astype('str').agg(
        '-'.join, axis=1).to_frame('target')

    # create new momo columns: 1 where the average balance difference is negative
    df["momo_p2p_received_balance_dif_avg_neg"] = (
        df["momo_p2p_received_balance_dif_avg"] < 0).astype(int)
    df["momo_p2p_sent_balance_dif_avg_neg"] = (
        df["momo_p2p_sent_balance_dif_avg"] < 0).astype(int)

    # drop columns that have more than a certain fraction (e.g. 50%) of null values
    threshold = 0.5
    df = df.loc[:, df.isna().mean() < threshold]

    # drop irrelevant columns and merge with labels
    df = df.drop(["SITE_ID", "NAME_1", "NAME_3"], axis=1)
    df = df.merge(targets, left_index=True, right_index=True, how="inner")

    # one-hot encode second district names
    one_hot = pd.get_dummies(df['NAME_2'])
    # fill missing values with zeros - empirically better than using mean or median
    df_zero = df.fillna(0)
    # merge with one-hot encoded features and drop the raw district column
    # NOTE(review): an index merge duplicates rows if an msisdn occurs more
    # than once - confirm the index is unique.
    data = df_zero.merge(one_hot, left_index=True, right_index=True)
    data = data.drop(['NAME_2'], axis=1)

    # optional subsample; seeded so repeated runs see the same rows
    if sample_size is not None:
        data = data.sample(sample_size, random_state=random_state)

    X_train, X_test, y_train, y_test = train_test_split(
        data.drop('target', axis=1), data['target'], test_size=0.2,
        random_state=random_state, stratify=data['target'])

    # undo the 'a-b' fusion: one column per label again
    y_train = y_train.str.split('-', expand=True)
    y_train.columns = column_name
    y_test = y_test.str.split('-', expand=True)
    y_test.columns = column_name

    # convert every label column of both splits to integers; handling the
    # test split per column keeps this correct for any number of labels
    for c in column_name:
        y_train[c] = pd.to_numeric(y_train[c], errors='coerce').astype(int)
        y_test[c] = pd.to_numeric(y_test[c], errors='coerce').astype(int)

    X_train = X_train.to_numpy()
    X_test = X_test.to_numpy()

    return X_train, y_train, X_test, y_test


def create_model_inputs(column_name=col, mapping=mapping):
    """Prepare the train/test data and convert the model inputs for R.

    :param column_name: label columns, forwarded to ``create_train_test``
    :param mapping: answer-code mapping, forwarded to ``create_train_test``
    :return: (W, W_test, delta1, delta2, y_test) - converted training and
        test features, the converted first and second training label
        columns, and the untouched test labels
    """
    features_train, labels_train, features_test, labels_test = create_train_test(
        column_name, mapping)

    # rpy2 converter from Python objects to their R counterparts
    to_r = ro.conversion.py2rpy

    return (
        to_r(features_train),
        to_r(features_test),
        # the two training label columns become delta1 and delta2
        to_r(labels_train.iloc[:, 0]),
        to_r(labels_train.iloc[:, 1]),
        labels_test,
    )



In [16]:
# Wall-clock reference for the progress message printed after every trial.
start = time.time()


def get_objective(W, W_test, delta1, delta2, y_test):
    """Bind the model inputs into a one-argument objective for hyperopt.

    :param W: training covariate matrix
    :param W_test: test covariate matrix
    :param delta1: first binary response vector
    :param delta2: second binary response vector
    :param y_test: accepted for symmetry with the data pipeline; the returned
        objective does not read it
    :return: ``objective(space)`` closure
    """
    def objective(space):
        """
        Fit the shared forest model for one hyperparameter draw and score it.
        :param space: mapping providing 'alpha', 'beta', 'gamma', 'num_tree' and 'var_tau'
        :return: dict with 'loss', 'status', 'g_a' and 'hyper'
        """
        sampler_opts = SharedForestBinary.Opts(
            num_burn=5000,
            num_thin=1,
            num_save=5000,
            num_print=10000,
        )

        # NOTE(review): k_theta is fixed at 2 here, so a 'k_theta' entry in
        # `space` has no effect on the fit - confirm this is intended.
        model_hypers = SharedForestBinary.Hypers(
            W=W,  # Training covariate matrix
            delta1=delta1,  # first binary response vector
            delta2=delta2,  # second binary response vector
            alpha=space['alpha'],
            beta=int(space['beta']),
            gamma=space['gamma'],
            num_tree=int(space['num_tree']),
            var_tau=space['var_tau'],
            k=2,  # Determines kappa
            k_theta=2,
        )

        # BART with shared forest model
        fit = SharedForestBinary.SharedBartBinary(
            W=W,
            delta1=delta1,
            delta2=delta2,
            W_test=W_test,
            hypers_=model_hypers,
            opts_=sampler_opts,
        )

        def apply_fx(x):
            # Bernoulli draw(s) with success probability Phi(x)
            return np.random.binomial(p=norm.cdf(x), n=1)

        # Predictions: simulated binary outcomes from elements 5 and 6 of the fit
        delta_star1 = [apply_fx(draw) for draw in fit[5]]
        delta_star2 = [apply_fx(draw) for draw in fit[6]]

        # Conditional on delta1 above, predict delta2
        # estimated P(delta2 = 1 | delta1 = 1)
        # NOTE(review): the weighted sum uses fit[5] again rather than fit[6],
        # and delta_star2 is never read - confirm which element is intended.
        weighted_total = (delta_star1 * fit[5]).sum(axis=0)
        positive_draws = pd.DataFrame(delta_star1).sum(axis=0)
        sb_y_pred1_ = [norm.cdf(value)
                       for value in weighted_total / positive_draws]
        sb_y_pred1 = sum(sb_y_pred1_) / len(sb_y_pred1_)

        hours = int((time.time() - start) // 3600)
        mins = int(((time.time() - start) // 60) % 60)

        print('{} hours {} minutes have passed'.format(hours, mins))

        # A NaN score marks the trial as failed
        status = STATUS_FAIL if np.isnan(sb_y_pred1) else STATUS_OK

        # hyperopt's fmin only minimizes, so the score is maximized by
        # minimizing 1 - score
        return {
            'loss': 1 - sb_y_pred1,
            'status': status,
            'g_a': sb_y_pred1,
            'hyper': space,
        }

    return objective


def summarize_trials(trials):
    """Print the result dict of the trial with the highest ``g_a`` score.

    Trials whose result has no ``g_a`` entry, or whose ``g_a`` is NaN (the
    objective reports NaN for failed fits), are ignored: NaN does not order
    against real numbers, so ranking with it could report a failed trial as
    the best one. Nothing is printed when no trial has a usable score.

    :param trials: object exposing a ``trials`` list of dicts, each holding
        a 'result' dict
    :return: None
    """
    scored = []
    for trial in trials.trials:
        result = trial.get('result') or {}
        score = result.get('g_a')
        if score is None or np.isnan(score):
            continue
        scored.append(result)

    if not scored:
        return

    # max() returns the first of equally-scored results, the same tie-break
    # a stable descending sort would give
    best = max(scored, key=lambda result: result['g_a'])
    print('Best: {}'.format(best))


def optimize(objective, space, trials_fname=None, max_evals=1):
    """Run a TPE search with hyperopt and persist the trial history.

    If ``trials_fname`` names an existing file, the search resumes from the
    trials stored there; otherwise it starts from an empty history. The
    history is written back to ``trials_fname`` even when the search is
    interrupted or the objective raises, so completed evaluations of a long
    run are not lost.

    :param objective: callable taking a sampled point and returning a
        hyperopt result dict
    :param space: hyperopt search space
    :param trials_fname: optional pickle path used to load/save the trials
    :param max_evals: evaluation budget handed to ``fmin``
    :return: the ``Trials`` object holding the search history
    """
    if trials_fname is not None and os.path.exists(trials_fname):
        # NOTE(security): unpickling can execute arbitrary code - only point
        # trials_fname at files produced by this notebook.
        with open(trials_fname, 'rb') as trials_file:
            trials = pickle.load(trials_file)
    else:
        trials = Trials()

    try:
        fmin(objective,
             space=space,
             algo=tpe.suggest,
             trials=trials,
             max_evals=max_evals)
    finally:
        if trials_fname is not None:
            # write to a temporary file first so a crash while dumping never
            # leaves a truncated results file behind
            temporary = '{}.temp'.format(trials_fname)
            with open(temporary, 'wb') as trials_file:
                pickle.dump(trials, trials_file)
            shutil.move(temporary, trials_fname)

    return trials


def main(max_evals):
    """Prepare the data, run the hyperparameter search and print a summary.

    :param max_evals: evaluation budget passed on to ``optimize``
    :return: the hyperopt trials produced by the search
    """
    W, W_test, delta1, delta2, y_test = create_model_inputs(col, mapping)
    objective = get_objective(W, W_test, delta1, delta2, y_test)

    # Results are pickled to a file stamped with the start time of this run
    stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    results_path = './model_results_{}.pickle'.format(stamp)

    # Optimization space
    search_space = OrderedDict()
    search_space['alpha'] = hp.choice('alpha', range(1, 3))
    search_space['beta'] = hp.choice('beta', range(1, 4))
    search_space['gamma'] = hp.uniform('gamma', 0.5, 1.0)
    search_space['num_tree'] = hp.choice('num_tree', range(50, 200, 5))
    search_space['var_tau'] = hp.uniform('var_tau', 0.5, 1.0)
    search_space['k_theta'] = hp.choice('k_theta', range(0, 1))

    trials = optimize(
        objective,
        search_space,
        trials_fname=results_path,
        max_evals=max_evals,
    )
    summarize_trials(trials)
    return trials


# Run the search with a budget of 3 evaluations and keep the resulting trials.
trials_final = main(max_evals=3)

Finishing warmup BLAH                                                           
0                                                                               
Finishing save                                                                  
0                                                                               
Number of leaves at final iterations:                                           

3                                                                               
3                                                                               
5                                                                               
6                                                                               
3                                                                               
2                                                                               
5                                                                               
4                          

4                                                                               
2                                                                               
7                                                                               
5                                                                               
5                                                                               
4                                                                               
6                                                                               
3                                                                               
3                                                                               
3                                                                               
4                                                                               
2                                                                               
6                           

5                                                                               
2                                                                               
3                                                                               
3                                                                               
3                                                                               
3                                                                               
3                                                                               
2                                                                               
1                                                                               
1                                                                               
2                                                                               
4                                                                               
2                           

1                                                                               
3                                                                               
1                                                                               
2                                                                               
2                                                                               
1                                                                               
2                                                                               
2                                                                               
1                                                                               
2                                                                               
5                                                                               
2                                                                               
3                           

4                                                                               
3                                                                               
2                                                                               
2                                                                               
1                                                                               
2                                                                               
2                                                                               
2                                                                               
2                                                                               
3                                                                               
2                                                                               
3                                                                               
6                           

In [None]:
# 4 days
#