In [1]:
## check confounding split

In [1]:
import sys
import os

sys.path.append("..")

In [2]:
import pandas as pd
import numpy as np

import random
import itertools
from sklearn import metrics
from tqdm.notebook import tqdm


import torch

from myutils import number_split, create_mix

In [3]:
# Load the pre-processed corpus; the first CSV column is used as the row index.
df = pd.read_csv('./processed_db.csv', index_col=0)

# Partition by corpus of origin via the `source` code
# (0 -> WLS rows, 1 -> Pitt rows, matching the `target` labels shown further down).
df_wls = df.loc[df['source'].eq(0)]
df_pitts = df.loc[df['source'].eq(1)]

In [4]:
# Number of testing examples requested from number_split;
# set to None to disable (i.e., get as many examples as possible).
n_test = 200

# Empirical share of gender == 1 within each corpus ...
p_wls = (df_wls.gender.value_counts() / len(df_wls))[1]
p_pitts = (df_pitts.gender.value_counts() / len(df_pitts))[1]

# ... and share of source == 1 rows in the combined data.
p_z1_mix = (df.source.value_counts() / len(df))[1]


# Fixed reference values kept from the original notebook:
# p_wls = 0.54
# p_pitts = 0.64
# p_z1_mix = 0.18

valid_high_combinations = []
valid_full_settings = []

# Sweep alpha_test from 0 to 4 (exclusive) in steps of 0.1; every other
# argument is held constant. number_split returns None for settings it
# rejects, and only the non-None results are kept.
for alpha_test in np.arange(0, 4, 0.1):
    ret = number_split(
        p_pos_train_z0=p_wls,
        p_pos_train_z1=p_pitts,
        p_mix_z1=p_z1_mix,
        alpha_test=alpha_test,
        train_test_ratio=4,  # train/test ratio
        n_test=n_test,
    )
    if ret is None:
        continue
    valid_full_settings.append(ret)
   


Invalid test set probability P(Y=1|Z=0):0.4571035517758879, P(Y=1|Z=1):1.0056278139069534
Invalid test set probability P(Y=1|Z=0):0.45031725497443476, P(Y=1|Z=1):1.0357296864412
Invalid test set probability P(Y=1|Z=0):0.4437295131722714, P(Y=1|Z=1):1.0649508316134515
Invalid test set probability P(Y=1|Z=0):0.4373317379599162, P(Y=1|Z=1):1.0933293448997905
Invalid test set probability P(Y=1|Z=0):0.43111582920500113, P(Y=1|Z=1):1.1209011559330029
Invalid test set probability P(Y=1|Z=0):0.42507414083851824, P(Y=1|Z=1):1.1477001802639992
Invalid test set probability P(Y=1|Z=0):0.4191994494781511, P(Y=1|Z=1):1.1737584585388232
Invalid test set probability P(Y=1|Z=0):0.4134849256179648, P(Y=1|Z=1):1.199106284292098
Invalid test set probability P(Y=1|Z=0):0.4079241071428571, P(Y=1|Z=1):1.2237723214285712
Invalid test set probability P(Y=1|Z=0):0.4025108749518198, P(Y=1|Z=1):1.2477837123506414
Invalid test set probability P(Y=1|Z=0):0.3972394304966851, P(Y=1|Z=1):1.2711661775893923
Invalid tes

In [5]:
# Use the second accepted setting (index 1) from the alpha sweep for the mix below.
st = valid_full_settings[1]

In [8]:
from sklearn.model_selection import train_test_split

In [10]:
def _split_stratum(pool, n_train, n_test, sample, seed):
    """Draw `n_train` training rows and `n_test` testing rows from one stratum.

    Parameters
    ----------
    pool : pandas.DataFrame
        All rows of one (source, label) stratum.
    n_train, n_test : int
        Requested row counts (assumed to be integers -- TODO confirm against
        the producer of `setting`).
    sample : bool
        If True and the stratum is too small, top it up by sampling its own
        rows with replacement.
    seed : int
        Random state used for oversampling, shuffling and splitting.

    Returns
    -------
    (train, test) : pair of pandas.DataFrame

    Raises
    ------
    AssertionError
        If the stratum is too small and `sample` is False.
    ValueError
        If oversampling is requested but the stratum has no rows at all.
    """
    n_needed = n_train + n_test

    # Nothing requested from this stratum: hand back two empty frames that
    # still carry the stratum's columns.
    if n_needed == 0:
        return pool.iloc[:0], pool.iloc[:0]

    if n_needed > len(pool):
        if not sample:
            # Raised explicitly (instead of `assert`) so the check survives
            # `python -O`; type and message match the original assertion.
            raise AssertionError("Set sample equals to True or augment current dataset.")
        if len(pool) == 0:
            raise ValueError("Cannot oversample a stratum that has no rows.")
        # pandas spells the keyword `replace` (not `replacement`); the seed is
        # passed so the oversampled rows are reproducible.
        extra = pool.sample(n=n_needed - len(pool), replace=True, random_state=seed)
        # Oversampling happens before the split, so copies of the same row
        # can end up on both the train and the test side.
        pool = pd.concat([pool, extra], axis=0, ignore_index=True)

    if n_train == 0 or n_test == 0:
        # train_test_split rejects a zero-sized side, so draw the rows
        # directly: a seeded shuffle of `n_needed` rows, cut at `n_train`.
        drawn = pool.sample(n=n_needed, random_state=seed)
        return drawn.iloc[:n_train], drawn.iloc[n_train:]

    train, test = train_test_split(pool,
                                   train_size=n_train,
                                   test_size=n_test,
                                   shuffle=True, random_state=seed)
    return train, test


def _concat_parts(parts):
    """Stack the non-empty frames in `parts`; return an empty frame if all are empty."""
    non_empty = [part for part in parts if len(part) > 0]
    if not non_empty:
        return parts[0].iloc[:0].reset_index(drop=True)
    return pd.concat(non_empty, axis=0, ignore_index=True)


def create_mix(df1, df0, target, setting, sample = False, seed = 2023):
    """Assemble mixed train/test frames from two source datasets.

    Each dataset is divided into a positive (`target == 1`) and a negative
    (`target == 0`) stratum, and the number of rows taken from each of the
    four strata is read from `setting`.

    NOTE(review): this definition shadows the `create_mix` imported from
    `myutils` at the top of the notebook -- confirm which one later cells
    are meant to use.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Rows of the z1 dataset.
    df0 : pandas.DataFrame
        Rows of the z0 dataset.
    target : str
        Name of the binary label column (1 = positive, 0 = negative).
    setting : mapping
        Must provide the counts 'n_z0_pos_train', 'n_z0_pos_test',
        'n_z0_neg_train', 'n_z0_neg_test', 'n_z1_pos_train', 'n_z1_pos_test',
        'n_z1_neg_train' and 'n_z1_neg_test'.
    sample : bool, default False
        Allow oversampling with replacement when a stratum has fewer rows
        than requested.
    seed : int, default 2023
        Random state for every random draw.

    Returns
    -------
    (df_train, df_test) : pair of pandas.DataFrame
        Strata are stacked in the order z0-pos, z0-neg, z1-pos, z1-neg, with
        a fresh 0..n-1 index.

    Raises
    ------
    AssertionError
        If a stratum is too small and `sample` is False.
    ValueError
        If oversampling is requested from a stratum that has no rows.
    """
    train_parts, test_parts = [], []

    # Same stratum order as the final concatenation: z0 before z1,
    # positives before negatives.
    for frame, source_tag in ((df0, 'z0'), (df1, 'z1')):
        for label, label_tag in ((1, 'pos'), (0, 'neg')):
            stratum = frame[frame[target] == label]
            part_train, part_test = _split_stratum(
                stratum,
                setting[f'n_{source_tag}_{label_tag}_train'],
                setting[f'n_{source_tag}_{label_tag}_test'],
                sample,
                seed,
            )
            train_parts.append(part_train)
            test_parts.append(part_test)

    # assemble mixed train and test
    df_train = _concat_parts(train_parts)
    df_test = _concat_parts(test_parts)

    return df_train, df_test
    
    

In [7]:
# Build the mixed data for the chosen setting: Pitt rows are passed as df1,
# WLS rows as df0, and `gender` is the label column.
dfs = create_mix(df1 = df_pitts, df0 = df_wls, target='gender', setting= st, sample = True)

In [8]:
# Display the test portion of the mix.
# NOTE(review): indexing by 'test' needs create_mix to return a mapping, but the
# create_mix defined in this notebook returns a (df_train, df_test) tuple --
# presumably this cell ran against the version imported from myutils; confirm.
dfs['test']

Unnamed: 0,file,gender,age,text,source,target
0,2000012775,1,71.0,children stealing cookies out of a cookie jar....,0,wls_female
1,2000001278,1,57.0,oh a boy a girl a mom. boy standing on a chair...,0,wls_female
2,2000010064,1,71.0,child you mean like exactly what I see. child ...,0,wls_female
3,2000015886,1,71.0,the girl is asking for a cookie. is that what ...,0,wls_female
4,2000002075,1,71.0,the little girl is obviously asking for a cook...,0,wls_female
...,...,...,...,...,...,...
195,631-0,0,74.0,the kids are in the cookies. the stool is fall...,1,pitts_male
196,242-2,0,65.0,we'll start with the girl. she's going to the....,1,pitts_male
197,071-3,0,,is it action if she has a finger in her nose h...,1,pitts_male
198,034-2,0,,everything i see in action or everything i. in...,1,pitts_male


In [21]:
# Rows of d2 with gender == 1 and source == 1.
# NOTE(review): d2 is not assigned in any cell visible here -- likely leftover
# kernel state; confirm where it comes from before relying on this cell.
d2[(d2['gender'] == 1) & (d2['source'] == 1)]

Unnamed: 0,file,gender,age,text,source,target
82,142-1,1,59.0,anything that i want. okay. the boy is reachin...,1,pitts_female
83,211-1,1,67.0,do you want me to start. the water is running ...,1,pitts_female
84,128-2,1,72.0,oh i remember this one. this is great. okay. t...,1,pitts_female
85,021-3,1,,alright. mother's doing the dishes. the sink i...,1,pitts_female
86,140-0,1,58.0,doesn't matter where you start then right. alr...,1,pitts_female
87,141-1,1,55.0,well the girl the boy is handing the girl cook...,1,pitts_female
88,211-2,1,68.0,a little girl is reaching for her brother to g...,1,pitts_female
89,172-0,1,72.0,okay mother is drying the dishes but the water...,1,pitts_female
90,013-0,1,62.0,somebody's getting cookies outof the cookie ja...,1,pitts_female
91,124-1,1,67.0,okay. water running outof the sink. lady dryin...,1,pitts_female


In [52]:
# Count the d2 rows with gender == 1 and source == 1.
# NOTE(review): the recorded result (0) disagrees with the rows displayed by the
# earlier filter on d2, so d2 was presumably rebound between the two runs
# (execution counts 21 vs 52) -- confirm.
((d2.gender==1) & (d2.source == 1)).sum()

0

In [9]:
# Toy inputs for probing train_test_split: a 5x2 matrix holding 0..9 and five labels.
X = np.arange(10).reshape(5, 2)
y = range(5)

In [11]:
from sklearn.model_selection import train_test_split

In [13]:
# Demonstration: train_test_split rejects a zero-sized test split.
# The error is caught and printed so the notebook still survives
# Restart Kernel -> Run All instead of stopping at this cell.
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0, random_state=42)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: test_size=0 should be either positive and smaller than the number of samples 5 or a float in the (0, 1) range