In [5]:
!pip install --upgrade pip


Collecting pip
  Downloading https://files.pythonhosted.org/packages/a4/6d/6463d49a933f547439d6b5b98b46af8742cc03ae83543e4d7688c2420f8b/pip-21.3.1-py3-none-any.whl (1.7MB)
     |████████████████████████████████| 1.7MB 3.2MB/s eta 0:00:01
Installing collected packages: pip
  Found existing installation: pip 19.1.1
    Uninstalling pip-19.1.1:
      Successfully uninstalled pip-19.1.1
Successfully installed pip-21.3.1


In [6]:
!pip install scikit-multilearn


Collecting scikit-multilearn
  Downloading scikit_multilearn-0.2.0-py3-none-any.whl (89 kB)
     |████████████████████████████████| 89 kB 2.7 MB/s             
Installing collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0


In [7]:
import os
import pandas as pd
import numpy as np

import skmultilearn.model_selection as ms


import itertools
import datetime


In [8]:
def get_linked_id(df, cust_rel, N_iter):
    """Give every id a group label (``linked_id``) shared with the ids it is linked to.

    Every id starts out linked to itself. Each iteration calls
    ``expand_single_step`` to replace the current (id, linked_id) pairs with the
    pairs reachable through one more relation hop, and the first ``linked_id``
    per ``id`` in ``sort_values(['id', 'linked_id'])`` order is taken as that
    id's label.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with an ``id`` column; all other columns are ignored.
    cust_rel : pd.DataFrame
        Relation table, passed unchanged to ``expand_single_step``.
    N_iter : int
        Number of expansion steps. The loop always runs exactly this many
        steps; it does not stop early once the group count stops changing.
        With ``0`` every id is returned linked to itself.

    Returns
    -------
    pd.DataFrame
        One row per unique ``id`` with the columns ``id`` and ``linked_id``.
        Ids whose ``linked_id`` ended up missing are labelled with their own id.
    """
    # Explicit copy: a column is added below and the caller's frame must stay untouched.
    df = df[['id']].drop_duplicates().copy()
    df['linked_id'] = df['id']

    print(f'Initial number of unique Ids: {df.shape[0]}')

    # Bound before the loop so that N_iter == 0 returns the identity mapping
    # instead of failing on an unassigned name.
    df_linked = df
    for i in range(N_iter):
        df = expand_single_step(df, cust_rel)
        # NaN sorts last, so a real linked_id wins over a missing one for the same id.
        df_linked = df.sort_values(['id', 'linked_id']).drop_duplicates(subset='id', keep='first')
        # unique() reports NaN once, so all not-yet-labelled ids count as one group here.
        n_unique_groups = df_linked['linked_id'].unique().shape[0]
        print(f'Number of unique Id groups after iteration {i + 1}: {n_unique_groups}')

    # Work on a private copy before filling in the missing labels.
    df_linked = df_linked.copy()
    ind = df_linked['linked_id'].isna()
    df_linked.loc[ind, 'linked_id'] = df_linked.loc[ind, 'id']
    return df_linked

def expand_single_step(df, cust_rel):
    """Advance every (id, linked_id) pair by one owner -> account -> owner hop.

    ``df`` needs the columns ``id`` and ``linked_id``; ``cust_rel`` needs
    ``owner_cust_id`` and ``account_cust_id``. Both joins are left joins, so an
    id whose ``linked_id`` matches no owner is kept with a missing ``linked_id``.

    Returns a frame with the columns ``id`` and ``linked_id`` holding the
    de-duplicated pairs.
    """
    expanded_pairs = (
        df
        # Accounts owned by the current linked_id.
        .merge(cust_rel, how='left', left_on='linked_id', right_on='owner_cust_id')
        .loc[:, ['id', 'linked_id', 'account_cust_id']]
        # Every owner attached to those accounts becomes a new linked_id.
        .merge(cust_rel, how='left', on='account_cust_id')
        .loc[:, ['id', 'owner_cust_id']]
        .rename(columns={'owner_cust_id': 'linked_id'})
        .drop_duplicates()
    )
    return expanded_pairs

In [9]:
def split_train_test(ft_df: pd.DataFrame, test_size, seed, stratify_on):
    """Split ``ft_df`` into a train and a test frame, stratified on ``stratify_on``.

    Parameters
    ----------
    ft_df : pd.DataFrame
        Rows to split. The frame is not modified.
    test_size : float
        Forwarded as ``test_size`` to ``ms.iterative_train_test_split``.
    seed : int or None
        When not ``None``, NumPy's global RNG is seeded with it before the split.
    stratify_on : sequence of str
        Names of the columns of ``ft_df`` to stratify on. Object-dtype columns
        are replaced by their integer category codes; other columns are used as is.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        ``(df_train, df_test)``: the columns of ``ft_df`` with a fresh
        ``RangeIndex``. Column dtypes are restored to those of ``ft_df``.
    """
    if seed is not None:
        # Seeding the global RNG is the only randomness control applied here.
        np.random.seed(seed)

    # Build the label matrix in its own frame so no helper column is ever added to
    # (or can overwrite a column of) the data being split. list() also accepts
    # tuples / Index objects, not only lists.
    labels = ft_df[list(stratify_on)].copy()
    for col in labels.select_dtypes(include=[object]).columns:
        # pop + assign moves the encoded column to the end, keeping the label order
        # the same as before: non-object columns first, encoded columns after.
        labels[f"{col}Tmp"] = labels.pop(col).astype('category').cat.codes

    # NOTE(review): iterative stratification is presumably meant for 0/1 indicator
    # columns; confirm the behaviour for stratify columns with more than two
    # distinct values before relying on it.
    # Stratified split
    x_train, _, x_test, _ = ms.iterative_train_test_split(
        ft_df.values, labels.values, test_size=test_size
    )

    def _rebuild(rows):
        # .values collapses all columns to one common dtype (often object);
        # cast back so callers get the dtypes they passed in.
        frame = pd.DataFrame(rows, columns=ft_df.columns)
        if ft_df.columns.is_unique:
            frame = frame.astype(ft_df.dtypes.to_dict())
        return frame

    df_train = _rebuild(x_train)
    df_test = _rebuild(x_test)

    return df_train, df_test

In [10]:
# --- Toy relation table: one row per (owner, account) link -------------------
cust_rel_columns = ["owner_cust_id", "account_cust_id", "relationship"]

# Customers "1".."10" each have an individual ("IND") row pointing at their own id.
individual_rows = [[str(cust), str(cust), "IND"] for cust in range(1, 11)]

# Joint ("JOINT") rows, listed as (owner, account) pairs.
joint_pairs = [
    ("2", "3"), ("2", "4"), ("3", "4"), ("4", "5"),
    ("6", "7"),
    ("8", "9"), ("8", "10"), ("9", "10"),
]
joint_rows = [[owner, account, "JOINT"] for owner, account in joint_pairs]

cust_rel = individual_rows + joint_rows
cust_relDF = pd.DataFrame(cust_rel, columns=cust_rel_columns)

# --- Toy customer attributes (all values kept as strings) ---------------------
df_struct_type = ["id", "Gender", "Adult"]
customer_rows = [
    ["1", "F", "1"],
    ["2", "F", "0"],
    ["3", "M", "1"],
    ["4", "M", "0"],
    ["5", "M", "1"],
    ["6", "M", "0"],
    ["7", "M", "0"],
    ["8", "F", "1"],
    ["9", "F", "1"],
    ["10", "M", "1"],
]
df = pd.DataFrame(customer_rows, columns=df_struct_type)

In [11]:
# Resolve every id to its group label, running three expansion steps.
df_linkedid = get_linked_id(df, cust_relDF, 3)

# The last row of each linked_id group is kept as the group's representative
# ("nodu"); every other row of the group goes to the "du" frame.
is_extra_member = df_linkedid.duplicated(subset='linked_id', keep='last')
df_linkedid_nodu = df_linkedid[~is_extra_member]
df_linkedid_du = df_linkedid[is_extra_member]

Initial number of unique Ids: 10
Number of unique Id groups after iteration 1: 5
Number of unique Id groups after iteration 2: 4
Number of unique Id groups after iteration 3: 4


In [14]:
# Attach the columns of df to each group representative.
df_group = pd.merge(df_linkedid_nodu, df, how='left', on='id')

# Only the representatives are split; the remaining group members are added
# to the side of their representative afterwards.
train, test = split_train_test(
    df_group,
    test_size=0.51,
    seed=1,
    stratify_on=["Gender", "Adult"],
)

In [15]:
# Non-representative ids follow their group: a row joins the test side when its
# linked_id appears in `test`, and the train side when it appears in `train`.
belongs_to_test = df_linkedid_du['linked_id'].isin(test['linked_id'])
test_du = df_linkedid_du.loc[belongs_to_test].merge(df, on='id', how='left')
test_plus = pd.concat([test, test_du])

belongs_to_train = df_linkedid_du['linked_id'].isin(train['linked_id'])
train_du = df_linkedid_du.loc[belongs_to_train].merge(df, on='id', how='left')
train_plus = pd.concat([train, train_du])


In [16]:
# Tag each side with its split name and stack them into one frame.
df_train = train_plus.assign(Split="train")
df_test = test_plus.assign(Split="test")
combine = pd.concat([df_train, df_test], axis=0)

# Gender counts per split.
pd.crosstab(index=combine['Split'], columns=combine['Gender'])

Gender,F,M
Split,Unnamed: 1_level_1,Unnamed: 2_level_1
test,2,3
train,2,3


In [17]:
# Adult counts per split.
pd.crosstab(index=combine['Split'], columns=combine['Adult'])

Adult,0,1
Split,Unnamed: 1_level_1,Unnamed: 2_level_1
test,2,3
train,2,3
