In [5]:
!pip install --upgrade pip


Collecting pip
[?25l  Downloading https://files.pythonhosted.org/packages/a4/6d/6463d49a933f547439d6b5b98b46af8742cc03ae83543e4d7688c2420f8b/pip-21.3.1-py3-none-any.whl (1.7MB)
[K     |████████████████████████████████| 1.7MB 3.2MB/s eta 0:00:01
[?25hInstalling collected packages: pip
  Found existing installation: pip 19.1.1
    Uninstalling pip-19.1.1:
      Successfully uninstalled pip-19.1.1
Successfully installed pip-21.3.1


In [6]:
!pip install scikit-multilearn


Collecting scikit-multilearn
  Downloading scikit_multilearn-0.2.0-py3-none-any.whl (89 kB)
     |████████████████████████████████| 89 kB 2.7 MB/s             
[?25hInstalling collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0


In [7]:
import os
import pandas as pd
import numpy as np

import skmultilearn.model_selection as ms


import itertools
import datetime


In [8]:
def get_linked_id( df, cust_rel, N_iter):
        df = df[['id']].drop_duplicates()
        df['linked_id'] = df['id'].copy()

        print(f'Initial number of unique Ids: {df.shape[0]}')
        for i in range(N_iter):
            df = expand_single_step(df, cust_rel)
            df_linked = df.sort_values(['id', 'linked_id']).drop_duplicates(subset='id', keep='first')
            n_unique_groups = df_linked['linked_id'].unique().shape[0]
            print(f'Number of unique Id groups after iteration {i + 1}: {n_unique_groups}')

        ind = df_linked["linked_id"].isna()
        df_linked.loc[ind, 'linked_id'] = df_linked.loc[ind, 'id']
        return df_linked

def expand_single_step(df, cust_rel):
        df = df.merge(cust_rel, left_on='linked_id', right_on='owner_cust_id', how='left')[
            ['id', 'linked_id', 'account_cust_id']]
       # link owner_id back to account_id
        df = df.merge(cust_rel, on='account_cust_id', how='left')[['id', 'owner_cust_id']]
        df = df.rename(columns={'owner_cust_id': 'linked_id'})
        return df.drop_duplicates()

In [9]:
def split_train_test(ft_df: pd.DataFrame, test_size, seed, stratify_on):
  
  if seed is not None: np.random.seed(seed)
  df = ft_df.copy()
  stratify_lst = stratify_on.copy()

  col_object = df[stratify_lst].select_dtypes(include=[object]).columns
  tmp_cols = []
  for col in col_object:
    df[f"{col}Tmp"] = df[col].astype('category').cat.codes
    stratify_lst.remove(col)
    stratify_lst.append(f"{col}Tmp")
    tmp_cols.append(f"{col}Tmp")

  # Stratefied split
  x_train, _, x_test, _ = ms.iterative_train_test_split(df.values, df[stratify_lst].values, test_size = test_size)

  df_train = pd.DataFrame(x_train, columns = df.columns).drop(tmp_cols, axis = 1)
  df_test = pd.DataFrame(x_test, columns = df.columns).drop(tmp_cols, axis = 1)

  return df_train, df_test

In [105]:
data = [
            ["1", "P", "N",0], ["2", "O", "A",1],["3", "O", "N",1], ["4", "O", "A",1], ["5", "P", "A",1],
            ["6", "O", "A",5], ["7", "O", "N",5], ["8", "P", "N",7], ["9", "P", "A",7], ["10", "O", "N",7],
        ]

df =pd.DataFrame(data, columns=["id","Type","Status","Group"])
df

Unnamed: 0,id,Type,Status,Group
0,1,P,N,0
1,2,O,A,1
2,3,O,N,1
3,4,O,A,1
4,5,P,A,1
5,6,O,A,5
6,7,O,N,5
7,8,P,N,7
8,9,P,A,7
9,10,O,N,7


In [106]:

df_sample = df.drop_duplicates('Group', keep="first")
df_du = df.drop(df_sample.index)

In [107]:
df_sample

Unnamed: 0,id,Type,Status,Group
0,1,P,N,0
1,2,O,A,1
5,6,O,A,5
7,8,P,N,7


In [108]:
df_du

Unnamed: 0,id,Type,Status,Group
2,3,O,N,1
3,4,O,A,1
4,5,P,A,1
6,7,O,N,5
8,9,P,A,7
9,10,O,N,7


In [109]:

train, test = split_train_test(df_sample, test_size=0.5, seed=10, stratify_on=["Type","Status"])

In [110]:
test

Unnamed: 0,id,Type,Status,Group
0,1,P,N,0
1,2,O,A,1


In [111]:
test_du = df_du[df_du['Group'].isin(test['Group'])]

test_plus = pd.concat([test, test_du])
train_du = df_du[df_du['Group'].isin(train['Group'])]
train_plus = pd.concat([train, train_du])


In [112]:
test_plus

Unnamed: 0,id,Type,Status,Group
0,1,P,N,0
1,2,O,A,1
2,3,O,N,1
3,4,O,A,1
4,5,P,A,1


In [113]:
train_plus

Unnamed: 0,id,Type,Status,Group
0,6,O,A,5
1,8,P,N,7
6,7,O,N,5
8,9,P,A,7
9,10,O,N,7


In [114]:
df_train = train_plus.assign(Split = "train")
df_test = test_plus.assign(Split = "test")
combine = pd.concat([df_train, df_test])
pd.crosstab(combine['Split'], combine['Type'])

Type,O,P
Split,Unnamed: 1_level_1,Unnamed: 2_level_1
test,3,2
train,3,2


In [115]:
pd.crosstab(combine['Split'], combine['Status'])

Status,A,N
Split,Unnamed: 1_level_1,Unnamed: 2_level_1
test,3,2
train,2,3
