# Patient Overlap | Data Leakage

- Data leakage occurs when the same `PatientId` appears in both the `training` and `validation` sets


In [2]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Apply seaborn's default plot styling
sns.set()

## Training and Validation Datasets

In [3]:
# Load the training and validation splits and report their (rows, columns) shape
train_csv_path = 'Dataset/train-small.csv'
valid_csv_path = 'Dataset/valid-small.csv'

train_df = pd.read_csv(train_csv_path)
print('Shape of train Dataframe: ', train_df.shape)

valid_df = pd.read_csv(valid_csv_path)
print('Shape of validation Dataframe: ', valid_df.shape)

Shape of train Dataframe:  (1000, 16)
Shape of validation Dataframe:  (109, 16)


### Overlaps

In [4]:
# Step 1: pull the PatientId column of each split as an array
pid_train = train_df['PatientId'].values
pid_valid = valid_df['PatientId'].values

# Step 2: collapse each array into a set so every patient is counted once
train_set_pid = set(pid_train)
valid_set_pid = set(pid_valid)

print('Unique Patients Train: ', len(train_set_pid))
print('Unique Patients Valid: ', len(valid_set_pid))

# Step 3: patients present in both sets are the overlapping (leaking) ones
overlap_pid = list(train_set_pid & valid_set_pid)
n_overlap = len(overlap_pid)

print(f'\nOverlaping PIDs:\n{overlap_pid}')
print('No. of Overlaps:', n_overlap)

Unique Patients Train:  928
Unique Patients Valid:  97

Overlaping PIDs:
[20290, 27618, 9925, 10888, 22764, 19981, 18253, 4461, 28208, 8760, 7482]
No. of Overlaps: 11


### Identify overlapping rows and remove them from one of the sets (train or valid)

In [5]:
# For every overlapping patient, collect the row labels where that patient
# occurs in each split (a patient may occupy more than one row).
train_olap_indices = [
    row_label
    for pid in overlap_pid
    for row_label in train_df.index[train_df['PatientId'] == pid].tolist()
]
valid_olap_indices = [
    row_label
    for pid in overlap_pid
    for row_label in valid_df.index[valid_df['PatientId'] == pid].tolist()
]

print(f'Overlaping indices in Train Set:\n{train_olap_indices}')
print(f'\nOverlaping indices in Valid Set:\n{valid_olap_indices}')

Overlaping indices in Train Set:
[306, 186, 797, 98, 408, 917, 327, 913, 10, 51, 276]

Overlaping indices in Valid Set:
[104, 88, 65, 13, 2, 41, 56, 70, 26, 75, 20, 52, 55]


In [6]:
# Drop the overlapping rows from the validation set so that no patient is
# shared with the training set.
# The frame is re-bound instead of mutated with `inplace=True`, and
# `errors='ignore'` skips labels that are already gone, so re-running this
# cell is a no-op rather than a KeyError.
valid_df = valid_df.drop(index=valid_olap_indices, errors='ignore')

### New Train and Valid sets

In [7]:
# Patient IDs remaining in the validation set after the drop
valid_pid = valid_df['PatientId'].values
# Reduce to the distinct patients
ids_valid_set = set(valid_pid)

n_valid_patients = len(ids_valid_set)
print('Unique patient Valid set:', n_valid_patients)

Unique patient Valid set: 86


In [8]:
# Re-compute the train/valid intersection to confirm the leak is gone
overlap_pid = list(train_set_pid & ids_valid_set)
n_overlap = len(overlap_pid)

print(f'There are {n_overlap} Overlaping PatientIDs in Train and Valid sets')

There are 0 Overlaping PatientIDs in Train and Valid sets


***