In [22]:
import pandas as pd
from itertools import combinations
import re

In [31]:
import random
random.seed(0)


#because k choose 2 increases quadratically with k, we can't split every disease
#the same way if we expect the same split in the pairs dataset. This function returns
#the effective split, given a total number of images, n, in the category. For efficiency
#this only works for split under 0.5. Also does not work for n = 0, 1, or 2 as it does not make
#sense to call it in these cases.
def n_test(n, split):
  if n == 1 or n == 0 or n == 2:
    return 0
  min_diff = 2
  for i in range(int(n/2)+1):
    k = i+1

    kC2 = k*(k-1)/2
    restC2 = (n-k)*(n-k-1)/2

    diff = abs(split-kC2/(kC2+restC2))

    if diff<min_diff:
        min_diff = diff
        ntest = k

  return ntest

#path is the original data set labelled with ids and severities, split is the
#desired train test split the output are a set of test pairs, and a df of test
#pairs, and the same for train pairs.
def pairs_df(path, split):
  #extracting the data
  path = 'psoriasis_label.csv'
  df = pd.read_csv(path)
  df.sort_values(by='id', inplace=True, key=lambda col: col.str.lower())
  df = df.drop_duplicates(subset='id', keep='first')
  ids = df['id'].values
  severities = df['severity'].values

  l_test = []
  l_train = []
  curr = None
  count = 0

  for i, id in enumerate(ids):

    name = re.sub(r'\d+', '', id)
    name = re.sub(r'-', '', name)
    name = re.sub(r'.jpg', '', name).lower()


    if not curr:
      count += 1
      curr = id
    elif name == curr:
      count +=1
    elif count == 2:
      r = random.random()
      if r<split:
        test_subset = set(range(i-count,i))
        l_test_append = [set(j) for j in combinations(test_subset,2)]
        l_test.append(l_test_append)
        print(l_test_append)
      else:
        train_subset = set(range(i-count,i))
        l_train_append = [set(j) for j in combinations(train_subset,2)]
        l_train.append(l_train_append)
        print(l_train_append)
      curr = name
      count = 1
    else:
      set_of_name = set(range(i-count,i))

      ntest = n_test(count, split)
      test_subset = set(random.sample(list(set_of_name), ntest))
      train_subset = set_of_name - test_subset

      l_test_append = [set(j) for j in combinations(test_subset,2)]
      l_test.append(l_test_append)
      l_train_append = [set(j) for j in combinations(train_subset,2)]
      l_train.append(l_train_append)

      if count == 2: print(l_train_append, l_test_append)

      curr = name
      count = 1

  #putting the data in a new df: test
  d_test = {'ImageA':[],
       'ImageB':[],
       'SeverityA':[],
       'SeverityB': []}
  s_test = set()

  for sublist in l_test:
    for pair in sublist:
      index_1, index_2 = pair
      d_test['ImageA'].append(ids[index_1])
      d_test['ImageB'].append(ids[index_2])
      d_test['SeverityA'].append(severities[index_1])
      d_test['SeverityB'].append(severities[index_2])
      s_test.add((ids[index_1], ids[index_2]))

  df_test_out = pd.DataFrame(d_test)

  #putting the data in a new df: train
  d_train = {'ImageA':[],
       'ImageB':[],
       'SeverityA':[],
       'SeverityB': []}
  s_train = set()

  for sublist in l_train:
    for pair in sublist:
      index_1, index_2 = pair
      d_train['ImageA'].append(ids[index_1])
      d_train['ImageB'].append(ids[index_2])
      d_train['SeverityA'].append(severities[index_1])
      d_train['SeverityB'].append(severities[index_2])
      s_train.add((ids[index_1], ids[index_2]))

  df_train_out = pd.DataFrame(d_train)

  return s_test, df_test_out, s_train, df_train_out

In [32]:
n_test(3, 0.2)

1

In [33]:
"""
The sets produced here are sets of all pairs of images with the same name (same disease and bodypart)
"""
train_test_split = 0.2

path = 'psoriasis_label.csv'
s_test, df_test, s_train, df_train = pairs_df(path, train_test_split)

[{3, 4}]
[{16, 15}]
[{17, 18}]
[{24, 25}]
[{83, 84}]
[{248, 249}]
[{330, 331}]
[{442, 443}]


In [34]:
df_test

Unnamed: 0,ImageA,ImageB,SeverityA,SeverityB
0,08SebDermOK1021.jpg,08SebDermOK011021.jpg,4,4
1,lichen-planus-120.jpg,lichen-planus-127.jpg,3,12
2,lichen-planus-120.jpg,lichen-planus-15.jpg,3,6
3,lichen-planus-166.jpg,lichen-planus-120.jpg,7,3
4,lichen-planus-120.jpg,lichen-planus-198.jpg,3,7
...,...,...,...,...
435,psoriasis-scalp-48.jpg,psoriasis-scalp-93.jpg,6,6
436,psoriasis-scalp-63.jpg,psoriasis-scalp-70.jpg,6,9
437,psoriasis-scalp-63.jpg,psoriasis-scalp-93.jpg,6,6
438,psoriasis-scalp-70.jpg,psoriasis-scalp-93.jpg,9,6


In [36]:
df_train

Unnamed: 0,ImageA,ImageB,SeverityA,SeverityB
0,08PsoriasisOnycholysis.jpg,08PsoriasisOnycholysis1.jpg,4,2
1,08SebDermEAr.jpg,08sebDermEar092605.jpg,7,6
2,08SebDermEAr.jpg,08sebDermEar0926051.jpg,7,4
3,08sebDermEar092605.jpg,08sebDermEar0926051.jpg,6,4
4,08SebDermScalp.jpg,08sebDermScalp111308.jpg,7,5
...,...,...,...,...
1865,psoriasis-scalp-94.jpg,psoriasis-scalp-92.jpg,8,3
1866,Psoriasis-treatment-1.jpg,Psoriasis-treatment-8.jpg,5,3
1867,reiter-syndrome-21.jpg,reiter-syndrome-1.jpg,6,12
1868,reiter-syndrome-21.jpg,reiter-syndrome-14.jpg,6,3


In [37]:
df_test.to_csv('pairs_by_name_test.csv', sep=',', index=False)
df_train.to_csv('pairs_by_name_train.csv', sep=',', index=False)

In [21]:
import pickle

with open('pairs_by_names_set_test.pickle', 'wb') as handle:
    pickle.dump(s_test, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('pairs_by_names_set_train.pickle', 'wb') as handle:
    pickle.dump(s_train, handle, protocol=pickle.HIGHEST_PROTOCOL)