# Similarity and Complementary Clustering

In [1]:
!ls

AnmeldungCoLearningC_DATA_2023-02-22_1020.csv
AnmeldungCoLearningC_DATA_LABELS_2023-02-22_1045.csv
Innosuisse grouping.ipynb


In [20]:
import pandas as pd
import random
import numpy as np

import pandas as pd
from scipy.spatial.distance import euclidean, pdist, squareform
import seaborn as sns
from sklearn.cluster import SpectralClustering


random.seed(123)

### Load data and translate columns

In [12]:
df = pd.read_csv('AnmeldungCoLearningC_DATA_2023-02-22_1020.csv', sep = ';')
df.head()

Unnamed: 0,record_id,redcap_survey_identifier,anmeldung_colearning_circles_timestamp,ee2,ee3,ee4,kom21,komm7,komm11,komm5,...,job1___5,job1___6,job1___7,job1___8,job1___9,zuf,zuf_2,bitte_listen_sie_alle_weit,beweg,anmeldung_colearning_circles_complete
0,1,,[not completed],,,,,,,,...,0,0,0,0,0,,,,,0
1,2,,[not completed],1.0,,,,,,,...,0,0,0,0,0,,,,,0
2,3,,[not completed],1.0,,,,,,,...,0,0,0,0,0,,,,,0
3,4,,[not completed],1.0,,,,,,,...,0,0,0,0,0,,,,,0
4,5,,[not completed],1.0,,,,,,,...,0,0,0,0,0,,,,,0


In [13]:
df_labels = pd.read_csv('AnmeldungCoLearningC_DATA_LABELS_2023-02-22_1045.csv', sep = ';')
list(df_labels.columns)

['Record ID',
 'Survey Identifier',
 'Survey Timestamp',
 'Unnamed: 3',
 'Unnamed: 4',
 'Unnamed: 5',
 'Schwierige Gespräche führen',
 'Verständliche und überzeugende Präsentationen halten',
 'Verhandlungen vorbereiten und durchführen',
 'Mit Feedback Entwicklung anstossen',
 'Neue Teammitglieder onboarden und ins Team integrieren',
 'Teams durch organisatorische Veränderungen führen',
 'Mit Personal-/Fachkräftemangel umgehen',
 'Konflikte im Team ansprechen und lösen',
 'Effiziente Suche nach Informationen von vertrauenswürdigen Quellen im Internet',
 'Virtuelle Workshops moderieren',
 'Effektive virtuelle Zusammenarbeit im Team',
 'Professionelles Netzwerk online pflegen und ausbauen',
 'Welche der folgenden Kompetenzen würden Sie gerne weiterentwickeln? Falls Sie mehrere Kompetenzen weiterentwickeln möchten, wählen Sie bitte die Kompetenz aus, bei denen Ihr Interesse an Weiterentwicklung am grössten ist.',
 'Schwierige Gespräche führen.1',
 'Verständliche und überzeugende Präsentati

In [14]:
df_translated = df_labels
df_translated.columns = ['Record ID',
 'Survey Identifier',
 'Survey Timestamp',
 'Unnamed: 3',
 'Unnamed: 4',
 'Unnamed: 5',
 'Conduct Difficult Conversations',
 'Giving Understandable and Persuasive Presentations',
 'Preparing and conducting negotiations',
 'Nudging development with feedback',
 'Onboarding new team members and integrating them into the team',
 'Leading teams through organizational change',
 'Dealing with staff/skill shortages',
 'Addressing and resolving team conflicts',
 'Finding information efficiently from trusted sources on the Internet',
 'Facilitating virtual workshops',
 'Effective virtual team collaboration',
 'Maintaining and expanding professional network online',
 'Which of the following competencies would you like to develop further?',
 'Conducting difficult conversations.1',
 'Giving understandable and convincing presentations.1',
 'Preparing and conducting negotiations.1',
 'Nudging development with feedback.1',
 'Onboarding new team members and integrating them into the team.1',
 'Leading teams through organizational change.1',
 'Dealing with staff and skill shortages',
 'Addressing and resolving team conflicts.1',
 'Finding information efficiently from trusted sources on the Internet.1',
 'Facilitating virtual workshops.1',
 'Effective virtual team collaboration.1',
 'Maintain and expand professional network online.1',
 'Do you have a preference in the size of the CoLearning Circle?',
 'Mondays (choice=8am-12pm)',
 'Mondays (choice=12pm-5pm)',
 'Mondays (choice=5pm-8pm)',
 'Mondays (choice=No availability)',
 'Tuesdays (choice=8am-12pm)',
 'Tuesdays (choice=12pm-5pm)',
 'Tuesdays (choice=5pm-8pm)',
 'Tuesdays (choice=No availability)',
 'Wednesdays (choice=8am-12pm)',
 'Wednesdays (choice=12pm-5pm)',
 'Wednesdays (choice=5pm-8pm)',
 'Wednesdays (choice=No availability)',
 'Thursdays (choice=8am-12pm)',
 'Thursdays (choice=12pm-5pm)',
 'Thursdays (choice=5pm-8pm)',
 'Thursdays (choice=No availability)',
 'Fridays (choice=8am-12pm)',
 'Fridays (choice=12pm-5pm)',
 'Fridays (choice=5pm-8pm)',
 'Fridays (choice=No availability)',
 'Saturdays (choice=8am-12pm)',
 'Saturdays (choice=12pm-5pm)',
 'Saturdays (choice=5pm-8pm)',
 'Saturdays (choice=No availability)',
 'Sundays (choice=8am-12pm)',
 'Sundays (choice=12pm-5pm)',
 'Sundays (choice=5pm-8pm)',
 'Sundays (choice=No availability)',
 'On what start dates could you participate in a CoLearning Pilot? (Duration from start date: 4-6 weeks) (choice=March 2023)',
 'On which start dates could you participate in a CoLearning Pilot? (Duration from start date: 4-6 weeks) (choice=April 2023)',
 'On which start dates could you participate in a CoLearning Pilot? (Duration from start date: 4-6 weeks) (choice=May 2023)',
 'On which start dates could you participate in a CoLearning Pilot? (Duration from start date: 4-6 weeks) (choice=June 2023)',
 'Email address',
 'Please indicate your age. ',
 'What gender do you feel you belong to?',
 'What is your highest level of education?',
 'Employment',
'In which sector of the economy do you work?  Multiple answers possible (choice=banks/financial institutions)',
 'In which economic sector do you work?  Multiple answers possible (choice=construction / real estate)',
 'In which economic sector do you work?  Multiple answers possible (choice=education)',
 'In which economic sector do you work?  Multiple answers possible (choice=chemical / pharmaceutical)',
 'In which economic sector do you work?  Multiple answers possible (choice=Consulting)',
 'In which economic sector do you work?  Multiple answers possible (choice=Detail / Wholesale)',
 'In which economic sector do you work?  Multiple answers possible (choice=Services, general)',
 'In which economic sector do you work?  Multiple answers possible (choice=energy / water industry)',
 'In which economic sector do you work?  Multiple answers possible (choice=hospitality / hotel industry)',
 'In which economic sector do you work?  Multiple answers possible (choice=health care / social services)',
 'In which economic sector do you work?  Multiple answers possible (choice=trade / crafts in general)',
 'In which economic sector do you work?  Multiple answers possible (choice=industry diverse)',
 'In which economic sector do you work?  Multiple answers possible (choice=Informatics / Telecommunications)',
 'In which economic sector do you work?  Multiple answers possible (choice=consumer / luxury goods industry)',
 'In which economic sector do you work?  Multiple answers possible (choice=agriculture / forestry)',
 'In which economic sector do you work?  Multiple answers possible (choice=mechanical / plant engineering)',
 'In which economic sector do you work?  Multiple answers possible (choice=medical technology)',
 'In which economic sector do you work?  Multiple answers possible (choice=public administration / associations)',
 'In which economic sector do you work?  Multiple answers possible (choice=Tourism / Leisure)',
 'In which economic sector do you work?  Multiple answers possible (choice=Transportation / Logistics)',
 'In which economic sector do you work?  Multiple answers possible (choice=Other)',
 'In which economic sector do you work?  Multiple answers possible (choice=not applicable/no answer)',
 'What is your current or last job title (e.g., project manager:in, chemical lab technician:in)?',
 'How would you categorize your current or most recent job function? (choice=unskilled job)',
 'How would you categorize your current or last job function? (choice=clerk:in)',
 'How would you categorize your current or last job function? (choice=Specialist:in)',
 'How would you categorize your current or last job function? (choice=Specialist:in)',
 'How would you categorize your current or last job function? (choice=lower management level (e.g. team leader))',
 'How would you categorize your current or last job function? (choice=Middle management level (e.g. department / division management))',
 'How would you categorize your current or last job function? (choice=Upper management level (e.g., member of executive management))',
 'How would you categorize your current or last job function? (choice=Other)',
 'How would you categorize your current or last job function? (choice=Not specified)',
 'How satisfied are you with the current professional development opportunities offered by your organization (your employer, institution, school)?',
 'How satisfied are you with the current offerings that exist in your organization for sharing among employees? ',
 'Please list all formal continuing education courses you have participated in over the past 3 years.',
 'What made you decide to sign up for a CoLearning Circle?',
 'Complete?']

In [15]:
print(len(df_translated))
df_translated =df_translated[~df_translated['Conduct Difficult Conversations'].isna()] 
print(len(df_translated))

50
28


## 0) Simulate two groups from experiment

In [16]:
df_translated['treatment'] = [random.randint(0, 1) for i in range(len(df_translated)) ]
df_translated['treatment'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_translated['treatment'] = [random.randint(0, 1) for i in range(len(df_translated)) ]


0    16
1    12
Name: treatment, dtype: int64

## 1) Group by similarity

### Questions for Innosuisse
1. Check that first are competences and then interests. 
2. Similarity: similar learning interests. But "competence level shouldn't be too low (> 3)" you mean interest levels?
 
* For now we assume that first are the competences and then the interests.
* And if the interest is low (1,2 or 3) we assume the person is not interested. 

In [17]:
## Similarity 
df_similar = df_translated[df_translated['treatment']==1]
len(df_similar)

12

In [21]:
df_similar_interests = df_similar[['Conducting difficult conversations.1',
 'Giving understandable and convincing presentations.1',
 'Preparing and conducting negotiations.1',
 'Nudging development with feedback.1',
 'Onboarding new team members and integrating them into the team.1',
 'Leading teams through organizational change.1',
 'Dealing with staff and skill shortages',
 'Addressing and resolving team conflicts.1',
 'Finding information efficiently from trusted sources on the Internet.1',
 'Facilitating virtual workshops.1',
 'Effective virtual team collaboration.1',
 'Maintain and expand professional network online.1']].fillna(-1)

m_interests = np.matrix(df_similar_interests)

# Constrain: competence level shouldn't be too low (> 3) 
# Solution: If the competence level is below or equal to 3, we assume that it is not there.
m_interests[m_interests<=3] = -1

In [22]:

# Constrain: difference among persons shouldn''t be too high (max 3)
def get_labels(m_interests, metric= 'euclidean',gamma = 0.01):
    distance = squareform(pdist(m_interests, metric=metric))
    similarity = np.exp(-gamma * distance ** 2)
    labels = SpectralClustering(n_clusters=2,random_state=123,affinity='precomputed').fit_predict(similarity)
    return labels

In [23]:
def get_labels(m_interests, metric= 'euclidean',gamma = 0.01):
    distance = squareform(pdist(m_interests, metric=metric))
    similarity = np.exp(-gamma * distance ** 2)
    labels = SpectralClustering(n_clusters=2,random_state=123,affinity='precomputed').fit_predict(similarity)
    return labels

def hierarchical_spectral(m_matrix , labels, metric= 'euclidean',gamma = 0.01, offset = 2 ):    
    
    unique_labels, counts_labels = np.unique(labels, return_counts=True)
    print(unique_labels, counts_labels)
    labels_to_explore = unique_labels

    while len(labels_to_explore)>= 1:
        print('Labels to explore: ', labels_to_explore)
        i = labels_to_explore[0]
        mask = labels==i
        size = np.sum(mask)
        if size >= 4:
            # see if we can split
            m_aux = m_matrix[mask]
            new_labels = get_labels(m_aux, metric, gamma) + offset

            new_unique_labels, new_counts_labels = np.unique(new_labels, return_counts=True)
            if np.min(new_counts_labels)>=2:
                # We acept the clustering, we upload the labels 
                labels[mask] = new_labels 
                
                offset += 2 # We move the offset
                # We add the new labels as labels to explore to the list
                labels_to_explore = np.union1d(labels_to_explore, np.unique(new_labels))
                labels_to_explore = np.setdiff1d(labels_to_explore, np.array([i]))
                print(f'Adding labels and removes current {i}. Updated list: {labels_to_explore}')
                unique_labels, counts_labels = np.unique(labels, return_counts=True)
                print(f"Unique labels {unique_labels} with counts {counts_labels}")
            else:
                # We do not accept, keep the group as it is. Not divisible. 
                labels_to_explore = np.setdiff1d(labels_to_explore, np.array([i]))
                print(f"We do not accept the new labels. Counts {new_counts_labels}")
                print('Removing labels: ', labels_to_explore)
        else:
            labels_to_explore = np.setdiff1d(labels_to_explore, np.array([i]))
            print(f"Small group for {i}. Size = {size}")
            print('Removing labels: ', labels_to_explore)


    return labels

def get_similarity_clusters(m_matrix, metric= 'euclidean',gamma = 0.01):
    distance = squareform(pdist(m_matrix, metric=metric))
    similarity = np.exp(-gamma * distance ** 2)
    labels = SpectralClustering(n_clusters=2,random_state=123,affinity='precomputed').fit_predict(similarity)
    
    new_labels = hierarchical_spectral(m_matrix , labels, metric= 'euclidean',gamma = 0.01)
    return new_labels

In [24]:
labels = get_similarity_clusters(m_interests , metric= 'euclidean',gamma = 0.01)
df_similar['labels_similarity']  = labels
labels

[0 1] [10  2]
Labels to explore:  [0 1]
Adding labels and removes current 0. Updated list: [1 2 3]
Unique labels [1 2 3] with counts [2 7 3]
Labels to explore:  [1 2 3]
Small group for 1. Size = 2
Removing labels:  [2 3]
Labels to explore:  [2 3]
We do not accept the new labels. Counts [1 6]
Removing labels:  [3]
Labels to explore:  [3]
Small group for 3. Size = 3
Removing labels:  []


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_similar['labels_similarity']  = labels


array([2, 2, 2, 3, 3, 1, 3, 2, 1, 2, 2, 2], dtype=int32)

In [25]:
df_similar['labels_similarity'].value_counts()

2    7
3    3
1    2
Name: labels_similarity, dtype: int64

In [27]:
df_similar_available = df_similar[[
 'Mondays (choice=8am-12pm)',
 'Mondays (choice=12pm-5pm)',
 'Mondays (choice=5pm-8pm)',
 'Mondays (choice=No availability)',
 'Tuesdays (choice=8am-12pm)',
 'Tuesdays (choice=12pm-5pm)',
 'Tuesdays (choice=5pm-8pm)',
 'Tuesdays (choice=No availability)',
 'Wednesdays (choice=8am-12pm)',
 'Wednesdays (choice=12pm-5pm)',
 'Wednesdays (choice=5pm-8pm)',
 'Wednesdays (choice=No availability)',
 'Thursdays (choice=8am-12pm)',
 'Thursdays (choice=12pm-5pm)',
 'Thursdays (choice=5pm-8pm)',
 'Thursdays (choice=No availability)',
 'Fridays (choice=8am-12pm)',
 'Fridays (choice=12pm-5pm)',
 'Fridays (choice=5pm-8pm)',
 'Fridays (choice=No availability)',
 'Saturdays (choice=8am-12pm)',
 'Saturdays (choice=12pm-5pm)',
 'Saturdays (choice=5pm-8pm)',
 'Saturdays (choice=No availability)',
 'Sundays (choice=8am-12pm)',
 'Sundays (choice=12pm-5pm)',
 'Sundays (choice=5pm-8pm)',
 'Sundays (choice=No availability)']]

df_similar_available = df_similar_available.replace('Unchecked',0).replace('Checked',1)
m_available = np.array(df_similar_available)

In [28]:
available_labels = hierarchical_spectral(m_available , labels, 
                                         metric= 'hamming',gamma = 0.01, offset=10)
df_similar['available_labels'] = available_labels

[1 2 3] [2 7 3]
Labels to explore:  [1 2 3]
Small group for 1. Size = 2
Removing labels:  [2 3]
Labels to explore:  [2 3]
Adding labels and removes current 2. Updated list: [ 3 10 11]
Unique labels [ 1  3 10 11] with counts [2 3 3 4]
Labels to explore:  [ 3 10 11]
Small group for 3. Size = 3
Removing labels:  [10 11]
Labels to explore:  [10 11]
Small group for 10. Size = 3
Removing labels:  [11]
Labels to explore:  [11]
Adding labels and removes current 11. Updated list: [12 13]
Unique labels [ 1  3 10 12 13] with counts [2 3 3 2 2]
Labels to explore:  [12 13]
Small group for 12. Size = 2
Removing labels:  [13]
Labels to explore:  [13]
Small group for 13. Size = 2
Removing labels:  []


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_similar['available_labels'] = available_labels


In [37]:
df_similar['available_labels'].value_counts()

10    3
3     3
12    2
1     2
13    2
Name: available_labels, dtype: int64

In [38]:
df_similar[['treatment', 'available_labels','labels_similarity','Record ID']]

Unnamed: 0,treatment,available_labels,labels_similarity,Record ID
19,1,10,2,20
23,1,10,2,24
24,1,12,2,25
28,1,3,3,29
29,1,3,3,30
30,1,1,1,31
34,1,3,3,35
35,1,13,2,36
39,1,1,1,40
41,1,10,2,42


In [39]:
df_similar[['available_labels']].value_counts()

available_labels
3                   3
10                  3
1                   2
12                  2
13                  2
dtype: int64

## Complementary

In [43]:
## Similarity 
df_comp = df_translated[df_translated['treatment']==0]
len(df_comp)

16

In [44]:
m_comp_interest =  np.array(df_comp[['Conducting difficult conversations.1',
 'Giving understandable and convincing presentations.1',
 'Preparing and conducting negotiations.1',
 'Nudging development with feedback.1',
 'Onboarding new team members and integrating them into the team.1',
 'Leading teams through organizational change.1',
 'Dealing with staff and skill shortages',
 'Addressing and resolving team conflicts.1',
 'Finding information efficiently from trusted sources on the Internet.1',
 'Facilitating virtual workshops.1',
 'Effective virtual team collaboration.1',
 'Maintain and expand professional network online.1']].fillna(-1))

m_comp_strenghts = np.array(df_comp[['Conduct Difficult Conversations',
 'Giving Understandable and Persuasive Presentations',
 'Preparing and conducting negotiations',
 'Nudging development with feedback',
 'Onboarding new team members and integrating them into the team',
 'Leading teams through organizational change',
 'Dealing with staff/skill shortages',
 'Addressing and resolving team conflicts',
 'Finding information efficiently from trusted sources on the Internet',
 'Facilitating virtual workshops',
 'Effective virtual team collaboration',
 'Maintaining and expanding professional network online']].fillna(-1))

In [48]:
from scipy.spatial import distance


def get_distance_matrix(m_aux_strenghts, m_aux_interest):
    num_participants = m_aux_strenghts.shape[0]
    print(f" participants {num_participants}")
    # create matrix
    distance_results = np.zeros([num_participants,num_participants], dtype=np.float64)
    for i in range(num_participants):
        # get interest and strength
        interest_i = m_aux_interest[i]
        strength_i = m_aux_strenghts[i]
        for j in range(i+1, num_participants):
                strength_j = m_aux_strenghts[j]
                interest_j = m_aux_interest[j]
                dist_a = distance.euclidean(interest_i, strength_j)
                dist_b = distance.euclidean(interest_j, strength_i)
                dist = np.mean([dist_a, dist_b])
                distance_results[i,j] = dist
                distance_results[j,i] = dist
    return distance_results

def get_labels_comp(m_aux_strenghts, m_aux_interest, metric= 'euclidean',gamma = 0.01):
    distance_matrix_comp = get_distance_matrix(m_aux_strenghts, m_aux_interest)
    similarity = np.exp(-gamma * distance_matrix_comp ** 2)
    labels = SpectralClustering(n_clusters=2,random_state=123,affinity='precomputed').fit_predict(similarity)
    return labels

def hierarchical_spectral_comp(m_comp_strenghts, m_comp_interest , labels, 
                               metric= 'euclidean',gamma = 0.01, offset_num = 2 ):    
    offset = offset_num
    unique_labels, counts_labels = np.unique(labels, return_counts=True)
    print(unique_labels, counts_labels)
    labels_to_explore = unique_labels

    while len(labels_to_explore)>= 1:
        print('Labels to explore: ', labels_to_explore)
        i = labels_to_explore[0]
        mask = labels==i
        print(f"labels {labels}")
        size = np.sum(mask)
        print(f'Exploring label {i}, with size {size}')
        if size >= 4:
            # see if we can split
            m_aux_strenghts = m_comp_strenghts[mask]
            m_aux_interest = m_comp_interest[mask]
            
            new_labels = get_labels_comp(m_aux_strenghts, m_aux_interest, metric, gamma) + offset
            new_unique_labels, new_counts_labels = np.unique(new_labels, return_counts=True)
            
            if np.min(new_counts_labels)>=2:
                # We acept the clustering, we upload the labels 
                labels[mask] = new_labels 
                offset += 2 # We move the offset
                # We add the new labels as labels to explore to the list
                labels_to_explore = np.union1d(labels_to_explore, np.unique(new_labels))
                labels_to_explore = np.setdiff1d(labels_to_explore, np.array([i]))
                print(f'Adding labels and removes current {i}. Updated list: {labels_to_explore}')
                unique_labels, counts_labels = np.unique(labels, return_counts=True)
                print(f"Unique labels {unique_labels} with counts {counts_labels}")
            else:
                # We do not accept, keep the group as it is. Not divisible. 
                labels_to_explore = np.setdiff1d(labels_to_explore, np.array([i]))
                print(f"We do not accept the new labels. Counts {new_counts_labels}")
                print('Removing labels: ', labels_to_explore)
        else:
            labels_to_explore = np.setdiff1d(labels_to_explore, np.array([i]))
            print(f"Small group for {i}. Size = {size}")
            print('Removing labels: ', labels_to_explore)


    return labels



In [49]:

        
def get_contrasting_clusters(m_comp_strenghts, m_comp_interest, metric= 'euclidean',gamma = 0.01):
    distance_matrix = get_distance_matrix(m_comp_strenghts, m_comp_interest)
    similarity = np.exp(-gamma * distance_matrix ** 2)
    labels = SpectralClustering(n_clusters=2,random_state=123,affinity='precomputed').fit_predict(similarity)
    
    new_labels = hierarchical_spectral_comp(m_comp_strenghts, m_comp_interest , labels, metric= 'euclidean',gamma = 0.01)
    return new_labels

In [50]:
labels = get_contrasting_clusters(m_comp_strenghts, m_comp_interest, metric= 'euclidean',gamma = 0.01)

 participants 16
[0 1] [12  4]
Labels to explore:  [0 1]
labels [0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 1]
Exploring label 0, with size 12
 participants 12
Adding labels and removes current 0. Updated list: [1 2 3]
Unique labels [1 2 3] with counts [4 9 3]
Labels to explore:  [1 2 3]
labels [2 2 1 3 2 1 2 2 3 2 3 2 2 1 2 1]
Exploring label 1, with size 4
 participants 4
Adding labels and removes current 1. Updated list: [2 3 4 5]
Unique labels [2 3 4 5] with counts [9 3 2 2]
Labels to explore:  [2 3 4 5]
labels [2 2 5 3 2 5 2 2 3 2 3 2 2 4 2 4]
Exploring label 2, with size 9
 participants 9
Adding labels and removes current 2. Updated list: [3 4 5 6 7]
Unique labels [3 4 5 6 7] with counts [3 2 2 5 4]
Labels to explore:  [3 4 5 6 7]
labels [6 7 5 3 6 5 7 7 3 7 3 6 6 4 6 4]
Exploring label 3, with size 3
Small group for 3. Size = 3
Removing labels:  [4 5 6 7]
Labels to explore:  [4 5 6 7]
labels [6 7 5 3 6 5 7 7 3 7 3 6 6 4 6 4]
Exploring label 4, with size 2
Small group for 4. Size = 2
Removing

In [51]:
len(labels)

16

In [52]:
df_comp['labels_complementary']  = labels
labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_comp['labels_complementary']  = labels


array([6, 7, 5, 3, 6, 5, 7, 7, 3, 7, 3, 6, 6, 4, 6, 4], dtype=int32)

In [53]:
df_comp['labels_complementary'].value_counts()

6    5
7    4
3    3
5    2
4    2
Name: labels_complementary, dtype: int64

In [55]:
df_comp_available = df_comp[[
 'Mondays (choice=8am-12pm)',
 'Mondays (choice=12pm-5pm)',
 'Mondays (choice=5pm-8pm)',
 'Mondays (choice=No availability)',
 'Tuesdays (choice=8am-12pm)',
 'Tuesdays (choice=12pm-5pm)',
 'Tuesdays (choice=5pm-8pm)',
 'Tuesdays (choice=No availability)',
 'Wednesdays (choice=8am-12pm)',
 'Wednesdays (choice=12pm-5pm)',
 'Wednesdays (choice=5pm-8pm)',
 'Wednesdays (choice=No availability)',
 'Thursdays (choice=8am-12pm)',
 'Thursdays (choice=12pm-5pm)',
 'Thursdays (choice=5pm-8pm)',
 'Thursdays (choice=No availability)',
 'Fridays (choice=8am-12pm)',
 'Fridays (choice=12pm-5pm)',
 'Fridays (choice=5pm-8pm)',
 'Fridays (choice=No availability)',
 'Saturdays (choice=8am-12pm)',
 'Saturdays (choice=12pm-5pm)',
 'Saturdays (choice=5pm-8pm)',
 'Saturdays (choice=No availability)',
 'Sundays (choice=8am-12pm)',
 'Sundays (choice=12pm-5pm)',
 'Sundays (choice=5pm-8pm)',
 'Sundays (choice=No availability)']]

df_comp_available = df_comp_available.replace('Unchecked',0).replace('Checked',1)
m_available_comp = np.array(df_comp_available)

In [56]:
available_labels = hierarchical_spectral(m_available_comp , labels, 
                                         metric= 'hamming',gamma = 0.01, offset=10)
df_comp['available_labels'] = available_labels

[3 4 5 6 7] [3 2 2 5 4]
Labels to explore:  [3 4 5 6 7]
Small group for 3. Size = 3
Removing labels:  [4 5 6 7]
Labels to explore:  [4 5 6 7]
Small group for 4. Size = 2
Removing labels:  [5 6 7]
Labels to explore:  [5 6 7]
Small group for 5. Size = 2
Removing labels:  [6 7]
Labels to explore:  [6 7]
Adding labels and removes current 6. Updated list: [ 7 10 11]
Unique labels [ 3  4  5  7 10 11] with counts [3 2 2 4 3 2]
Labels to explore:  [ 7 10 11]
We do not accept the new labels. Counts [3 1]
Removing labels:  [10 11]
Labels to explore:  [10 11]
Small group for 10. Size = 3
Removing labels:  [11]
Labels to explore:  [11]
Small group for 11. Size = 2
Removing labels:  []


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_comp['available_labels'] = available_labels


In [58]:
df_comp['available_labels'].value_counts()

7     4
10    3
3     3
5     2
11    2
4     2
Name: available_labels, dtype: int64

In [59]:
df_similar[['treatment', 'available_labels','labels_similarity','Record ID']].head(2)

Unnamed: 0,treatment,available_labels,labels_similarity,Record ID
19,1,10,2,20
23,1,10,2,24


In [60]:
df_similar['study_group'] = df_similar['available_labels'] + 1000 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_similar['study_group'] = df_similar['available_labels'] + 1000


In [61]:
df_comp[['treatment', 'available_labels','labels_complementary','Record ID']].head(2)

Unnamed: 0,treatment,available_labels,labels_complementary,Record ID
18,0,10,6,19
22,0,7,7,23


In [62]:
df_comp['study_group'] = df_comp['available_labels'] + 2000 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_comp['study_group'] = df_comp['available_labels'] + 2000


In [63]:
df_groups = pd.concat([df_comp[['treatment', 'study_group','Record ID']], df_similar[['treatment', 'study_group','Record ID']]], axis=0)

In [64]:
df_groups['treatment'] = df_groups['treatment'].replace(0, 'complementary').replace(1, 'similar')

In [65]:
# add a column with number of members per team
df_groups.sort_values('study_group')

Unnamed: 0,treatment,study_group,Record ID
39,similar,1001,40
30,similar,1001,31
34,similar,1003,35
29,similar,1003,30
28,similar,1003,29
41,similar,1010,42
23,similar,1010,24
19,similar,1010,20
24,similar,1012,25
44,similar,1012,45


In [66]:
df_groups['study_group'].value_counts()

2007    4
2010    3
2003    3
1010    3
1003    3
2005    2
2011    2
2004    2
1012    2
1001    2
1013    2
Name: study_group, dtype: int64

In [67]:
df_groups['treatment'].value_counts()

complementary    16
similar          12
Name: treatment, dtype: int64