Imports - 

In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from scipy.optimize import linear_sum_assignment

Data Loading - 

In [3]:
data = pd.read_excel('/Users/jaianshsinghbindra/Downloads/Roommate matching/Sample_Data.xlsx')

Label Handling - 

In [4]:
file_path = '/Users/jaianshsinghbindra/Downloads/Roommate matching/Sample_Data.xlsx'
data = pd.read_excel(file_path)
print(data.columns)

Index(['Name, Surname', 'Sex', 'What is your nationality(ies)',
       'What language(s) do you speak?', 'What is your sleeptime (weekdays)?',
       'What is your sleeptime (weekend)?', 'Does noise bother you?',
       'If at night from what time. Noise, if night what time',
       'How often are you willing to clean the common area?',
       'How do you rate your sharing habits?',
       'Do you clean your dishes right after using them?',
       'Do you mind if your roommate invites people to your flat?',
       'Would you invite people to your flat?',
       'How do you handle disagreements?', 'Are you a party-person?'],
      dtype='object')


In [5]:
data['Cleanliness'] = data['How often are you willing to clean the common area?'].apply(
    lambda x: 5 if x == 'Every day' else (
        4 if x == 'Few days per week' else (
            3 if x == '1 day per week' else 0
        )
    )
)
data['Dishes'] = data['Do you clean your dishes right after using them?'].apply(lambda x: 1 if x == 'Yes' else 0)
def noise_score(row):
    if row['Does noise bother you?'] == 'During the day':
        return 1
    elif row['Does noise bother you?'] == 'At night':
        time = row['If at night from what time. Noise, if night what time']
        return {'9pm': 2, '10pm': 3, '11pm': 4, 'midnight': 5, 'after midnight': 6}.get(time, 0)
    else:
        return 0

data['Noise Tolerance'] = data.apply(noise_score, axis=1)
data['Party Person'] = data['Are you a party-person?'].apply(lambda x: 1 if x == 'Yes' else 0)
data['Sleeptime Weekdays'] = data['What is your sleeptime (weekdays)?'].apply(
    lambda x: {'8 to 9pm': 1, '9 to 10pm': 2, '10 to 11pm': 3, '11 to midnight': 4, 'midnight to 2 am': 5, 'after 2am': 6}.get(x, 3)
)
data['Sleeptime Weekends'] = data['What is your sleeptime (weekend)?'].apply(
    lambda x: {'8 to 9pm': 1, '9 to 10pm': 2, '10 to 11pm': 3, '11 to midnight': 4, 'midnight to 2 am': 5, 'after 2am': 6}.get(x, 3)
)
data['French Speaker'] = data['What language(s) do you speak?'].apply(lambda x: 1 if 'French' in x else 0)
data['French National'] = data['What is your nationality(ies)'].apply(lambda x: 1 if 'French' in x else 0)
data['Nationality'] = data['What is your nationality(ies)'].apply(lambda x: x.split(', '))  # Split multiple nationalities into a list
data['Mind Invites'] = data['Do you mind if your roommate invites people to your flat?'].apply(lambda x: 1 if x == 'Yes' else 0)
data['Invite People'] = data['Would you invite people to your flat?'].apply(lambda x: 1 if x == 'Yes' else 0)
data['Handle Disagreements'] = data['How do you handle disagreements?'].apply(lambda x: {'Mediated discussion': 1, 'Confrontation': 2}.get(x, 0))
data['Sharing Habits'] = data['How do you rate your sharing habits?'] 

In [6]:
# Da, rememeber the more the weight, the more likely those people to be grouped together by the algo.
weights = {
    'Cleanliness': 5,
    'Dishes': 3,
    'Sleeptime Weekdays': 4,
    'Sleeptime Weekends': 4,
    'Noise Tolerance': 3,
    'Party Person': 3,
    'Nationality': 1,
    'Mind Invites': 3,
    'Invite People': 3,
    'Handle Disagreements': 2,
    'Sharing Habits': 3,  
}

In [7]:
def calculate_compatibility_score(student1, student2):
    score = 0
    for factor in weights:
        if factor == 'Nationality':  # Special handling for nationality
            common_nationalities = set(student1['Nationality']).intersection(set(student2['Nationality']))
            score += weights[factor] * (len(common_nationalities) > 0)
        else:
            score += weights[factor] * (student1[factor] == student2[factor])
    return score

In [8]:
num_students = len(data)
compatibility_matrix = np.zeros((num_students, num_students))

for i in range(num_students):
    for j in range(num_students):
        if i != j:
            compatibility_matrix[i, j] = calculate_compatibility_score(data.iloc[i], data.iloc[j])

In [9]:
kmeans = KMeans(n_clusters=num_students // 4).fit(compatibility_matrix)
data['Cluster'] = kmeans.labels_

  super()._check_params_vs_input(X, default_n_init=10)


In [10]:
cost_matrix = np.zeros((num_students, num_students))

for i in range(num_students):
    for j in range(num_students):
        if i != j:
            # Apply constraints
            if data.iloc[i]['Sex'] != data.iloc[j]['Sex']:
                cost_matrix[i, j] = float('inf')
            elif data.iloc[i]['French National'] and data.iloc[j]['French National']:
                cost_matrix[i, j] = float('inf')
            elif data.iloc[i]['French Speaker'] and data.iloc[j]['French Speaker']:
                cost_matrix[i, j] += 8
            elif len(set(data.iloc[i]['Nationality']).intersection(set(data.iloc[j]['Nationality']))) > 0:
                cost_matrix[i, j] += 10
            else:
                cost_matrix[i, j] -= compatibility_matrix[i, j]

row_ind, col_ind = linear_sum_assignment(cost_matrix)

In [11]:
groups = {}
for i in range(0, len(row_ind), 4):
    group = []
    for j in range(4):
        if i + j < len(row_ind):
            group.append(data.iloc[row_ind[i + j]]['Name, Surname'])
    groups[f'Group {i // 4 + 1}'] = group

In [12]:
for group, members in groups.items():
    print(f"{group}: {', '.join(members)}")

Group 1: John Doe, Jane Smith, Alice Johnson, Bob Brown
Group 2: Charlie Davis, Eve Wilson, Frank Harris, Grace Lee


Save data to a file - 

In [17]:
output = pd.DataFrame.from_dict(groups, orient='index').transpose()
output.to_excel('room_allocation_results.xlsx', index=False)