In [9]:
import itertools
import random
import pandas as pd
import numpy as np

# Seed both random sources used by the data generator (the stdlib `random`
# module draws the answers, NumPy draws the Gaussian noise) so a fresh
# top-to-bottom run reproduces the same datasets.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Updated weights for each of the 7 questions
question_weights = {
    "Q1": 0.25,  # Sleep Schedule
    "Q2": 0.10,  # Naps During the Day
    "Q3": 0.20,  # Noise Preference
    "Q4": 0.15,  # Socializing (Inviting Guests Over Frequently)
    "Q5": 0.10,  # Cleaning Habit
    "Q6": 0.10,  # Organization
    "Q7": 0.10   # Conflict Resolution
}

# A pair's base score is the sum of the weights of the questions they agree
# on, so the weights must add up to 1 for a perfect match to score 1.0.
assert abs(sum(question_weights.values()) - 1.0) < 1e-9, "question_weights must sum to 1"

def generate_student_data_with_noise(num_students, noise_std=0.01, weights=None, symmetric=False):
    """
    Generate random binary questionnaire answers and noisy pairwise compatibility scores.

    Every student receives one random 0/1 answer per question. For each
    unordered pair of students the base score is the sum of the weights of the
    questions on which both gave the same answer. Gaussian noise with mean 0
    and standard deviation ``noise_std`` is added, and the result is clipped
    to the [0, 1] range.

    Randomness comes from the global ``random`` and ``np.random`` state, so
    seed both before calling if reproducible output is needed.

    Args:
        num_students: Number of students to generate ("Student_1" ... "Student_N").
        noise_std: Standard deviation of the Gaussian noise added to each score.
        weights: Optional mapping of question id -> weight. When omitted, the
            module-level ``question_weights`` is used. The number of questions
            is taken from this mapping rather than being hard-coded.
        symmetric: When True, each score is also stored under the reversed
            key ``(student2, student1)``. Defaults to False (one entry per pair).

    Returns:
        A tuple ``(df_students, df_compatibility, compatibility_scores)``:
        the answers as a DataFrame indexed by student id with one column per
        question, the scores as a long DataFrame with columns
        "Student 1", "Student 2", "Compatibility Score", and the scores as a
        dict keyed by ``(student, student)`` tuples.
    """
    if weights is None:
        weights = question_weights
    question_ids = list(weights)
    weight_values = list(weights.values())

    # Generate student IDs
    students = [f"Student_{i+1}" for i in range(num_students)]

    # One random binary answer per question, drawn student by student
    student_answers = {
        student: [random.randint(0, 1) for _ in question_ids]
        for student in students
    }

    # Compute compatibility scores for all pairings with noise
    compatibility_scores = {}
    for student1, student2 in itertools.combinations(students, 2):
        answers1 = student_answers[student1]
        answers2 = student_answers[student2]

        # Base compatibility score (weighted sum of matching answers)
        base_score = sum(
            weight
            for a, b, weight in zip(answers1, answers2, weight_values)
            if a == b
        )

        # Add Gaussian noise and keep the value inside [0, 1]
        noisy_score = np.clip(base_score + np.random.normal(0, noise_std), 0, 1)

        compatibility_scores[(student1, student2)] = noisy_score
        if symmetric:
            # Mirror the entry so lookups work in either order
            compatibility_scores[(student2, student1)] = noisy_score

    # Convert to DataFrames for better visualization and usability
    df_students = pd.DataFrame.from_dict(student_answers, orient='index', columns=question_ids)
    df_compatibility = pd.DataFrame(
        [(s1, s2, score) for (s1, s2), score in compatibility_scores.items()],
        columns=["Student 1", "Student 2", "Compatibility Score"]
    )

    return df_students, df_compatibility, compatibility_scores

In [10]:
# Generate datasets for different numbers of students.
# NOTE(review): the RNG seeds are set in the cell that defines
# generate_student_data_with_noise, not here, so re-running only this cell
# continues the random stream and produces different data.
import os

df_students_6, df_compatibility_6, compatibility_scores_6 = generate_student_data_with_noise(6, noise_std=0.01)
df_students_10, df_compatibility_10, compatibility_scores_10 = generate_student_data_with_noise(10, noise_std=0.01)
df_students_20, df_compatibility_20, compatibility_scores_20 = generate_student_data_with_noise(20, noise_std=0.01)
df_students_30, df_compatibility_30, compatibility_scores_30 = generate_student_data_with_noise(30, noise_std=0.01)
df_students_40, df_compatibility_40, compatibility_scores_40 = generate_student_data_with_noise(40, noise_std=0.01)
df_students_50, df_compatibility_50, compatibility_scores_50 = generate_student_data_with_noise(50, noise_std=0.01)

# Save datasets as CSV files for later use.
# to_csv does not create missing parent directories, so make sure "data/" exists.
os.makedirs("data", exist_ok=True)

datasets_to_save = [
    (6, df_students_6, df_compatibility_6),
    (10, df_students_10, df_compatibility_10),
    (20, df_students_20, df_compatibility_20),
    (30, df_students_30, df_compatibility_30),
    (40, df_students_40, df_compatibility_40),
    (50, df_students_50, df_compatibility_50),
]
for n_students, students_df, compatibility_df in datasets_to_save:
    # The student frame keeps its index (the student ids); the pair table does not need one.
    students_df.to_csv(f"data/students_{n_students}.csv", index=True)
    compatibility_df.to_csv(f"data/compatibility_{n_students}.csv", index=False)

## Generate Symmetric Compatibility Matrix

In [3]:
import itertools
import random
import pandas as pd
import numpy as np

# Seed both random sources used by the data generator (the stdlib `random`
# module draws the answers, NumPy draws the Gaussian noise) so that running
# this section from here reproduces the same datasets.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Updated weights for each of the 7 questions
question_weights = {
    "Q1": 0.25,  # Sleep Schedule
    "Q2": 0.10,  # Naps During the Day
    "Q3": 0.20,  # Noise Preference
    "Q4": 0.15,  # Socializing (Inviting Guests Over Frequently)
    "Q5": 0.10,  # Cleaning Habit
    "Q6": 0.10,  # Organization
    "Q7": 0.10   # Conflict Resolution
}

# A pair's base score is the sum of the weights of the questions they agree
# on, so the weights must add up to 1 for a perfect match to score 1.0.
assert abs(sum(question_weights.values()) - 1.0) < 1e-9, "question_weights must sum to 1"

def generate_student_data_with_noise(num_students, noise_std=0.01, weights=None, symmetric=True):
    """
    Generate random binary questionnaire answers and a symmetric set of noisy compatibility scores.

    Every student receives one random 0/1 answer per question. For each
    unordered pair of students the base score is the sum of the weights of the
    questions on which both gave the same answer. Gaussian noise with mean 0
    and standard deviation ``noise_std`` is added once per pair, and the
    result is clipped to the [0, 1] range. By default the same score is stored
    under both ``(student1, student2)`` and ``(student2, student1)``.

    Randomness comes from the global ``random`` and ``np.random`` state, so
    seed both before calling if reproducible output is needed.

    Args:
        num_students: Number of students to generate ("Student_1" ... "Student_N").
        noise_std: Standard deviation of the Gaussian noise added to each score.
        weights: Optional mapping of question id -> weight. When omitted, the
            module-level ``question_weights`` is used. The number of questions
            is taken from this mapping rather than being hard-coded.
        symmetric: When True (the default), each score is stored in both
            directions; when False, only ``(student1, student2)`` is stored.

    Returns:
        A tuple ``(df_students, df_compatibility, compatibility_scores)``:
        the answers as a DataFrame indexed by student id with one column per
        question, the scores as a long DataFrame with columns
        "Student 1", "Student 2", "Compatibility Score" (one row per stored
        key, so both directions appear when ``symmetric`` is True), and the
        scores as a dict keyed by ``(student, student)`` tuples.
    """
    if weights is None:
        weights = question_weights
    question_ids = list(weights)
    weight_values = list(weights.values())

    # Generate student IDs
    students = [f"Student_{i+1}" for i in range(num_students)]

    # One random binary answer per question, drawn student by student
    student_answers = {
        student: [random.randint(0, 1) for _ in question_ids]
        for student in students
    }

    # Compute compatibility scores for all pairings with noise
    compatibility_scores = {}
    for student1, student2 in itertools.combinations(students, 2):
        answers1 = student_answers[student1]
        answers2 = student_answers[student2]

        # Base compatibility score (weighted sum of matching answers)
        base_score = sum(
            weight
            for a, b, weight in zip(answers1, answers2, weight_values)
            if a == b
        )

        # Add Gaussian noise and keep the value inside [0, 1]
        noisy_score = np.clip(base_score + np.random.normal(0, noise_std), 0, 1)

        compatibility_scores[(student1, student2)] = noisy_score
        if symmetric:
            # The noise is drawn once per pair, so both directions hold the identical value
            compatibility_scores[(student2, student1)] = noisy_score

    # Convert to DataFrames for better visualization and usability
    df_students = pd.DataFrame.from_dict(student_answers, orient='index', columns=question_ids)
    df_compatibility = pd.DataFrame(
        [(s1, s2, score) for (s1, s2), score in compatibility_scores.items()],
        columns=["Student 1", "Student 2", "Compatibility Score"]
    )

    return df_students, df_compatibility, compatibility_scores


In [5]:
# Generate datasets for different numbers of students.
# NOTE(review): these files share their names with the CSVs written by the
# earlier (one-direction) generation cell, so sizes 6/10/20/50 are overwritten
# with the symmetric format while sizes 30 and 40 are not regenerated here.
# Confirm whether 30 and 40 should be added to keep the data directory consistent.
import os

df_students_6, df_compatibility_6, compatibility_scores_6 = generate_student_data_with_noise(6, noise_std=0.01)
df_students_10, df_compatibility_10, compatibility_scores_10 = generate_student_data_with_noise(10, noise_std=0.01)
df_students_20, df_compatibility_20, compatibility_scores_20 = generate_student_data_with_noise(20, noise_std=0.01)
df_students_50, df_compatibility_50, compatibility_scores_50 = generate_student_data_with_noise(50, noise_std=0.01)

# Save datasets as CSV files for later use.
# to_csv does not create missing parent directories, so make sure "data/" exists.
os.makedirs("data", exist_ok=True)

datasets_to_save = [
    (6, df_students_6, df_compatibility_6),
    (10, df_students_10, df_compatibility_10),
    (20, df_students_20, df_compatibility_20),
    (50, df_students_50, df_compatibility_50),
]
for n_students, students_df, compatibility_df in datasets_to_save:
    # The student frame keeps its index (the student ids); the pair table does not need one.
    students_df.to_csv(f"data/students_{n_students}.csv", index=True)
    compatibility_df.to_csv(f"data/compatibility_{n_students}.csv", index=False)