In [50]:
pip install pandas numpy scikit-learn sentence-transformers




In [53]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# ── 1. Load data
df = pd.read_csv('/indian_dummy_user_data.csv')

# ── 2. Ensure all necessary text columns exist
survey_cols = ['Vibe', 'Cleaning', 'Chatting', 'Noise Comfort', 'Work Setup', 'Tone']
for col in survey_cols:
    if col not in df.columns:
        df[col] = ""

# ── 3. Combine all text responses into one field
df['full_text'] = df[survey_cols].fillna('').apply(lambda row: " | ".join(row.astype(str)), axis=1)

# ── 4. Structured (categorical) features
structured_cols = ['Diet', 'Personality', 'Sleep Habit', 'Noise Tolerance', 'Smoke Alcohol']
enc_struct = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_struct_raw = enc_struct.fit_transform(df[structured_cols].astype(str))

# Normalize structured data
struct_scaler = MinMaxScaler()
X_struct = struct_scaler.fit_transform(X_struct_raw)

# ── 5. Encode full_text using SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')
emb_struct = model.encode(df['full_text'].tolist(), convert_to_numpy=True, normalize_embeddings=True)

# ── 6. Compatibility scoring between 2 users
def compatibility_score(idx_a, idx_b, w=(0.7, 0.3)):
    xa, xb = emb_struct[idx_a:idx_a+1], emb_struct[idx_b:idx_b+1]
    xs, ys = X_struct[idx_a:idx_a+1], X_struct[idx_b:idx_b+1]
    t_sim = cosine_similarity(xa, xb)[0][0]
    s_sim = cosine_similarity(xs, ys)[0][0]
    return round(float(w[0] * t_sim + w[1] * s_sim) * 100, 1)

# ── 7. Example: Top matches for a user at index 0
base_idx = 5
scores = [(i, compatibility_score(base_idx, i)) for i in range(len(df)) if i != base_idx]
scores = sorted(scores, key=lambda x: x[1], reverse=True)
print(f"Top matches for user #{base_idx}:")
for i, sc in scores[:5]:
    print(f"• {df.loc[i, 'First Name']} {df.loc[i, 'Last Name']}: {sc}% match")

# ── 8. New user scoring + match explanations
def score_new_user(user_q: dict, user_struct: dict, text_w=0.7, struct_w=0.3):
    # Build user text and embed
    utt = " | ".join([user_q.get(k, "") for k in survey_cols])
    emb_user = model.encode([utt], normalize_embeddings=True)[0]

    # Structured encoding
    vec_struct = enc_struct.transform([[user_struct.get(k, "") for k in structured_cols]])
    vec_struct_scaled = struct_scaler.transform(vec_struct)

    # Similarities
    sim_text = cosine_similarity(emb_user.reshape(1, -1), emb_struct)[0]
    sim_struct = cosine_similarity(vec_struct_scaled.reshape(1, -1), X_struct)[0]
    final = np.clip(sim_text, 0, 1) * text_w + sim_struct * struct_w

    # Explanation of shared structured traits
    reasons = []
    for i in range(len(df)):
        shared = [col for col in structured_cols
                  if user_struct.get(col, "").lower() == str(df.loc[i, col]).lower()]
        reason = "Similar in: " + (", ".join(shared) if shared else "Mostly vibe-based match")
        reasons.append(reason)

    result = pd.DataFrame({
        'First Name': df['First Name'],
        'Last Name': df['Last Name'],
        'Compatibility (%)': np.round(final * 100, 1),
        'Vibe': df['Vibe'],
        'Why Match?': reasons
    }).sort_values('Compatibility (%)', ascending=False)

    return result.reset_index(drop=True)

# ── Example new user
new_q = {
    'Vibe': 'creative and messy',
    'Cleaning': 'super organized',
    'Chatting': 'prefers quiet space',
    'Noise Comfort': 'soft like a library',
    'Work Setup': 'office commute daily',
    'Tone': 'friendly and chill'
}
new_struct = {
    'Diet': 'Vegetarian',
    'Personality': 'Ambivert',
    'Sleep Habit': 'Early Bird',
    'Noise Tolerance': 'Low',
    'Smoke Alcohol': 'No'
}

# Run match
top = score_new_user(new_q, new_struct)
print("\nTop 10 matches for the new user:")
print(top.head(10))


  return forward_call(*args, **kwargs)


Top matches for user #5:
• Bhavin Singhal: 94.0% match
• Vedika Krish: 94.0% match
• Anahita Kurian: 94.0% match
• Amira Kala: 94.0% match
• Biju Mangal: 94.0% match

Top 10 matches for the new user:
  First Name      Last Name  Compatibility (%) Vibe  \
0      Anahi          Arora               37.2        
1       Riya          Manda               37.2        
2       Urvi          Shere               37.2        
3        Ela  Krishnamurthy               37.2        
4      Kabir            Dar               37.2        
5      Zaina           Kaur               37.2        
6      Gatik     Srinivasan               37.2        
7  Dharmajan      Choudhury               29.5        
8      Jayan           Toor               29.5        
9     Himmat          Kumar               29.5        

                                          Why Match?  
0  Similar in: Personality, Noise Tolerance, Smok...  
1  Similar in: Personality, Noise Tolerance, Smok...  
2  Similar in: Personality, N

  return forward_call(*args, **kwargs)


In [54]:
import os
import csv
import numpy as np
import soundfile as sf
import pandas as pd

# 1. Create folder structure
root = "SarthiApp_demo"
voice_dir = os.path.join(root, "voice_samples")
os.makedirs(voice_dir, exist_ok=True)

# 2. Define sample dataset rows
rows = [
    ["Riya", "Goel", "creative and messy", "super organized", "prefers quiet space", "soft like a library", "office commute daily", "friendly and chill", "Vegetarian", "Ambivert", "Early Bird", "Low", "No", "voice_samples/user_0.wav"],
    ["Ananya", "Sharma", "fun-loving and active", "ok with a little mess", "chatty and sociable", "lively and vibrant", "WFH setup", "enthusiastic and expressive", "Non-Vegetarian", "Extrovert", "Night Owl", "High", "Yes", "voice_samples/user_1.wav"],
    ["Neha", "Verma", "calm and focused", "clean and tidy", "quiet", "calm and cozy", "hybrid worker", "calm and mature", "Jain", "Introvert", "Early Bird", "Medium", "No", "voice_samples/user_2.wav"],
    ["Mihir", "Patel", "laid back", "cleaner than most", "loves deep convos", "white noise fan", "remote full-time", "chill and funny", "Eggitarian", "Ambivert", "Night Owl", "High", "No", "voice_samples/user_3.wav"],
    ["Simran", "Kaur", "quiet but warm", "super clean", "soft talker", "needs pin drop silence", "hybrid setup", "shy but sweet", "Vegan", "Introvert", "Early Bird", "Low", "No", "voice_samples/user_4.wav"],
    ["Arjun", "Mehta", "energized and loud", "not very tidy", "talks a lot", "noisy with music", "WFH with music pump", "loud and extrovert", "Non-Vegetarian", "Extrovert", "Night Owl", "Very High", "Yes", "voice_samples/user_5.wav"],
]

csv_path = os.path.join(root, "sample_user_data.csv")

# 3. Write CSV
with open(csv_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    header = ["First Name","Last Name","Vibe","Cleaning","Chatting","Noise Comfort","Work Setup","Tone","Diet","Personality","Sleep Habit","Noise Tolerance","Smoke Alcohol","Voice Path"]
    writer.writerow(header)
    writer.writerows(rows)

# 4. Generate synthetic .wav files (sine waves at different frequencies)
sr = 16000
duration = 3.0

for idx, freq in enumerate([200, 300, 400, 500, 600, 700]):
    t = np.linspace(0, duration, int(sr * duration), endpoint=False)
    sine = 0.5 * np.sin(2 * np.pi * freq * t)
    path = os.path.join(root, "voice_samples", f"user_{idx}.wav")
    sf.write(path, sine, sr)

print("Sample data + voice .wav files created under:", root)


Sample data + voice .wav files created under: SarthiApp_demo


In [57]:
!pip install speechbrain torchaudio


Collecting speechbrain
  Downloading speechbrain-1.0.3-py3-none-any.whl.metadata (24 kB)
Collecting hyperpyyaml (from speechbrain)
  Downloading HyperPyYAML-1.2.2-py3-none-any.whl.metadata (7.6 kB)
Collecting ruamel.yaml>=0.17.28 (from hyperpyyaml->speechbrain)
  Downloading ruamel.yaml-0.18.14-py3-none-any.whl.metadata (24 kB)
Collecting ruamel.yaml.clib>=0.2.7 (from ruamel.yaml>=0.17.28->hyperpyyaml->speechbrain)
  Downloading ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.7 kB)
Downloading speechbrain-1.0.3-py3-none-any.whl (864 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m864.1/864.1 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading HyperPyYAML-1.2.2-py3-none-any.whl (16 kB)
Downloading ruamel.yaml-0.18.14-py3-none-any.whl (118 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.6/118.6 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ruamel.yaml.clib-0.2.12-cp311-cp

In [60]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from speechbrain.pretrained import EncoderClassifier
import torchaudio

# Load sample dataset
df = pd.read_csv("SarthiApp_demo/sample_user_data.csv")

# Build structured features
survey_cols = ['Vibe', 'Cleaning', 'Chatting', 'Noise Comfort', 'Work Setup', 'Tone']
structured_cols = ['Diet','Personality','Sleep Habit','Noise Tolerance','Smoke Alcohol']

enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_struct = enc.fit_transform(df[structured_cols].astype(str))
X_struct = MinMaxScaler().fit_transform(X_struct)

# Text embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')
emb_text = model.encode(df[survey_cols].fillna('').apply(lambda row: " | ".join(row), axis=1).tolist(),
                        convert_to_numpy=True, normalize_embeddings=True)

# Voice embeddings from SpeechBrain
voice_model = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")

def extract_voice_emb(path):
    signal, fs = torchaudio.load(path)
    if fs != 16000:
        signal = torchaudio.transforms.Resample(orig_freq=fs, new_freq=16000)(signal)
    emb = voice_model.encode_batch(signal).squeeze().detach().cpu().numpy()
    return emb / np.linalg.norm(emb)

voice_emb = []
for p in df['Voice Path']:
    full = os.path.join("SarthiApp_demo", p)
    if os.path.exists(full):
        voice_emb.append(extract_voice_emb(full))
    else:
        voice_emb.append(np.zeros((192,)))
voice_emb = np.array(voice_emb)

# Scoring functions
def comp(idx_a, idx_b):
    t = cosine_similarity(emb_text[idx_a:idx_a+1], emb_text[idx_b:idx_b+1])[0][0]
    s = cosine_similarity(X_struct[idx_a:idx_a+1], X_struct[idx_b:idx_b+1])[0][0]
    v = cosine_similarity(voice_emb[idx_a:idx_a+1], voice_emb[idx_b:idx_b+1])[0][0]
    return round((0.5*t + 0.3*s + 0.2*v) * 100, 1)

# Example: top matches for user 0
scores = [(i, comp(0, i)) for i in range(len(df)) if i != 0]
scores = sorted(scores, key=lambda x: x[1], reverse=True)
print("Top matches for user 5:")
for i, sc in scores[:5]:
    print(f"• {df.loc[i,'First Name']} {df.loc[i,'Last Name']}: {sc}%")


  return forward_call(*args, **kwargs)
INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
DEBUG:speechbrain.utils.parameter_transfer:Fetching files for pretraining (no collection directory set)
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /root/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["

Top matches for user 5:
• Neha Verma: 60.5%
• Simran Kaur: 59.7%
• Mihir Patel: 57.0%
• Ananya Sharma: 45.1%
• Arjun Mehta: 42.7%
