In [1]:
# pip install -U sentence-transformers

In [2]:
# pip install tf-keras

In [3]:
# pip install accelerate

In [4]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import cosine_distances
from accelerate import init_empty_weights

2025-04-08 17:18:40.756503: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
df = pd.read_excel('value structure 2022-2024.xlsx')

In [6]:
df.head()

Unnamed: 0,id_ego,Person Name,From,To,NoMeaningfulEdges,Operations,Data
0,7023.0,Josh Adler,strength,ecstasy,1.0,,2022 A
1,7023.0,Josh Adler,strength,discovery,1.0,,2022 A
2,7023.0,Josh Adler,strength,bravery,1.0,,2022 A
3,7023.0,Josh Adler,strength,calm,1.0,,2022 A
4,7023.0,Josh Adler,calm,connected to the universe,1.0,,2022 A


# User-Values Dict

In [7]:
# Step 1: Basic cleaning function (same as before)
def clean_text(text):
    if pd.isna(text):
        return None
    text = str(text).strip().lower()
    text = re.sub(r'[^a-zA-Z0-9\s\-]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text

# Step 2: Clean From and To columns
df['From_clean'] = df['From'].apply(clean_text)
df['To_clean'] = df['To'].apply(clean_text)

# Step 3: Build individual_values dictionary
individual_values = defaultdict(set)

for _, row in df.iterrows():
    pid = str(row['id_ego']).strip()
    name = row['Person Name'].strip()
    key = (pid, name)
    
    from_val = row['From_clean']
    to_val = row['To_clean']
    
    if from_val: individual_values[key].add(from_val)
    if to_val: individual_values[key].add(to_val)

# Convert sets to sorted lists
individual_values = {
    k: sorted(list(v)) for k, v in individual_values.items()
}

# Preview:
for k, v in list(individual_values.items())[:5]:
    print(f"{k}: {v}")

('7023.0', 'Josh Adler'): ['bravery', 'calm', 'connected to the universe', 'discovery', 'ecstasy', 'growth', 'strength', 'zen']
('7053.0', 'Mohit Agarwal'): ['empathy', 'family', 'gratitude', 'make things better', 'organization', 'peace', 'professionalism', 'quality']
('7051.0', 'Jamie Bergos'): ['beauty', 'connection to the universe', 'excellence', 'purpose', 'relationships', 'resilience', 'service', 'strong work ethic']
('7059.0', 'Jeremy Browning'): ['achievement', 'beauty', 'family', 'health', 'knowledge', 'legacy', 'respect', 'self awareness']
('7059.0', 'Jeremy Browni+D372'): ['beauty', 'knowledge']


# Top Values

In [8]:
# group by id_ego
def get_top_values(subdf):
    from_set = set(subdf['From'].dropna())
    to_set = set(subdf['To'].dropna())
    top_values = to_set - from_set
    return top_values

df_top = df.groupby('id_ego').apply(get_top_values).reset_index()
df_top.columns = ['id_ego', 'top_values']
df_top = pd.DataFrame(df_top)

  df_top = df.groupby('id_ego').apply(get_top_values).reset_index()


In [9]:
df_top.head()

Unnamed: 0,id_ego,top_values
0,4000.0,{happiness}
1,4001.0,{stability}
2,4002.0,"{achievement, joy}"
3,4003.0,{inner peace}
4,4004.0,{happiness}


# Embeddings

In [10]:
# 10 concepts
concepts = ['accuracy', 'speed', 'vigilance', 'optimism', 'persistence', 
            'change', 'detailed', 'holistic', 'analytic', 'creative']

# all values
all_values = pd.concat([df['From'], df['To']]).dropna().unique()

# all values and concepts
all_phrases = list(set(list(all_values) + concepts))

# embed all values and concepts
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(all_phrases, convert_to_tensor=True)

# create a dictionary of all values and concepts
embedding_dict = dict(zip(all_phrases, embeddings))

# create a dataframe of all values and concepts
embedding_df = pd.DataFrame(embeddings.numpy(), index=all_phrases)
embedding_df = embedding_df.reset_index()
embedding_df = embedding_df.rename(columns={'index': 'phrase'})

In [11]:
embedding_df.head()

Unnamed: 0,phrase,0,1,2,3,4,5,6,7,8,...,374,375,376,377,378,379,380,381,382,383
0,self-esteem,-0.016603,0.088842,-0.000807,0.054129,-0.036652,-0.030265,0.119285,0.056749,0.037477,...,0.003607,0.020132,0.021875,0.102146,-0.017888,0.024328,0.090815,0.024054,0.024524,-0.060949
1,perseverance / resilience / growth mindset,0.097316,0.036831,-0.054521,-0.033749,0.007353,0.027244,0.040391,0.029388,-0.006299,...,0.071773,0.023278,0.02405,0.02218,-0.103774,0.015832,0.060365,-0.030772,-0.012773,0.018929
2,friends / family,-0.106732,0.040435,-0.002001,0.002275,0.006255,-0.013568,0.091841,0.019049,0.008936,...,0.028157,0.004332,-0.029443,0.069398,0.012834,-0.041058,0.110648,0.077042,-0.04043,0.013021
3,equality,-0.030622,0.136613,0.011752,0.046158,-0.059474,-0.024489,0.098906,-0.060738,0.052511,...,0.03746,-0.051372,0.060111,0.063894,0.020128,0.118888,0.089834,0.013412,0.090119,-0.023605
4,ease,0.004611,-0.004951,0.048202,0.068449,0.028766,0.03142,0.036592,0.040927,-0.114077,...,0.064859,0.032081,-0.006101,-0.004273,-0.007631,0.108985,0.019628,0.044703,0.067917,0.030597


# Cosine_distance

In [12]:
# Step 1: Separate value and concept embeddings from a combined dictionary
value_embeddings = {k: v for k, v in embedding_dict.items() if k not in concepts}
concept_embeddings = {k: v for k, v in embedding_dict.items() if k in concepts}

# Step 2: Build distance matrix from each value to each concept
# Convert dicts to matrix form
value_names = list(value_embeddings.keys())
concept_names = list(concept_embeddings.keys())

value_matrix = np.vstack([value_embeddings[k] for k in value_names])
concept_matrix = np.vstack([concept_embeddings[k] for k in concept_names])

# Step 3: Compute cosine distances
distance_matrix = cosine_distances(value_matrix, concept_matrix)

# Step 4: Create DataFrame with distances (rows = values, columns = concepts)
distance_df = pd.DataFrame(distance_matrix, columns=concept_names)
distance_df['value'] = value_names

concept_order = [
    'accuracy', 'speed',
    'vigilance', 'optimism',
    'persistence', 'change',
    'detailed', 'holistic',
    'analytic', 'creative'
]
cols = ['value'] + concept_order
distance_df = distance_df[cols]

In [13]:
distance_df.head()

Unnamed: 0,value,accuracy,speed,vigilance,optimism,persistence,change,detailed,holistic,analytic,creative
0,self-esteem,0.785013,0.800269,0.697333,0.607589,0.69932,0.825947,0.717717,0.764451,0.804444,0.844047
1,perseverance / resilience / growth mindset,0.908914,0.886947,0.696283,0.64525,0.634146,0.873669,0.905335,0.879627,0.908235,0.829355
2,friends / family,0.866188,0.710388,0.839124,0.795964,0.71232,0.751433,0.74135,0.781012,0.90725,0.728711
3,equality,0.795324,0.720196,0.884878,0.750735,0.788851,0.633895,0.790548,0.874019,0.756058,0.846418
4,ease,0.706264,0.588603,0.774066,0.777372,0.780702,0.662028,0.701401,0.765908,0.811612,0.650856


# Cosine_score

In [14]:
# Step 1: Define direction vectors using your definitions
direction_defs = {
    'speed': ('accuracy', 'speed'),
    'optimism': ('vigilance', 'optimism'),
    'change': ('persistence', 'change'),
    'holistic': ('detailed', 'holistic'),
    'creative': ('analytic', 'creative'),
}

# Step 2: Create lookup dictionary from embedding_df
value_embedding_dict = dict(
    zip(
        embedding_df['phrase'],
        embedding_df.drop(columns='phrase').values
    )
)

# Step 3: Construct direction vectors: direction(dim) = pos - neg
direction_vectors = {
    dim: value_embedding_dict[pos] - value_embedding_dict[neg]
    for dim, (neg, pos) in direction_defs.items()
}

In [15]:
# Step 1: Initialize a list to store each row of results
distance_rows = []

# Step 2: For each value, compute cosine distance to each psychological direction
for val in value_names:
    val_vec = value_embedding_dict[val]  # Get the embedding vector of the value
    row = {'value': val}  # Start a row with the value name
    
    for dim, dir_vec in direction_vectors.items():
        # Compute cosine distance between the value and the direction vector
        distance = cosine_distances([val_vec], [dir_vec])[0][0]
        row[dim] = distance  # Store the result under the dimension name
    
    # Add the row to the list
    distance_rows.append(row)

# Step 3: Convert the list of rows into a DataFrame
value_direction_df = pd.DataFrame(distance_rows)


In [16]:
value_direction_df.head()

Unnamed: 0,value,speed,optimism,change,holistic,creative
0,self-esteem,1.014182,0.918423,1.105987,1.037404,1.031559
1,perseverance / resilience / growth mindset,0.979581,0.953612,1.200482,0.979424,0.937142
2,friends / family,0.855176,0.960768,1.032737,1.031743,0.857724
3,equality,0.930165,0.878064,0.870301,1.066806,1.072007
4,ease,0.890628,1.003005,0.90067,1.051629,0.871895


# Individual_Score

In [17]:
# Step 1: Convert value_direction_df to lookup dictionary
value_score_dict = value_direction_df.set_index('value').to_dict(orient='index')
# {'self-harmony': {'speed': ..., 'optimism': ..., ...}, ...}

# Step 2: Prepare result list
individual_scores = []

# Step 3: For each individual, aggregate scores across their values
for (pid, name), values in individual_values.items():
    score_sum = {'speed': 0, 'optimism': 0, 'change': 0, 'holistic': 0, 'creative': 0}
    count = 0
    
    for val in values:
        if val in value_score_dict:
            for dim in score_sum:
                score_sum[dim] += value_score_dict[val][dim]
            count += 1
    
    # Compute average if count > 0
    if count > 0:
        avg_scores = {dim: score_sum[dim]/count for dim in score_sum}
    else:
        avg_scores = {dim: None for dim in score_sum}  # or use np.nan

    # Add ID and name
    avg_scores['id'] = pid
    avg_scores['name'] = name
    individual_scores.append(avg_scores)

# Step 4: Convert to DataFrame
individual_score_df = pd.DataFrame(individual_scores)

# Reorder columns
cols = ['id', 'name', 'speed', 'optimism', 'change', 'holistic', 'creative']
individual_score_df = individual_score_df[cols]


In [18]:
individual_score_df

Unnamed: 0,id,name,speed,optimism,change,holistic,creative
0,7023.0,Josh Adler,0.920541,1.005041,1.012527,1.021717,0.936188
1,7053.0,Mohit Agarwal,0.952482,0.956239,0.986481,1.023192,0.908079
2,7051.0,Jamie Bergos,0.948666,1.008218,1.066455,1.032304,0.932637
3,7059.0,Jeremy Browning,0.970015,0.995761,1.012033,1.054411,0.921182
4,7059.0,Jeremy Browni+D372,0.982139,0.963243,1.037261,1.082329,0.903842
...,...,...,...,...,...,...,...
720,10107.0,Christopher Morris,0.980903,0.962864,0.995594,1.057146,0.934673
721,10105.0,Busisiwe Mombaur,1.007522,0.945221,1.047138,1.113212,0.933475
722,,Max Quinn,0.934258,0.920721,1.009979,1.052446,0.922240
723,10125.0,Curtis Rose,0.949117,0.901995,1.062297,1.095718,0.888292


# Individual Distance

## Granular Level

In [26]:
# Step 1: Reshape the distance_df to long format
value_concept_wide = distance_df.melt(id_vars='value', 
                                            var_name='concept', 
                                            value_name='distance')

# Step 2: Map individuals to their values
rows = []

for (pid, name), values in individual_values.items():
    for val in values:
        matched_rows = value_concept_wide[value_concept_wide['value'] == val]
        for _, row in matched_rows.iterrows():
            rows.append({
                'id': pid,
                'name': name,
                'value': val,
                'concept': row['concept'],
                'distance': row['distance']
            })

# Step 3: Build long-format DataFrame
long_df = pd.DataFrame(rows)

# Step 4: Pivot into wide format (concepts as columns)
wide_df = long_df.pivot_table(
    index=['id', 'name', 'value'],
    columns='concept',
    values='distance'
).reset_index()

cols_dis = ['id', 'name', 'value'] + concept_order
wide_df = wide_df[cols_dis]


In [27]:
wide_df

concept,id,name,value,accuracy,speed,vigilance,optimism,persistence,change,detailed,holistic,analytic,creative
0,10001.0,Sevinj Abdullayeva,appreciation,0.758332,0.740392,0.648256,0.619869,0.828005,0.682074,0.737746,0.775258,0.737472,0.677993
1,10001.0,Sevinj Abdullayeva,balance,0.731530,0.730251,0.718066,0.691257,0.678201,0.703486,0.781066,0.793019,0.774812,0.749840
2,10001.0,Sevinj Abdullayeva,family,0.776483,0.675549,0.824004,0.749274,0.712656,0.712907,0.703903,0.803564,0.869543,0.741106
3,10001.0,Sevinj Abdullayeva,happiness,0.878913,0.679593,0.678123,0.460599,0.610629,0.708003,0.723332,0.730679,0.726232,0.631593
4,10001.0,Sevinj Abdullayeva,peace,0.802051,0.683757,0.625415,0.624267,0.735466,0.666284,0.675184,0.728380,0.816398,0.641782
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5634,,Yuna Lee,impact,0.738506,0.680743,0.732874,0.751373,0.756305,0.642927,0.691633,0.891178,0.728901,0.745725
5635,,Yuna Lee,motivation,0.801043,0.774864,0.743445,0.599723,0.655406,0.743651,0.689161,0.802328,0.799160,0.680263
5636,,Yuna Lee,priority,0.713290,0.613894,0.776329,0.740348,0.685665,0.756895,0.678274,0.780979,0.831897,0.775832
5637,,Yuna Lee,self confidence,0.614771,0.814183,0.663327,0.508814,0.656593,0.892471,0.697159,0.827957,0.886822,0.827363


## Average Level

In [28]:
# Step: Group by 'name' and compute mean of each concept column
individual_concept_df = wide_df.groupby('name')[concept_order].mean().reset_index()

# Keep one id per name (if available)
id_lookup = wide_df[['name', 'id']].drop_duplicates()

# Merge back into the final profile
individual_concept_df = id_lookup.merge(individual_concept_df, on='name')


In [30]:
individual_concept_df.head()

concept,name,id,accuracy,speed,vigilance,optimism,persistence,change,detailed,holistic,analytic,creative
0,Sevinj Abdullayeva,10001.0,0.777084,0.717548,0.679767,0.62124,0.695295,0.721323,0.734092,0.771458,0.803635,0.712655
1,Elaff Abudawood,10002.0,0.822417,0.781983,0.76466,0.701804,0.742333,0.757479,0.756653,0.813099,0.866933,0.755664
2,Dilbar Ahmadli,10003.0,0.770278,0.733613,0.756028,0.706251,0.666908,0.722812,0.721563,0.804439,0.795864,0.753052
3,Felipe Alvarez,10005.0,0.785766,0.745513,0.701281,0.686291,0.710268,0.743212,0.742882,0.777441,0.821545,0.762805
4,Tatika Catipay,10008.0,0.773343,0.729919,0.782416,0.732539,0.7282,0.700765,0.70776,0.766011,0.79348,0.742489


## Alternative method (similar above; calculate average only)

In [31]:
value_distance_dict = distance_df.set_index('value').to_dict(orient = 'index')

In [33]:
individual_distance = []

for (pid,name), values in individual_values.items():
    distance_sum = {concept: 0 for concept in concept_order}
    count = 0

    for val in values:
        if val in value_distance_dict:
            for dim in distance_sum:
                distance_sum[dim] += value_distance_dict[val][dim]
            count += 1
    
    if count > 0:
        avg_distances = {dim: distance_sum[dim]/count for dim in distance_sum}
    else:
        avg_distances = {dim: None for dim in distance_sum}

    avg_distances['id'] = pid
    avg_distances['name'] = name
    individual_distance.append(avg_distances)

individual_distance_df = pd.DataFrame(individual_distance)

cols_dis = ['id', 'name'] + concept_order
individual_distance_df = individual_distance_df[cols_dis]

individual_distance_df.head()

Unnamed: 0,id,name,accuracy,speed,vigilance,optimism,persistence,change,detailed,holistic,analytic,creative
0,7023.0,Josh Adler,0.798643,0.713161,0.732056,0.737602,0.725871,0.740838,0.731905,0.75904,0.792275,0.712199
1,7053.0,Mohit Agarwal,0.784631,0.733512,0.723252,0.67511,0.722109,0.705958,0.716817,0.745794,0.827761,0.712411
2,7051.0,Jamie Bergos,0.785623,0.730399,0.719707,0.728747,0.684987,0.764384,0.721098,0.76146,0.816953,0.732421
3,7059.0,Jeremy Browning,0.744822,0.712565,0.725047,0.720383,0.690064,0.704441,0.68948,0.757464,0.810844,0.711937
4,7059.0,Jeremy Browni+D372,0.692667,0.673452,0.736032,0.695596,0.655274,0.699791,0.614007,0.716872,0.763041,0.642374
