In [1]:
# pip install -U sentence-transformers

In [2]:
# pip install tf-keras

In [3]:
# pip install accelerate

In [2]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import cosine_distances
from accelerate import init_empty_weights

2025-04-08 20:02:42.844261: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
df = pd.read_excel('value structure 2022-2024.xlsx')

In [4]:
df.head()

Unnamed: 0,id_ego,Person Name,From,To,NoMeaningfulEdges,Operations,Data
0,7023.0,Josh Adler,strength,ecstasy,1.0,,2022 A
1,7023.0,Josh Adler,strength,discovery,1.0,,2022 A
2,7023.0,Josh Adler,strength,bravery,1.0,,2022 A
3,7023.0,Josh Adler,strength,calm,1.0,,2022 A
4,7023.0,Josh Adler,calm,connected to the universe,1.0,,2022 A


# User-Values Dict
**Output**
- individual_values (Dict)

In [5]:
# Data cleaning function
def clean_text(text):
    if pd.isna(text):
        return None
    text = str(text).strip().lower()
    text = re.sub(r'[^a-zA-Z0-9\s\-]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text

# Clean From and To columns
df['From_clean'] = df['From'].apply(clean_text)
df['To_clean'] = df['To'].apply(clean_text)

In [6]:
# Build individual_values dictionary
individual_values = defaultdict(set)

for _, row in df.iterrows():
    pid = str(row['id_ego']).strip()
    name = row['Person Name'].strip()
    key = (pid, name)
    
    from_val = row['From_clean']
    to_val = row['To_clean']
    
    if from_val: individual_values[key].add(from_val)
    if to_val: individual_values[key].add(to_val)

# Convert sets to sorted lists
individual_values = {
    k: sorted(list(v)) for k, v in individual_values.items()
}

In [7]:
# Preview:
for k, v in list(individual_values.items())[:5]:
    print(f"{k}: {v}")

('7023.0', 'Josh Adler'): ['bravery', 'calm', 'connected to the universe', 'discovery', 'ecstasy', 'growth', 'strength', 'zen']
('7053.0', 'Mohit Agarwal'): ['empathy', 'family', 'gratitude', 'make things better', 'organization', 'peace', 'professionalism', 'quality']
('7051.0', 'Jamie Bergos'): ['beauty', 'connection to the universe', 'excellence', 'purpose', 'relationships', 'resilience', 'service', 'strong work ethic']
('7059.0', 'Jeremy Browning'): ['achievement', 'beauty', 'family', 'health', 'knowledge', 'legacy', 'respect', 'self awareness']
('7059.0', 'Jeremy Browni+D372'): ['beauty', 'knowledge']


# User Top Values
**Output**
- df_top (DataFrame)

In [10]:
# group by id_ego
def get_top_values(subdf):
    from_set = set(subdf['From'].dropna())
    to_set = set(subdf['To'].dropna())
    top_values = to_set - from_set
    return top_values

df_top = df.groupby('id_ego').apply(get_top_values, include_groups=False).reset_index()
df_top.columns = ['id_ego', 'top_values']
df_top = pd.DataFrame(df_top)

In [11]:
df_top.head()

Unnamed: 0,id_ego,top_values
0,4000.0,{happiness}
1,4001.0,{stability}
2,4002.0,"{joy, achievement}"
3,4003.0,{inner peace}
4,4004.0,{happiness}


# Embeddings
**Output**
- embedding_dict (Dict)
    - value_embeddings
    - concept_embeddings
- embedding_df (DataFrame)

In [38]:
# Pre-defined  10 concepts
all_concepts = [
    'accuracy', 'speed',
    'vigilance', 'optimism',
    'persistence', 'change',
    'detailed', 'holistic',
    'analytic', 'creative'
]

# Organize all values
values = pd.concat([df['From'], df['To']]).dropna().unique()
all_values = list(set(values))

# Combine all values and concepts
all_phrases = list(set(all_values + all_concepts))

# Embed all values and concepts
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(all_phrases, convert_to_tensor=True)

# Create a dictionary of all values and concepts
# Separate embeddings into value and concept embeddings
embedding_dict = dict(zip(all_phrases, embeddings))
value_embeddings = {k: v for k, v in embedding_dict.items() if k in all_values}
concept_embeddings = {k: v for k, v in embedding_dict.items() if k in all_concepts}

# Create a dataframe of all values and concepts
embedding_df = pd.DataFrame(embeddings.numpy(), index=all_phrases).reset_index()
embedding_df = embedding_df.rename(columns={'index': 'phrase'})

In [39]:
embedding_df.head()

Unnamed: 0,phrase,0,1,2,3,4,5,6,7,8,...,374,375,376,377,378,379,380,381,382,383
0,delayed gratification,-0.012344,-0.005913,0.014359,0.021533,0.061565,0.056946,0.112863,0.001354,0.078411,...,0.0215,-0.044412,0.017278,-0.010454,-0.003615,0.022982,0.042663,-0.058685,-0.006582,-0.0067
1,financial,0.02163,0.054482,-0.065966,0.050728,-0.017508,-0.013171,0.12033,0.051142,0.072215,...,0.00187,-0.006667,-0.041264,0.039454,-0.087942,-0.042742,0.110148,-0.087574,0.042817,-0.009624
2,culture,0.025909,0.074448,-0.059861,0.007025,-0.03074,-0.016011,0.083088,-0.079464,0.022216,...,0.031799,-0.064812,0.032322,0.058966,0.021195,0.040196,0.085371,0.072337,0.065646,-0.018934
3,"family , friends & relationships",-0.070513,0.07679,0.003241,0.029457,-0.04677,0.037048,0.041444,-3.3e-05,0.059619,...,0.034651,0.032431,0.024282,0.060002,0.028608,-0.041868,0.121541,0.039114,0.020478,0.003897
4,trusty,-0.092181,-0.029805,-0.038103,-0.008671,-0.01103,-0.064135,0.088889,-0.045214,0.037013,...,0.02159,-0.010783,-0.015152,0.027539,-0.019465,0.03365,0.076907,0.020782,0.04558,-0.044809


# Cosine_distance
**Output**
- distance_df (DataFrame): value - concept pairs

In [40]:
# Convert dicts to matrix form
value_matrix = np.vstack([value_embeddings[k] for k in all_values])
concept_matrix = np.vstack([concept_embeddings[k] for k in all_concepts])

# Compute cosine distances
distance_matrix = cosine_distances(value_matrix, concept_matrix)

# Create DataFrame with distances (rows = values, columns = concepts)
distance_df = pd.DataFrame(distance_matrix, columns=all_concepts)
distance_df['value'] = all_values

cols = ['value'] + all_concepts
distance_df = distance_df[cols]

In [41]:
distance_df.head()

Unnamed: 0,value,accuracy,speed,vigilance,optimism,persistence,change,detailed,holistic,analytic,creative
0,delayed gratification,0.916968,0.822488,0.656986,0.592572,0.740763,0.831818,0.789672,0.881745,0.846951,0.798913
1,financial,0.787531,0.648659,0.879571,0.695945,0.703519,0.718805,0.719594,0.887873,0.83224,0.758707
2,culture,0.792015,0.742714,0.791626,0.732219,0.690387,0.683822,0.706797,0.665466,0.776402,0.736213
3,trusty,0.712684,0.789946,0.735875,0.663073,0.791247,0.762146,0.704637,0.797252,0.922574,0.698305
4,"family , friends & relationships",0.910645,0.782363,0.832291,0.79254,0.711107,0.780303,0.757012,0.782036,0.911081,0.834835


# Cosine_score (w/ vector)
**Output**
- direction_score_dict (Dict): value - concept vector pairs
- direction_score_df (DataFrame)

In [44]:
# Define direction vectors
direction_defs = {
    'speed': ('accuracy', 'speed'),
    'optimism': ('vigilance', 'optimism'),
    'change': ('persistence', 'change'),
    'holistic': ('detailed', 'holistic'),
    'creative': ('analytic', 'creative'),
}

# Construct direction vectors: direction(dim) = pos - neg
direction_vectors = {
    dim: embedding_dict[pos] - embedding_dict[neg]
    for dim, (neg, pos) in direction_defs.items()
}

In [83]:
# Initialize a dict to store distances per value
direction_score_dict = {}

# Compute cosine distance to each psychological direction
for val in all_values:
    val_vec = embedding_dict[val]  # Get the embedding vector of the value
    
    # For this value, compute its distance to each dimension
    distances = {}
    for dim, dir_vec in direction_vectors.items():
        distance = cosine_distances([val_vec], [dir_vec])[0][0]
        distances[dim] = distance  # Store under dimension name
    
    direction_score_dict[val] = distances

# Convert the dict to a DataFrame
direction_score_df = pd.DataFrame(direction_score_dict).T.rename_axis('value').reset_index()

In [82]:
direction_score_df.head()

Unnamed: 0,value,speed,optimism,change,holistic,creative
0,delayed gratification,0.912176,0.941448,1.076213,1.073691,0.961719
1,financial,0.870911,0.833084,1.012794,1.134683,0.941402
2,culture,0.954172,0.945998,0.994505,0.966921,0.967974
3,trusty,1.071819,0.933822,0.975642,1.074125,0.821282
4,"family , friends & relationships",0.880755,0.963866,1.057917,1.020028,0.93924


# Individual_Score
**Output**
- individual_score_df (DataFrame): Individual average values in five directions

In [84]:
# Prepare result list
individual_scores = []

# For each individual, aggregate scores across their values reported
# indivdual_values is defined as a dict of {id_ego: [values]} in the previous section
for (pid, name), values in individual_values.items():
    score_sum = {'speed': 0, 'optimism': 0, 'change': 0, 'holistic': 0, 'creative': 0}
    count = 0
    
    for val in values:
        if val in direction_score_dict:
            for dim in score_sum:
                score_sum[dim] += direction_score_dict[val][dim]
            count += 1
    
    # Compute average if count > 0
    if count > 0:
        avg_scores = {dim: score_sum[dim]/count for dim in score_sum}
    else:
        avg_scores = {dim: None for dim in score_sum}  # or use np.nan

    # Add ID and name
    avg_scores['id'] = pid
    avg_scores['name'] = name
    individual_scores.append(avg_scores)

# Step 4: Convert to DataFrame
individual_score_df = pd.DataFrame(individual_scores)

# Reorder columns
cols = ['id', 'name', 'speed', 'optimism', 'change', 'holistic', 'creative']
individual_score_df = individual_score_df[cols]


In [94]:
individual_score_df.head()

Unnamed: 0,id,name,speed,optimism,change,holistic,creative
0,7023.0,Josh Adler,0.920541,1.005041,1.012527,1.021717,0.936188
1,7053.0,Mohit Agarwal,0.952482,0.956239,0.986481,1.023192,0.908079
2,7051.0,Jamie Bergos,0.948666,1.008218,1.066455,1.032304,0.932637
3,7059.0,Jeremy Browning,0.970015,0.995761,1.012033,1.054411,0.921182
4,7059.0,Jeremy Browni+D372,0.982139,0.963243,1.037261,1.082329,0.903842


# Individual Distance
**Output**
- individual_10_distance_df (DataFrame): Individual all values, distance to 10 concepts
- individual_avg_distance_df (DataFrame): Individual average all values, distance to 10 concepts

## Granular Level

In [92]:
# Reshape the distance_df to long format
value_concept_wide = distance_df.melt(id_vars='value', 
                                            var_name='concept', 
                                            value_name='distance')

# Convert the dict to a DataFrame with columns ['id', 'name', 'value']
individual_value_df = pd.DataFrame([
    {'id': pid, 'name': name, 'value': val}
    for (pid, name), values in individual_values.items()
    for val in values
])

# Merge to get distance per value per individual
granular_df = individual_value_df.merge(value_concept_wide, on='value', how='left')

# Pivot into wide format (concepts as columns)
individual_10_distance_df = granular_df.pivot_table(
    index=['id', 'name', 'value'],
    columns='concept',
    values='distance'
).reset_index()
individual_10_distance_df.columns.name = None

In [93]:

individual_10_distance_df.head()

Unnamed: 0,id,name,value,accuracy,analytic,change,creative,detailed,holistic,optimism,persistence,speed,vigilance
0,10001.0,Sevinj Abdullayeva,appreciation,0.758332,0.737472,0.682074,0.677993,0.737746,0.775258,0.619869,0.828005,0.740392,0.648256
1,10001.0,Sevinj Abdullayeva,balance,0.73153,0.774812,0.703486,0.74984,0.781066,0.793019,0.691257,0.678201,0.730251,0.718066
2,10001.0,Sevinj Abdullayeva,family,0.776483,0.869543,0.712907,0.741106,0.703903,0.803564,0.749274,0.712656,0.675549,0.824004
3,10001.0,Sevinj Abdullayeva,happiness,0.878913,0.726232,0.708003,0.631593,0.723332,0.730679,0.460599,0.610629,0.679593,0.678123
4,10001.0,Sevinj Abdullayeva,peace,0.802051,0.816398,0.666284,0.641782,0.675184,0.72838,0.624267,0.735466,0.683757,0.625415


## Average Level

In [95]:
# Group by 'name' and compute mean of each concept column
individual_avg_distance_df = individual_10_distance_df.groupby('name')[all_concepts].mean().reset_index()

# Keep one id per name (if available)
id_lookup = individual_10_distance_df[['name', 'id']].drop_duplicates()

# Merge back into the final profile
individual_avg_distance_df = id_lookup.merge(individual_avg_distance_df, on='name')


In [96]:
individual_avg_distance_df.head()

Unnamed: 0,name,id,accuracy,speed,vigilance,optimism,persistence,change,detailed,holistic,analytic,creative
0,Sevinj Abdullayeva,10001.0,0.777084,0.717548,0.679767,0.62124,0.695295,0.721323,0.734092,0.771458,0.803635,0.712655
1,Elaff Abudawood,10002.0,0.822417,0.781983,0.76466,0.701804,0.742333,0.757479,0.756653,0.813099,0.866933,0.755664
2,Dilbar Ahmadli,10003.0,0.770278,0.733613,0.756028,0.706251,0.666908,0.722812,0.721563,0.804439,0.795864,0.753052
3,Felipe Alvarez,10005.0,0.785766,0.745513,0.701281,0.686291,0.710268,0.743212,0.742882,0.777441,0.821545,0.762805
4,Tatika Catipay,10008.0,0.773343,0.729919,0.782416,0.732539,0.7282,0.700765,0.70776,0.766011,0.79348,0.742489
