In [2]:
# pip install -U sentence-transformers

In [3]:
# pip install tf-keras

In [4]:
# pip install accelerate

In [5]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import cosine_distances
from accelerate import init_empty_weights

2025-04-09 01:45:01.155855: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
df = pd.read_excel('value structure 2022-2024.xlsx')

In [7]:
df.head()

Unnamed: 0,id_ego,Person Name,From,To,NoMeaningfulEdges,Operations,Data
0,7023,Josh Adler,strength,ecstasy,1.0,,2022 A
1,7023,Josh Adler,strength,discovery,1.0,,2022 A
2,7023,Josh Adler,strength,bravery,1.0,,2022 A
3,7023,Josh Adler,strength,calm,1.0,,2022 A
4,7023,Josh Adler,calm,connected to the universe,1.0,,2022 A


# User-Values Dict
**Output**
- individual_values (Dict)

In [8]:
# Data cleaning function
def clean_text(text):
    if pd.isna(text):
        return None
    text = str(text).strip().lower()
    text = re.sub(r'[^a-zA-Z0-9\s\-]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text

# Clean From and To columns
df['From_clean'] = df['From'].apply(clean_text)
df['To_clean'] = df['To'].apply(clean_text)

In [9]:
# Build individual_values dictionary
individual_values = defaultdict(set)

for _, row in df.iterrows():
    pid = str(row['id_ego']).strip()
    name = row['Person Name'].strip()
    key = (pid, name)
    
    from_val = row['From_clean']
    to_val = row['To_clean']
    
    if from_val: individual_values[key].add(from_val)
    if to_val: individual_values[key].add(to_val)

# Convert sets to sorted lists
individual_values = {
    k: sorted(list(v)) for k, v in individual_values.items()
}

In [10]:
# Preview:
for k, v in list(individual_values.items())[:5]:
    print(f"{k}: {v}")

('7023', 'Josh Adler'): ['bravery', 'calm', 'connected to the universe', 'discovery', 'ecstasy', 'growth', 'strength', 'zen']
('7053', 'Mohit Agarwal'): ['empathy', 'family', 'gratitude', 'make things better', 'organization', 'peace', 'professionalism', 'quality']
('7051', 'Jamie Bergos'): ['beauty', 'connection to the universe', 'excellence', 'purpose', 'relationships', 'resilience', 'service', 'strong work ethic']
('7059', 'Jeremy Browning'): ['achievement', 'beauty', 'family', 'health', 'knowledge', 'legacy', 'respect', 'self awareness']
('7059', 'Jeremy Browni+D372'): ['beauty', 'knowledge']


# User Top Values
**Output**
- df_top (DataFrame)

In [11]:
# group by id_ego
def get_top_values(subdf):
    from_set = set(subdf['From'].dropna())
    to_set = set(subdf['To'].dropna())
    top_values = to_set - from_set
    return top_values

df_top = df.groupby('id_ego').apply(get_top_values, include_groups=False).reset_index()
df_top.columns = ['id_ego', 'top_values']
df_top = pd.DataFrame(df_top)

In [12]:
df_top.head()

Unnamed: 0,id_ego,top_values
0,0,"{growth, serenity}"
1,4000,{happiness}
2,4001,{stability}
3,4002,"{joy, achievement}"
4,4003,{inner peace}


# Embeddings
**Output**
- embedding_dict (Dict)
    - value_embeddings
    - concept_embeddings
- embedding_df (DataFrame)

In [13]:
# Pre-defined  10 concepts
all_concepts = [
    'accuracy', 'speed',
    'vigilance', 'optimism',
    'persistence', 'change',
    'detailed', 'holistic',
    'analytic', 'creative'
]

# Organize all values
values = pd.concat([df['From'], df['To']]).dropna().unique()
all_values = list(set(values))

# Combine all values and concepts
all_phrases = list(set(all_values + all_concepts))

# Embed all values and concepts
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(all_phrases, convert_to_tensor=True)

# Create a dictionary of all values and concepts
# Separate embeddings into value and concept embeddings
embedding_dict = dict(zip(all_phrases, embeddings))
value_embeddings = {k: v for k, v in embedding_dict.items() if k in all_values}
concept_embeddings = {k: v for k, v in embedding_dict.items() if k in all_concepts}

# Create a dataframe of all values and concepts
embedding_df = pd.DataFrame(embeddings.numpy(), index=all_phrases).reset_index()
embedding_df = embedding_df.rename(columns={'index': 'phrase'})

In [14]:
embedding_df.head()

Unnamed: 0,phrase,0,1,2,3,4,5,6,7,8,...,374,375,376,377,378,379,380,381,382,383
0,peace of mind,-0.011453,0.070701,-0.0326,-0.00363,0.041072,0.018677,0.176774,-0.073371,0.107859,...,0.03388,0.031685,-0.013952,0.035584,-0.043882,0.03384,0.112274,-0.022876,-0.017094,0.032268
1,moral beauty,0.007168,0.029288,-0.01486,0.008945,-0.077046,-0.00274,0.04724,-0.079291,0.01612,...,-0.012034,-0.093161,0.016545,0.096268,-0.024356,0.04593,0.120234,-0.032516,0.041645,-0.07681
2,excel,-0.005644,0.015732,-0.11616,-0.009402,-0.058412,0.001313,-0.06467,0.050269,0.07175,...,-0.031081,0.042749,-0.019672,-0.004365,0.039386,-0.012806,0.070605,0.036097,0.021164,-0.042385
3,accpetance,-0.036075,-0.030766,0.000713,0.02555,-0.062814,0.001612,0.112981,-0.043054,-0.044765,...,0.097361,-0.010808,0.003573,-0.049972,-0.042772,0.030039,0.06663,-0.025752,0.04735,-0.068449
4,ambition,0.012745,0.135562,-0.009135,0.002142,-0.030367,-0.013612,0.083138,0.01385,-0.030092,...,0.050119,0.004349,0.031667,0.005181,-0.093863,0.012903,0.185886,0.026134,-0.010723,-0.052067


# Cosine_distance
**Output**
- distance_df (DataFrame): value - concept pairs

In [15]:
# Convert dicts to matrix form
value_matrix = np.vstack([value_embeddings[k] for k in all_values])
concept_matrix = np.vstack([concept_embeddings[k] for k in all_concepts])

# Compute cosine distances
distance_matrix = cosine_distances(value_matrix, concept_matrix)

# Create DataFrame with distances (rows = values, columns = concepts)
distance_df = pd.DataFrame(distance_matrix, columns=all_concepts)
distance_df['value'] = all_values

cols = ['value'] + all_concepts
distance_df = distance_df[cols]

In [16]:
distance_df.head()

Unnamed: 0,value,accuracy,speed,vigilance,optimism,persistence,change,detailed,holistic,analytic,creative
0,peace of mind,0.853597,0.753826,0.54378,0.620925,0.752342,0.814947,0.776602,0.608569,0.834659,0.676036
1,moral beauty,0.86355,0.816421,0.696952,0.724049,0.760694,0.861727,0.741287,0.744934,0.733806,0.66432
2,excel,0.809849,0.827289,0.935758,0.902569,0.791582,0.874224,0.847859,0.965972,0.719167,0.89285
3,accpetance,0.725547,0.73749,0.764149,0.802768,0.727325,0.789838,0.839774,0.821624,0.835077,0.827679
4,ambition,0.776569,0.718498,0.81318,0.617909,0.634189,0.723806,0.653104,0.81021,0.730107,0.718026


# Cosine_score (w/ vector)
**Output**
- direction_score_dict (Dict): value - concept vector pairs
- direction_score_df (DataFrame)

In [17]:
# Define direction vectors
direction_defs = {
    'speed': ('accuracy', 'speed'),
    'optimism': ('vigilance', 'optimism'),
    'change': ('persistence', 'change'),
    'holistic': ('detailed', 'holistic'),
    'creative': ('analytic', 'creative'),
}

# Construct direction vectors: direction(dim) = pos - neg
direction_vectors = {
    dim: embedding_dict[pos] - embedding_dict[neg]
    for dim, (neg, pos) in direction_defs.items()
}

In [18]:
# Initialize a dict to store distances per value
direction_score_dict = {}

# Compute cosine distance to each psychological direction
for val in all_values:
    val_vec = embedding_dict[val]  # Get the embedding vector of the value
    
    # For this value, compute its distance to each dimension
    distances = {}
    for dim, dir_vec in direction_vectors.items():
        distance = cosine_distances([val_vec], [dir_vec])[0][0]
        distances[dim] = distance  # Store under dimension name
    
    direction_score_dict[val] = distances

# Convert the dict to a DataFrame
direction_score_df = pd.DataFrame(direction_score_dict).T.rename_axis('value').reset_index()

In [19]:
direction_score_df.head()

Unnamed: 0,value,speed,optimism,change,holistic,creative
0,peace of mind,0.907258,1.070125,1.0524,0.865515,0.873595
1,moral beauty,0.956191,1.024631,1.084566,1.002918,0.944627
2,excel,1.016212,0.969831,1.069171,1.094532,1.138406
3,accpetance,1.011101,1.035105,1.052323,0.985473,0.994105
4,ambition,0.94602,0.822499,1.075009,1.125739,0.990373


# Individual_Score
**Output**
- individual_score_df (DataFrame): Individual average values in five directions

In [20]:
# Prepare result list
individual_scores = []

# For each individual, aggregate scores across their values reported
# indivdual_values is defined as a dict of {id_ego: [values]} in the previous section
for (pid, name), values in individual_values.items():
    score_sum = {'speed': 0, 'optimism': 0, 'change': 0, 'holistic': 0, 'creative': 0}
    count = 0
    
    for val in values:
        if val in direction_score_dict:
            for dim in score_sum:
                score_sum[dim] += direction_score_dict[val][dim]
            count += 1
    
    # Compute average if count > 0
    if count > 0:
        avg_scores = {dim: score_sum[dim]/count for dim in score_sum}
    else:
        avg_scores = {dim: None for dim in score_sum}  # or use np.nan

    # Add ID and name
    avg_scores['id_ego'] = pid
    avg_scores['name'] = name
    individual_scores.append(avg_scores)

# Step 4: Convert to DataFrame
individual_score_df = pd.DataFrame(individual_scores)

# Reorder columns
cols = ['id_ego', 'name', 'speed', 'optimism', 'change', 'holistic', 'creative']
individual_score_df = individual_score_df[cols]


In [21]:
individual_score_df.head()

Unnamed: 0,id_ego,name,speed,optimism,change,holistic,creative
0,7023,Josh Adler,0.920541,1.005041,1.012527,1.021717,0.936188
1,7053,Mohit Agarwal,0.952482,0.956239,0.986481,1.023192,0.908079
2,7051,Jamie Bergos,0.948666,1.008218,1.066455,1.032304,0.932637
3,7059,Jeremy Browning,0.970015,0.995761,1.012033,1.054411,0.921182
4,7059,Jeremy Browni+D372,0.982139,0.963243,1.037261,1.082329,0.903842


# Individual Distance
**Output**
- individual_10_distance_df (DataFrame): Individual all values, distance to 10 concepts
- individual_avg_distance_df (DataFrame): Individual average all values, distance to 10 concepts

## Granular Level

In [22]:
# Reshape the distance_df to long format
value_concept_wide = distance_df.melt(id_vars='value', 
                                            var_name='concept', 
                                            value_name='distance')

# Convert the dict to a DataFrame with columns ['id', 'name', 'value']
individual_value_df = pd.DataFrame([
    {'id_ego': pid, 'name': name, 'value': val}
    for (pid, name), values in individual_values.items()
    for val in values
])

# Merge to get distance per value per individual
granular_df = individual_value_df.merge(value_concept_wide, on='value', how='left')

# Pivot into wide format (concepts as columns)
individual_10_distance_df = granular_df.pivot_table(
    index=['id_ego', 'name', 'value'],
    columns='concept',
    values='distance'
).reset_index()
individual_10_distance_df.columns.name = None

In [23]:

individual_10_distance_df.head()

Unnamed: 0,id_ego,name,value,accuracy,analytic,change,creative,detailed,holistic,optimism,persistence,speed,vigilance
0,0,Ashley Sohier,communication,0.756391,0.799477,0.743548,0.727627,0.624526,0.787728,0.748572,0.698159,0.643526,0.733189
1,0,Ashley Sohier,dependability,0.799066,0.85542,0.798608,0.827389,0.78208,0.809903,0.808237,0.622948,0.850287,0.801156
2,0,Ashley Sohier,discipline,0.8348,0.742903,0.77028,0.809655,0.723762,0.706879,0.777401,0.569645,0.751944,0.656388
3,0,Ashley Sohier,family,0.776483,0.869543,0.712907,0.741106,0.703903,0.803564,0.749274,0.712656,0.675549,0.824004
4,0,Ashley Sohier,fun,0.815959,0.859714,0.732217,0.533035,0.630445,0.87541,0.721859,0.750071,0.760564,0.810578


## Average Level

In [24]:
# Group by 'name' and compute mean of each concept column
individual_avg_distance_df = individual_10_distance_df.groupby('name')[all_concepts].mean().reset_index()

# Keep one id per name (if available)
id_lookup = individual_10_distance_df[['name', 'id_ego']].drop_duplicates()

# Merge back into the final profile
individual_avg_distance_df = id_lookup.merge(individual_avg_distance_df, on='name')

In [25]:
individual_avg_distance_df.head()

Unnamed: 0,name,id_ego,accuracy,speed,vigilance,optimism,persistence,change,detailed,holistic,analytic,creative
0,Ashley Sohier,0,0.775404,0.733958,0.758456,0.752323,0.65463,0.748879,0.706973,0.796054,0.804406,0.7494
1,Tiffany Yao,0,0.79399,0.764941,0.680559,0.667036,0.696583,0.756278,0.71163,0.778385,0.828297,0.723166
2,Sevinj Abdullayeva,10001,0.777084,0.717548,0.679767,0.62124,0.695295,0.721323,0.734092,0.771458,0.803635,0.712655
3,Elaff Abudawood,10002,0.822417,0.781983,0.76466,0.701804,0.742333,0.757479,0.756653,0.813099,0.866933,0.755664
4,Dilbar Ahmadli,10003,0.770278,0.733613,0.756028,0.706251,0.666908,0.722812,0.721563,0.804439,0.795864,0.753052


In [26]:
individual_score_df.to_excel('individual_score_df.xlsx', index=False)
individual_avg_distance_df.to_excel('individual_avg_distance_df.xlsx', index=False)
individual_10_distance_df.to_excel('individual_10_distance_df.xlsx', index=False)

# Data Construction

In [27]:
df_focus_score = pd.read_excel('Scores_with variables.xls', engine='xlrd')
df_focus_score['id_ego'] = df_focus_score['id_ego'].astype('Int64').astype(str)

In [28]:
df_focus_score

Unnamed: 0,id_ego,SelfMonitoring,Life_Satisfaction_6pt,Growth_mindset_7pt_scale,Individuation,Curiousity,Generosity,promotion,prevention,assessment,...,consumer,retail,tech,other,educ,entertain,energy,health,manuf,nodes
0,1,11.0,4.0,4.000,,,,3.000000,2.0,3.750000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0
1,2,10.0,4.8,5.000,,,,4.833333,3.2,4.666667,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0
2,3,12.0,4.6,2.875,,,,3.333333,2.2,4.083333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0
3,4,10.0,5.0,4.125,,,,4.833333,5.0,4.750000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
4,5,10.0,5.0,4.250,,,,4.000000,2.6,3.833333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1466,1044,,,,,,,,,,...,,,,,,,,,,
1467,6038,,,,,,,,,,...,,,,,,,,,,
1468,8025,,,,,,,,,,...,,,,,,,,,,
1469,8045,,,,,,,,,,...,,,,,,,,,,


In [29]:
# Filter out scores in 5 vectors
vector_cols = ['id_ego', 'speed', 'optimism', 'change', 'holistic', 'creative']
individual_score_subset = individual_score_df[vector_cols].copy()
# Rename columns
rename_dict = {col: col + '_vec' for col in vector_cols if col != 'id_ego'}
individual_score_subset = individual_score_subset.rename(columns=rename_dict)

individual_avg_distance_subset = individual_avg_distance_df[['id_ego'] + all_concepts]

# Merge
df_merged_1 = df_focus_score.merge(individual_score_subset, on='id_ego', how='left')
df_merged_2 = df_merged_1.merge(individual_avg_distance_subset, on='id_ego', how='left')

In [30]:
# Drop rows with missing values
df_clean = df_merged_2.dropna(how='any')

In [31]:
df_clean

Unnamed: 0,id_ego,SelfMonitoring,Life_Satisfaction_6pt,Growth_mindset_7pt_scale,Individuation,Curiousity,Generosity,promotion,prevention,assessment,...,accuracy,speed,vigilance,optimism,persistence,change,detailed,holistic,analytic,creative
535,4000,8.0,4.8,4.875,3.500000,4.714286,3.7,3.833333,2.2,3.833333,...,0.798162,0.712501,0.773717,0.709781,0.676989,0.742607,0.733054,0.784665,0.805460,0.746820
536,4001,10.0,4.6,3.875,3.583333,5.571429,4.7,3.833333,3.6,5.083333,...,0.765609,0.735687,0.753943,0.725729,0.703400,0.758907,0.728888,0.798807,0.805742,0.761606
537,4002,8.0,4.4,4.625,3.166667,4.428571,4.6,3.666667,4.0,4.333333,...,0.726533,0.713397,0.741475,0.646932,0.692746,0.717902,0.652077,0.782188,0.786868,0.698221
538,4003,2.0,4.6,4.625,4.083333,5.000000,4.9,3.166667,3.8,4.166667,...,0.763105,0.755187,0.692544,0.674107,0.696069,0.744200,0.739043,0.744519,0.810477,0.756756
539,4004,4.0,4.0,4.500,2.500000,5.142857,4.3,3.166667,2.8,4.666667,...,0.763733,0.739637,0.726256,0.670759,0.692266,0.709442,0.736116,0.753819,0.826426,0.723021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1466,10619,5.0,5.0,3.125,3.166667,4.285714,2.9,4.333333,4.0,3.833333,...,0.708282,0.708376,0.778933,0.736508,0.664259,0.719440,0.675211,0.799477,0.792339,0.734821
1467,10620,10.0,4.6,4.625,3.083333,5.714286,4.8,3.333333,3.6,4.166667,...,0.769347,0.706958,0.732793,0.704205,0.669723,0.703219,0.708633,0.782233,0.786002,0.668651
1468,10621,15.0,4.2,4.625,4.250000,4.285714,5.3,3.833333,3.0,3.750000,...,0.791547,0.733122,0.726681,0.682142,0.710230,0.737874,0.695567,0.793155,0.816660,0.750929
1469,10622,10.0,2.4,6.000,3.166667,4.000000,5.7,3.500000,2.8,3.583333,...,0.763338,0.760387,0.758020,0.692742,0.681590,0.752285,0.717610,0.768335,0.829526,0.744938


In [32]:
df_clean.to_excel('df_clean.xlsx', index=False)

In [33]:
df_clean.columns

Index(['id_ego', 'SelfMonitoring', 'Life_Satisfaction_6pt',
       'Growth_mindset_7pt_scale', 'Individuation', 'Curiousity', 'Generosity',
       'promotion', 'prevention', 'assessment', 'locomotion',
       'integrated_worlds', 'female', 'black', 'asian', 'hispanic', 'native',
       'profserv', 'advmktg', 'nonprof', 'realestate', 'consulting', 'govt',
       'consumer', 'retail', 'tech', 'other', 'educ', 'entertain', 'energy',
       'health', 'manuf', 'nodes', 'speed_vec', 'optimism_vec', 'change_vec',
       'holistic_vec', 'creative_vec', 'accuracy', 'speed', 'vigilance',
       'optimism', 'persistence', 'change', 'detailed', 'holistic', 'analytic',
       'creative'],
      dtype='object')