In [1]:
%pip install -U sentence-transformers





[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import sys
print(sys.executable)

c:\Users\manav\AppData\Local\Programs\Python\Python311\python.exe


In [3]:
import pandas as pd
import json
from sentence_transformers import SentenceTransformer, util

In [4]:
json_path = "../data/raw/initial_word_list.json"

# Load JSON
with open(json_path, "r", encoding="utf-8") as f:
    word_data = json.load(f)

# Extract into lists
hypo_words = word_data["hypo"]
hyper_words = word_data["hyper"]
flow_words = word_data["flow"]

print("Hypo words:", hypo_words)
print("Hyper words:", hyper_words)
print("Flow words:", flow_words)


Hypo words: ['numb', 'frozen', 'empty', 'heavy', 'alone', 'lonely', 'disconnected', 'hopeless', 'despair', 'invisible', 'withdrawn', 'dissociated', 'tired', 'faint', 'passive', 'foggy', 'apathetic', 'i can’t…', 'what’s the point?', 'low', 'weak', 'detached', 'spaced-out', 'slow', 'still', 'muted', 'distant', 'blank', 'vacant', 'shut down', 'fatigued', 'unmotivated', 'sluggish', 'dull', 'uninterested', 'silent', 'indifferent', 'lifeless', 'exhausted', 'unresponsive', 'powerless', 'isolated', 'lacking energy', 'collapsed', 'drained', 'flat', 'lack of will', 'checked out', "can't move", 'low-spirited', 'sad', 'unfeeling', 'quiet', 'lethargic', 'weary', 'drained', 'dull', 'downcast', 'unmotivated', 'listless', 'lifeless', 'stagnant', 'disconnectedness']
Hyper words: ['anxious', 'angry', 'panicked', 'overwhelmed', 'restless', 'tight', 'racing', 'agitated', 'frustrated', 'tense', 'unsafe', 'defensive', 'rigid', 'chaotic', 'scattered', 'i have to…', 'i’m not safe', 'something bad will happen'

In [5]:
anchor_words = {
    "hypo": ["numb", "disconnected", "hopeless"],
    "hyper": ["anxious", "agitated", "unsafe"],
    "flow": ["calm", "grounded", "safe"]
}

In [6]:
# Load model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to compute weight
def get_weight(word, state, is_anchor=False):
    if is_anchor:
        return 1.0
    
    # Encode anchors & target word
    anchors_vec = model.encode(anchor_words[state], convert_to_tensor=True)
    word_vec = model.encode([word], convert_to_tensor=True)
    
    # Calculate cosine similarities
    sims = util.cos_sim(word_vec, anchors_vec)[0]
    avg_sim = float(sims.mean())  # average similarity to all anchors
    
    # first scaling formula : to get weights in range 0.75 to 0.95 
    scaled_weight = 0.75 + (avg_sim * 0.20)
    return round(scaled_weight,2)
    # Problem with scaling formula 1: not enough variation in weights

    # second scaling formula : assign weight directly to the cosine similarity value
    # weight = max(0.3, avg_sim)
    # return round(weight, 2)

# Build lexicon with weights
lexicon = []
for w in hypo_words:
    lexicon.append({"word": w.lower(), "state": "hypo", "weight": get_weight(w, "hypo")})
for w in hyper_words:
    lexicon.append({"word": w.lower(), "state": "hyper", "weight": get_weight(w, "hyper")})
for w in flow_words:
    lexicon.append({"word": w.lower(), "state": "flow", "weight": get_weight(w, "flow")})

# Save to CSV
df = pd.DataFrame(lexicon)
df.to_csv("../data/processed/dataset_with_weights.csv", index=False)
print(df.head())


     word state  weight
0    numb  hypo    0.86
1  frozen  hypo    0.82
2   empty  hypo    0.83
3   heavy  hypo    0.80
4   alone  hypo    0.82


In [7]:
# Load your long-format lexicon
df = pd.read_csv("../data/processed/dataset_with_weights.csv")  # columns: word, state, weight

# Pivot into wide format
df_wide = df.pivot_table(index="word", columns="state", values="weight", fill_value=0).reset_index()

# Ensure all three columns exist
for col in ["hypo", "hyper", "flow"]:
    if col not in df_wide.columns:
        df_wide[col] = 0.0

# Reorder columns
df_wide = df_wide[["word", "hypo", "hyper", "flow"]]

# Save to new CSV
df_wide.to_csv("../data/processed/lexicon_for_sentences.csv", index=False)

print(df_wide.head())


state        word  hypo  hyper  flow
0       accepting   0.0   0.00  0.80
1       adaptable   0.0   0.00  0.79
2      aggressive   0.0   0.84  0.00
3        agitated   0.0   0.88  0.00
4         alarmed   0.0   0.86  0.00
