In [41]:
!pip install sentence-transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [43]:
import os
import random
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util, CrossEncoder
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Tokenizer for sentence summary
tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")

In [44]:
# sentence transformer for similarity score
sentences = ["I am happy", "The world is so ugly"]
model1 = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embedding_1= model1.encode(sentences[0])
embedding_2 = model1.encode(sentences[1])

similarities = model1.similarity(embedding_1, embedding_2)
print(f"{similarities[0][0]:.4f}")

0.1630


In [45]:
# Cross encoder for determing 反对\中立\支持
model2 = CrossEncoder('cross-encoder/nli-distilroberta-base')
score = model2.predict([('I am sad', 'I am sooo happy')])
print(score)

[[ 4.6733823 -2.8701892 -2.367224 ]]


# Load data (String manipulation)

In [48]:
test_data = pd.read_csv('test_revise4.csv')

In [52]:
questions = {
 'q1': 'Please share all of the reasons you chose to attend a TAPS seminar.',
 'q2': 'How did you find out about the TAPS seminar?',
 'q3': 'This seminar helped me to feel socially connected',
 'q4': 'This seminar taught me new ways to cope with my grief',
 'q5': 'This seminar helped me to better understand my grief',
 'q6': 'This seminar has given me hope for the future',
 'q7': 'Based on your experience at the TAPS seminar, how likely are you to continue to connect with TAPS virtually / at in-person programs?',
 'q8': 'Based on your experience at the TAPS seminar, how likely are you to continue to bring your child(ren) to connect with TAPS at in-person programs?',
 'q9': 'Rate your level of agreement: I have a positive outlook toward life',
 'q10': 'Rate your level of agreement: I have short and/or long range goals',
 'q11': 'Rate your level of agreement: I feel all alone',
 'q12': 'Rate your level of agreement: I can see possibilities in the midst of difficulties',
 'q13': 'Rate your level of agreement: I have faith that gives me comfort',
 'q14': 'Rate your level of agreement: I feel scared about my future',
 'q15': 'Rate your level of agreement: I can recall happy/joyful times',
 'q16': 'Rate your level of agreement: I have deep inner strength',
 'q17': 'Rate your level of agreement: I am able to give and receive care/love',
 'q18': 'Rate your level of agreement: I have a sense of direction',
 'q19': 'Rate your level of agreement: I believe that each day has potential',
 'q20': 'Rate your level of agreement: I feel my life has value and worth',
 'q21': 'Please share with TAPS your favorite moment of the weekend? Did you have a breakthrough moment this weekend you would like to share?',
 'q22': 'Please share any additional feedback you have regarding your TAPS Seminar experience.'
}

In [54]:
rating_reference_1 = {
    0: "Extremely likely / Very likely",
    1: "Somewhat likely",
    2: "Neither likely nor unlikely / Neutral",
    3: "Somewhat unlikely",
    4: "Extremely unlikely / Not at all likely"
}

rating_reference_2 = {
    0: "Strongly Disagree",
    1: "Disagree",
    2: "Agree",
    3: "Strongly Agree"
}

rating_reference_3 = {
    0:'Disagree',
    1:'Neither agree nor disagree',
    2:'Agree',
}


In [56]:
# Create a mapping of questions to rating references
question_rating_map = {}

# Likelihood-based questions (using rating_reference_1)
likelihood_questions = ['q7', 'q8']
for q in likelihood_questions:
    question_rating_map[q] = rating_reference_1

# Agreement-based questions (using rating_reference_2)
agreement_questions = [f'q{i}' for i in range(3, 21)]
for q in agreement_questions:
    question_rating_map[q] = rating_reference_2

# Open-ended questions (no rating reference)
open_ended_questions = ['q1', 'q2', 'q21', 'q22']
for q in open_ended_questions:
    question_rating_map[q] = 'Open-ended'

# Display the mapping
#for q, ref in question_rating_map.items():
#    print(f"{questions[q]}: {ref}")

In [58]:
journey_map = {
    "Immediate Grief, Shock & Emotion": {
        "number": 1,
        "description": "Overwhelmed, loss of purpose; shock and trauma emotions (isolation) present and challenging to understand. Individuals may struggle to deal with family responsibilities alone. Surviving Child: Feeling disconnected without guidance and attention from grieving adults.",
        "support_elements": "Mission critical. 24/7 National Military Survivor Helpline. Ongoing Survivor Care Team."
    },
    "Navigating Family Relationships": {
        "number": 2,
        "description": "Experiencing tension between individuals within the family unit; lack of support from family members. Surviving Family Unit: Perception of other family members’ grief experience. Each family member may be at different phases of their grief journey.",
        "support_elements": "Guidance & Acknowledgement. Catered resources."
    },
    "Learning to Process Grief": {
        "number": 3,
        "description": "Experiencing grief and learning to process those emotions. Surviving Child: Seeks guidance and acknowledgment of grief; benefit from opportunities to open up and process with kids in similar situations to normalize emotions.",
        "support_elements": "Community support. Safe space to process emotions."
    },
    "Moments That Matter": {
        "number": 4,
        "description": "Renewed experience of grief around anniversaries of loss, holidays, and special moments. Surviving Family Unit: Navigating special moments (sports, school achievements, moments that matter).",
        "support_elements": "Continued grief support."
    },
    "Feeling Immersed, Connected & Seen": {
        "number": 5,
        "description": "Finding new purpose and goals to begin moving towards Positive Integration. Surviving Family Unit: Connected to a broader community; support system; not the only person/family experiencing loss.",
        "support_elements": "Maintain community. Safe space to process emotions."
    },
    "New Growth & Purpose": {
        "number": 6,
        "description": "Healthy point in grief journey; feeling capable to help others and a desire to do so. Surviving Family Unit: Ready to give back to the TAPS community through mentorship programs, volunteering at charity drives & events, etc.",
        "support_elements": "Maintain family connection/healing. Desire to support others."
    }
}

In [60]:
categories = []
for stage in journey_map:
  categories.append(f"{stage}:" + journey_map[stage]['description'])
print(categories)

['Immediate Grief, Shock & Emotion:Overwhelmed, loss of purpose; shock and trauma emotions (isolation) present and challenging to understand. Individuals may struggle to deal with family responsibilities alone. Surviving Child: Feeling disconnected without guidance and attention from grieving adults.', 'Navigating Family Relationships:Experiencing tension between individuals within the family unit; lack of support from family members. Surviving Family Unit: Perception of other family members’ grief experience. Each family member may be at different phases of their grief journey.', 'Learning to Process Grief:Experiencing grief and learning to process those emotions. Surviving Child: Seeks guidance and acknowledgment of grief; benefit from opportunities to open up and process with kids in similar situations to normalize emotions.', 'Moments That Matter:Renewed experience of grief around anniversaries of loss, holidays, and special moments. Surviving Family Unit: Navigating special moment

# Example - Inference with Cross Encoder

In [63]:
# Initialize DataFrame
submission = pd.DataFrame(columns=['Id', 'Category', 'Sim_Score'])
Specific_ids= []
ids = []
category_nums = []
sim_scores = []

# Iterate over survey data
for survey_num in range(0, len(test_data)):
    answer = ""
    
    # Process different groups of questions
    for i in range(3, 7):
        key = f'q{i}'
        if key in questions and key in test_data.columns:
            try:
                value = test_data.loc[survey_num][key]
                if pd.notna(value):  # Check for NaN
                    answer += questions[key].split('agreement: ')[-1] + ": " + rating_reference_3[value] + "\n"
            except KeyError:
                print(f"KeyError at question {key} for survey {survey_num}")
                continue

    for i in range(7, 9):
        key = f'q{i}'
        if key in questions and key in test_data.columns:
            try:
                value = test_data.loc[survey_num][key]
                if pd.notna(value):
                    answer += questions[key].split('agreement: ')[-1] + ": " + rating_reference_1[value] + "\n"
            except KeyError:
                print(f"KeyError at question {key} for survey {survey_num}")
                continue

    for i in range(9, 21):
        key = f'q{i}'
        if key in questions and key in test_data.columns:
            try:
                value = test_data.loc[survey_num][key]
                if pd.notna(value):
                    answer += questions[key].split('agreement: ')[-1] + ": " + rating_reference_2[value] + "\n"
            except KeyError:
                print(f"KeyError at question {key} for survey {survey_num}")
                continue

    for i in range(21, 23):
        key = f'q{i}'
        if key in questions and key in test_data.columns:
            try:
                value = test_data.loc[survey_num][key]
                if pd.notna(value):
                    answer += questions[key] + ": " + str(value)
            except KeyError:
                print(f"KeyError at question {key} for survey {survey_num}")
                continue

    # Finalize the answer string
    result = answer.strip()
    ARTICLE_TO_SUMMARIZE = result

    # Tokenize input
    inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, truncation=True, return_tensors="pt")

    # Generate Summary
    summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=0)
    result = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    
    # Predict similarity score for each category
    for index, category in enumerate(categories):
        score = model2.predict([(result, category)])
        sim_score = score[0][2]  # Get the similarity score
        
        # Store results in lists
        Specific_ids.append(test_data.loc[survey_num]["Id"])
        ids.append(survey_num)
        category_nums.append(index + 1)  # Convert category to 1-6
        sim_scores.append(sim_score)

# Add results to DataFrame
submission['Specific_ids'] = Specific_ids
submission['Id'] = ids
submission['Category'] = category_nums
submission['Sim_Score'] = sim_scores

# Display the DataFrame
submission.head(15)


Unnamed: 0,Id,Category,Sim_Score,Specific_ids
0,0,1,1.426547,a2c83973-8e81-45c9-bb84-37fa8a8b637c
1,0,2,1.279136,a2c83973-8e81-45c9-bb84-37fa8a8b637c
2,0,3,1.441667,a2c83973-8e81-45c9-bb84-37fa8a8b637c
3,0,4,2.100202,a2c83973-8e81-45c9-bb84-37fa8a8b637c
4,0,5,1.35185,a2c83973-8e81-45c9-bb84-37fa8a8b637c
5,0,6,1.816162,a2c83973-8e81-45c9-bb84-37fa8a8b637c
6,1,1,1.001807,78bca2d4-8824-45ed-80c9-72ef0e4389c6
7,1,2,0.504863,78bca2d4-8824-45ed-80c9-72ef0e4389c6
8,1,3,0.720654,78bca2d4-8824-45ed-80c9-72ef0e4389c6
9,1,4,0.881815,78bca2d4-8824-45ed-80c9-72ef0e4389c6


In [64]:
# Ensure 'Category' column exists
if 'Category' in submission.columns:
    # Create 6 new columns, one for each category's similarity score
    submission_pivot = submission.pivot_table(
        index='Id',
        columns='Category',
        values='Sim_Score',
        aggfunc='first'
    )

    # Rename columns from 1-6 to 'Category_1' to 'Category_6'
    submission_pivot.columns = [f'Category_{int(col)}' for col in submission_pivot.columns]

    # Reset index to make 'Id' a normal column
    submission_pivot.reset_index(inplace=True)

    # Save to CSV
    submission_pivot.to_csv('updated_submission_pivot.csv', index=False)

    # Display the updated DataFrame
    print(submission_pivot.head())
else:
    print("The 'Category' column is not available in the DataFrame.")

   Id  Category_1  Category_2  Category_3  Category_4  Category_5  Category_6
0   0    1.426547    1.279136    1.441667    2.100202    1.351850    1.816162
1   1    1.001807    0.504863    0.720654    0.881815    0.933611    0.621461
2   2    1.587107    1.585799    0.901668    1.502268    1.113635    1.397816
3   3    1.884995    2.350067    1.511173    2.449934    1.801809    1.623226
4   4    1.643568    1.332253    1.704573    1.719702    1.823390    1.604861


In [65]:
submission.to_csv('submission.csv', index=False)