In [None]:
from IPython.display import HTML, display

def set_css(*_event_args):
  """Emit a <style> block that makes <pre> elements wrap long lines.

  Meant to be registered as an IPython ``pre_run_cell`` callback. IPython
  passes an execution-info object to such callbacks; it is accepted and
  ignored here so the hook works whether or not the installed IPython
  adapts zero-argument callbacks. Calling ``set_css()`` with no arguments
  still works.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
# Run set_css before every cell execution.
get_ipython().events.register('pre_run_cell', set_css)

# Load Dataset

In [None]:
# Function to load dataset
import pandas as pd
path = "/content/drive/MyDrive/github_LEAF_LAB/survey_analysis/SMILE-College Dataset/smile-college-dataset.csv"
def load_dataset(path):
  """Read the CSV located at ``path`` and return it as a pandas DataFrame."""
  return pd.read_csv(path)

# Load the survey export, discard rows labelled 'SKIP', drop the 'Unnamed: 0'
# column (presumably an index column saved with the CSV - TODO confirm) and
# renumber the remaining rows from 0.
data_df = load_dataset(path)
data_df = (
    data_df[data_df['Validated_Labels'] != 'SKIP']
    .drop(columns=['Unnamed: 0'])
    .reset_index(drop=True)  # drop=True: do not keep the old index as a column
)


In [None]:
# Summary statistics (count / unique / top / freq) of the filtered frame.
data_df.describe()

Unnamed: 0,School,comment,Validated_Labels
count,793,793,793
unique,256,793,4
top,Arizona State,we need more wellness days and the university ...,DISSATISFIED
freq,100,1,376


In [None]:
# (rows, columns) of the filtered frame.
data_df.shape

(793, 3)

# Limitation Analysis

In [None]:
import json
import pandas as pd

path = "/content/drive/MyDrive/github_LEAF_LAB/survey_analysis/prompt_results/SAVE_limitations_of_dissatisfied_Gpt3.5_.json"
# Read explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII text.
with open(path, encoding="utf-8") as f:
  data = json.load(f)

# Keep only the 'list' value of every entry in the loaded JSON.
sentences = [d['list'] for d in data]

In [None]:
# Split every response into its individual lines.
records = [response.split('\n') for response in sentences]

# Merge the per-response line lists into a single flat list.
flattened_records = [entry for group in records for entry in group]

# Trim whitespace from both ends of each entry.
cleaned_records = list(map(str.strip, flattened_records))

In [None]:
def preprocess_records(dataset):
    """Normalise a sequence of text lines into clean, non-empty records.

    Each line is stripped of surrounding whitespace. A leading bullet marker
    (a hyphen followed by a space, or a hyphen on its own) is removed, and
    the remainder is stripped again so that extra spaces after the bullet do
    not survive. Lines that end up empty are dropped.

    Args:
        dataset: Iterable of strings.

    Returns:
        A new list of cleaned, non-empty strings in the original order.
    """
    cleaned_records = []
    for line in dataset:
        cleaned_line = line.strip()
        # Treat "- text" and a bare "-" as bullets; "-text" is left untouched
        # because the hyphen may be part of the content.
        if cleaned_line == "-" or cleaned_line.startswith("- "):
            # Strip again: "-   text" would otherwise keep leading spaces.
            cleaned_line = cleaned_line[1:].strip()
        if cleaned_line:
            cleaned_records.append(cleaned_line)
    return cleaned_records

# Remove bullet prefixes and empty entries from the stripped records.
cleaned_limitations = preprocess_records(cleaned_records)

In [None]:
# Number of records left after cleaning.
len(cleaned_limitations)

646

In [1]:
!pip install sentence-transformers

In [2]:
from sentence_transformers import SentenceTransformer
import numpy as np

# No explicit device: sentence-transformers then uses a GPU when one is
# available and falls back to CPU otherwise. Hard-coding 'cuda' fails on a
# CPU-only runtime.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
from sentence_transformers import util

# Theme labels that every cleaned limitation phrase is compared against.
questions = ['Availability and Accessibility',
             'Quality of Counseling Services',
             'Diversity and Inclusivity',
             'Financial and Administrative Concerns',
             'Awareness and Education',
             'Personal Experiences and Preferences',
             'Issues with Therapist Matching',
             'Challenges in accessing the services',
             'Issues with Referrals and Redirection',
             'Inadequacies in support, communication, and community connection']

# Encode the labels and all phrases with one batched call each; encoding one
# phrase per call inside a Python loop repeats the call overhead every time.
question_embedding = model.encode(questions)
sentence_embeddings = model.encode(cleaned_limitations)

# cos_sim normalises both inputs, so the values are cosine similarities even
# for a model whose embeddings are not unit-length (dot_score would not be).
# The matrix has one row per phrase and one column per theme label.
if len(cleaned_limitations) > 0:
  score_rows = util.cos_sim(sentence_embeddings, question_embedding).tolist()
else:
  score_rows = []  # nothing to score; avoids comparing against an empty array

# One record per phrase: its text followed by its score for each theme,
# in the same order as `questions`.
scores = [
    {'sentence': sentence, **dict(zip(questions, row))}
    for sentence, row in zip(cleaned_limitations, score_rows)
]

In [None]:
import pandas as pd
# One row per entry of `scores`: the sentence text plus one score column per theme label.
df = pd.DataFrame(scores)
df.head()


Unnamed: 0,sentence,Availability and Accessibility,Quality of Counseling Services,Diversity and Inclusivity,Financial and Administrative Concerns,Awareness and Education,Personal Experiences and Preferences,Issues with Therapist Matching,Challenges in accessing the services,Issues with Referrals and Redirection,"Inadequacies in support, communication, and community connection"
0,Expansion,0.190344,0.018056,0.22082,0.080844,0.130927,0.081469,0.034342,0.090436,0.033041,0.101884
1,Not enough,0.071917,0.096243,0.093832,0.196758,0.132237,0.132814,0.107378,0.067014,0.052719,0.164234
2,tired of remote,0.211137,0.128121,0.135541,0.027907,0.068293,0.140476,0.007316,0.233009,0.060083,0.1509
3,don't want it simply because it's remote,0.072037,0.04895,0.066758,-0.037097,0.049203,0.108549,-0.021899,0.086713,-0.030307,-0.008646
4,diversity needs to increase for the counselors...,0.210227,0.513994,0.558333,0.135282,0.275883,0.180867,0.374767,0.155488,0.073627,0.316402


In [None]:
# Every column after the first one ('sentence') holds a score for one theme label.
numerical_columns = df.columns[1:]
numerical_columns

Index(['Availability and Accessibility', 'Quality of Counseling Services',
       'Diversity and Inclusivity', 'Financial and Administrative Concerns',
       'Awareness and Education', 'Personal Experiences and Preferences',
       'Issues with Therapist Matching',
       'Challenges in accessing the services',
       'Issues with Referrals and Redirection',
       'Inadequacies in support, communication, and community connection'],
      dtype='object')

In [None]:
# For each row keep only its two highest scores; every other cell of the
# resulting frame is NaN. Computed from the raw scores, so this must run
# before the score columns of `df` are overwritten with 0/1 flags.
top_2_max = df[numerical_columns].apply(
    lambda row_scores: row_scores.sort_values(ascending=False).iloc[:2],
    axis=1,
)

In [None]:
# Turn the score columns into 0/1 flags: a cell becomes 1 when it equals the
# maximum of its row and 0 otherwise. This overwrites the scores in `df`.
# (Flagging the two highest scores per row via `top_2_max` was considered
# but is not applied.)
df[numerical_columns] = (
    df[numerical_columns]
    .eq(df[numerical_columns].max(axis=1), axis=0)
    .astype(int)
)

df


Unnamed: 0,sentence,Availability and Accessibility,Quality of Counseling Services,Diversity and Inclusivity,Financial and Administrative Concerns,Awareness and Education,Personal Experiences and Preferences,Issues with Therapist Matching,Challenges in accessing the services,Issues with Referrals and Redirection,"Inadequacies in support, communication, and community connection"
0,Expansion,0,0,1,0,0,0,0,0,0,0
1,Not enough,0,0,0,1,0,0,0,0,0,0
2,tired of remote,0,0,0,0,0,0,0,1,0,0
3,don't want it simply because it's remote,0,0,0,0,0,1,0,0,0,0
4,diversity needs to increase for the counselors...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
641,need more sessions,0,1,0,0,0,0,0,0,0,0
642,no resources,1,0,0,0,0,0,0,0,0,0
643,help students with eating disorders,0,0,0,0,1,0,0,0,0,0
644,need more wellness days,1,0,0,0,0,0,0,0,0,0


In [None]:
# Tally the 1-flags per theme column (everything after the first column);
# summing a frame column-wise is the default direction of DataFrame.sum.
count_ones = df.iloc[:, 1:].sum()

# Show the per-theme totals.
print(count_ones)

Availability and Accessibility                                       76
Quality of Counseling Services                                      157
Diversity and Inclusivity                                            22
Financial and Administrative Concerns                                48
Awareness and Education                                              73
Personal Experiences and Preferences                                 52
Issues with Therapist Matching                                       65
Challenges in accessing the services                                 76
Issues with Referrals and Redirection                                13
Inadequacies in support, communication, and community connection     64
dtype: int64
