In [1]:
import pandas as pd
import os

directory_path = '../data/transcripts_from_whisper/'

transcripts_df = pd.DataFrame(columns=["id", "text"])

for filename in os.listdir(directory_path):
    if filename.endswith(".txt"):
        file_id = filename[:3]
        with open(os.path.join(directory_path, filename), "r") as file:
            file_contents = file.read()
        transcripts_df = transcripts_df.append({"id": file_id, "text": file_contents}, ignore_index=True)

transcripts_df["id"] = transcripts_df["id"].astype("int64")
print(transcripts_df.head())

labels_dev_df = pd.read_csv('../data/labels/dev_split.csv')
labels_dev_df.columns = ['id', 'Gender', 'PHQ_Binary', 'PHQ_Score', 'PCL-C (PTSD)', 'PTSD Severity']

labels_train_df = pd.read_csv('../data/labels/train_split.csv')
labels_train_df.columns = ['id', 'Gender', 'PHQ_Binary', 'PHQ_Score', 'PCL-C (PTSD)', 'PTSD Severity']

labels_test_df = pd.read_csv('../data/labels/test_split.csv')
labels_test_df.columns = ['id', 'Gender', 'PHQ_Binary', 'PHQ_Score', 'PCL-C (PTSD)', 'PTSD Severity']

# merge dataframes on id
df_dev = pd.merge(labels_dev_df, transcripts_df, on='id')
df_train = pd.merge(labels_train_df, transcripts_df, on='id')
df_test = pd.merge(labels_test_df, transcripts_df, on='id')

df_dev = df_dev.sort_values(by="id")
df_train = df_train.sort_values(by="id")
df_test = df_test.sort_values(by="id")

df_dev.head()

    id                                               text
0  322   I'm going to put the doorbell right next to y...
1  710   I think we're good so far. One quick thing, i...
2  602   This is super neat. I like this. Me either at...
3  352   IA leaflet? Hi, how are you? My name is Kim. ...
4  415   Okay? Hi, I'm Ellen. Thanks for coming in tod...


  transcripts_df = transcripts_df.append({"id": file_id, "text": file_contents}, ignore_index=True)
  transcripts_df = transcripts_df.append({"id": file_id, "text": file_contents}, ignore_index=True)
  transcripts_df = transcripts_df.append({"id": file_id, "text": file_contents}, ignore_index=True)
  transcripts_df = transcripts_df.append({"id": file_id, "text": file_contents}, ignore_index=True)
  transcripts_df = transcripts_df.append({"id": file_id, "text": file_contents}, ignore_index=True)
  transcripts_df = transcripts_df.append({"id": file_id, "text": file_contents}, ignore_index=True)
  transcripts_df = transcripts_df.append({"id": file_id, "text": file_contents}, ignore_index=True)
  transcripts_df = transcripts_df.append({"id": file_id, "text": file_contents}, ignore_index=True)
  transcripts_df = transcripts_df.append({"id": file_id, "text": file_contents}, ignore_index=True)
  transcripts_df = transcripts_df.append({"id": file_id, "text": file_contents}, ignore_index=True)


Unnamed: 0,id,Gender,PHQ_Binary,PHQ_Score,PCL-C (PTSD),PTSD Severity,text
0,300,male,0,2,0,25,which will record your body. So I'll show you...
1,301,male,0,3,0,17,"Yeah, there's all sorts of different studies ..."
2,306,female,0,0,0,21,"Okay, looks like we're good. But let's move a..."
3,317,male,0,8,1,51,Okay. How long is this? This is probably goin...
4,320,female,0,11,1,64,"Okay, everything looks good. Okay. Perfect. O..."


In [24]:
df_train = pd.read_csv('df_train_prompt_concise.csv')
df_dev = pd.read_csv('df_dev_prompt_concise.csv')
df_test = pd.read_csv('df_test_prompt_concise.csv')
df_test.head()

Unnamed: 0,id,Gender,PHQ_Binary,PHQ_Score,PCL-C (PTSD),PTSD Severity,text,completions
0,600,female,0,5,0,23.0,"Alright, there you are. Perfect. So you're go...",Key linguistic features indicative of depressi...
1,602,female,1,13,1,67.0,This is super neat. I like this. Me either at...,Some potential linguistic features indicative ...
2,604,male,1,12,0,30.0,"So if you could just say 1, 2, 3, 4, 5. 1, 2,...","Based on the text provided, it is difficult to..."
3,605,male,0,2,0,23.0,I'm going to bring up our virtual human for a...,"The interviewee appears to be cooperative, eng..."
4,606,female,0,5,0,46.0,"Okay, I just got it. I just got it new. So, o...",The key linguistic features indicative of depr...


In [25]:
import nltk
import numpy as np
from gensim.models import Word2Vec

nltk.download('punkt')
nltk.download('stopwords')

# Preprocess the text data
def preprocess_text(text):
    # Tokenize the text into words
    tokens = nltk.word_tokenize(text.lower())

    # Remove punctuation and stop words
    stop_words = set(nltk.corpus.stopwords.words('english'))
    tokens = [token for token in tokens if token.isalpha() and token not in stop_words]

    return tokens

# Train the Word2Vec model
def train_word2vec(texts, vector_size=100, window=5, min_count=1, workers=4):
    # Preprocess the text data
    preprocessed_text = [preprocess_text(text) for text in texts]

    # Train the Word2Vec model
    model = Word2Vec(preprocessed_text, vector_size=vector_size, window=window, min_count=min_count, workers=workers)

    return model

# Extract Word2Vec features for each text
def extract_word2vec_features(texts, model):
    features = []
    for text in texts:
        # Preprocess the text
        tokens = preprocess_text(text)

        # Initialize a feature vector
        feature_vector = np.zeros((model.vector_size,))

        # Compute the mean of the word vectors in the text
        for token in tokens:
            if token in model.wv:
                feature_vector += model.wv[token]
        feature_vector /= len(tokens)

        features.append(feature_vector)

    return np.array(features)


[nltk_data] Downloading package punkt to /Users/misha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/misha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
df_train['completions'] = df_train['completions'].astype(str)

# Train the Word2Vec model
model = train_word2vec(df_train['completions'])

# Extract Word2Vec features for each text
features_train = extract_word2vec_features(df_train['completions'], model)


In [28]:
features_train.shape

(163, 100)

In [29]:
df_dev['completions'] = df_dev['completions'].astype(str)

# Train the Word2Vec model
model = train_word2vec(df_dev['completions'])

# Extract Word2Vec features for each text
features_dev = extract_word2vec_features(df_dev['completions'], model)

features_dev.shape

(56, 100)

In [31]:
df_test['completions'] = df_test['completions'].astype(str)

# Train the Word2Vec model
model = train_word2vec(df_test['completions'])

# Extract Word2Vec features for each text
features_test = extract_word2vec_features(df_test['completions'], model)

features_test.shape

(56, 100)

In [32]:
# train a SVR model on the bert_features and PHQ_Score as the target
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

X_train = np.array(features_train)
X_dev = np.array(features_dev)
X_train = np.concatenate((X_train, X_dev), axis=0)

X_test = np.array(features_test)

y_train = np.array(df_train['PHQ_Score'])
y_dev = np.array(df_dev['PHQ_Score'])
y_train = np.concatenate((y_train, y_dev), axis=0)

y_test = np.array(df_test['PHQ_Score'])

# train a SVR model on X_train and y_train
svr = SVR(kernel='rbf', C=3.5, gamma=0.1)
svr.fit(X_train, y_train)

# predict on X_train and calculate the mean squared error and mean absolute error
y_pred = svr.predict(X_train)
mse = mean_squared_error(y_train, y_pred)
mae = mean_absolute_error(y_train, y_pred)
print('rmse for train: ', np.sqrt(mse))
print('mae for train: ', mae)

# predict on X_dev and calculate the mean squared error and mean absolute error
# y_pred = svr.predict(X_dev)
# mse = mean_squared_error(y_dev, y_pred)
# mae = mean_absolute_error(y_dev, y_pred)
# print('rmse for dev: ', np.sqrt(mse))
# print('mae for dev: ', mae)

# predict on X_dev and calculate the mean squared error and mean absolute error
y_pred = svr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print('rmse for test: ', np.sqrt(mse))
print('mae for test: ', mae)

rmse for train:  6.188883020858026
mae for train:  4.883945462236666
rmse for test:  7.0963441096254565
mae for test:  5.623394984967051


In [33]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Define the parameter grid to search over
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.01, 0.1, 1, 5, 10, 100],
    'kernel': ['linear', 'rbf']
}

# Create a support vector regression object
svr = SVR()

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=svr, param_grid=param_grid, cv=5)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)

y_pred = grid_search.predict(X_train)
rmse = np.sqrt(mean_squared_error(y_train, y_pred))
mae = mean_absolute_error(y_train, y_pred)
print("RMSE Train:", rmse)
print("MAE Train:", mae)

# Make predictions on the testing set using the best model
y_pred = grid_search.predict(X_test)

# Calculate the root mean squared error and mean absolute error
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print("RMSE Test:", rmse)
print("MAE Test:", mae)


Best hyperparameters: {'C': 0.1, 'gamma': 0.01, 'kernel': 'rbf'}
RMSE Train: 6.181224094140514
MAE Train: 4.885384133298123
RMSE Test: 7.0567231475668235
MAE Test: 5.6071905864383025
