In [9]:
# Directory holding the DAIC label/split CSV files. The trailing slash matters:
# file names are appended to this value by plain string concatenation.
labels_dir = '../data/DAIC/labels/'

In [None]:
# Read the train/dev split tables with identical parsing options.
csv_options = dict(delimiter=',', encoding='utf-8')
train_labels = pd.read_csv(labels_dir + 'train_split.csv', **csv_options)
dev_labels = pd.read_csv(labels_dir + 'dev_split.csv', **csv_options)

# The ids are taken positionally: first column of each split table.
train_ids = train_labels.to_numpy()[:, 0]
dev_ids = dev_labels.to_numpy()[:, 0]

# Report how many rows each split contains.
for size_label, split_ids in (('train_size: ', train_ids), ('dev_size: ', dev_ids)):
    print(size_label, len(split_ids))

In [None]:
# Read the transcript table; the first (unnamed) CSV column is consumed as the
# DataFrame index instead of being kept as a data column.
transcripts_path = '../data/DAIC/transcripts.csv'
transcripts = pd.read_csv(
    transcripts_path,
    index_col=0,
    delimiter=',',
    encoding='utf-8',
)

# Quick look at the first rows to confirm the file parsed as expected.
transcripts_preview = transcripts.head(10)
print('transcripts first 10 rows', transcripts_preview)

In [12]:
# Boolean masks marking which transcript rows belong to each split, decided by
# whether the row's ID appears in train_ids / dev_ids.
is_train_row = transcripts['ID'].isin(train_ids)
is_dev_row = transcripts['ID'].isin(dev_ids)

# Mask-based selection keeps the rows in the order they have in `transcripts`.
train_transcripts = transcripts.loc[is_train_row]
dev_transcripts = transcripts.loc[is_dev_row]

# Model inputs: the raw transcript text of each split.
X_transcripts_train = train_transcripts.loc[:, "text"]
X_transcripts_dev = dev_transcripts.loc[:, "text"]

In [13]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# Fetch the stop-word corpus only when it is not already installed locally, so
# re-running the notebook does not require network access every time.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Matches every character that is neither a word character nor whitespace.
_PUNCT_RE = re.compile(r'[^\w\s]')
# Matches runs of non-word characters at the very start or end of a token.
_EDGE_PUNCT_RE = re.compile(r'^\W+|\W+$')
# Lazily filled cache so the NLTK stop-word list is read only once per kernel.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def clean_text(text, stop_words=None):
    """Lowercase `text`, drop stop words, and strip punctuation.

    Each whitespace-separated token is compared against the stop words twice:
    once with only its leading/trailing punctuation trimmed (so entries that
    contain an apostrophe, such as "don't", still match) and once with all
    punctuation removed. Previously punctuation was removed first, which
    turned "don't" into "dont" and let such contractions slip through.

    Args:
        text: The string to clean. ``None`` and float NaN (how pandas
            represents a missing cell) are treated as empty text; any other
            non-string value is converted with ``str()``.
        stop_words: Optional collection of lowercase words to remove. When
            omitted, the NLTK English stop-word list is used.

    Returns:
        The remaining tokens, punctuation-free and lowercase, joined by
        single spaces. An empty string if nothing remains.
    """
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    if stop_words is None:
        stop_words = _english_stop_words()

    kept = []
    for token in text.lower().split():
        # Compare with inner apostrophes intact: "don't," -> "don't".
        if _EDGE_PUNCT_RE.sub('', token) in stop_words:
            continue
        bare = _PUNCT_RE.sub('', token)
        # Skip tokens that were pure punctuation or are stop words once bare.
        if bare and bare not in stop_words:
            kept.append(bare)

    return " ".join(kept)

[nltk_data] Downloading package stopwords to /home/reza/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# extract sentence-transformer embeddings from the cleaned transcript texts of the train and dev splits
from sentence_transformers import SentenceTransformer
import numpy as np

# Sentence-embedding model used to turn each transcript into a feature vector.
model = SentenceTransformer('all-mpnet-base-v2')

# Run every transcript through clean_text before embedding it.
X_train = list(map(clean_text, X_transcripts_train))
X_dev = list(map(clean_text, X_transcripts_dev))

# Number of cleaned transcripts per split (train first, then dev).
for cleaned_texts in (X_train, X_dev):
    print(len(cleaned_texts))

# Encode the cleaned texts: one embedding per transcript.
bert_features_train = model.encode(X_train)
bert_features_dev = model.encode(X_dev)

In [None]:
# train a SVR model on the bert_features and PHQ_Score as the target
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
import numpy as np

def _phq_scores_for(labels, participant_ids):
    """Return the PHQ_Score of each id in `participant_ids`, in that order.

    Args:
        labels: Label table with 'Participant_ID' and 'PHQ_Score' columns.
        participant_ids: Ids to look up; the output follows their order and
            repeats a score if an id appears more than once.

    Returns:
        NumPy array of scores, one per entry of `participant_ids`.

    Raises:
        ValueError: If any id has no score in `labels` (pandas itself also
            raises if 'Participant_ID' contains duplicates).
    """
    scores_by_id = labels.set_index('Participant_ID')['PHQ_Score']
    scores = scores_by_id.reindex(np.asarray(participant_ids))
    if scores.isna().any():
        raise ValueError('missing PHQ_Score for some transcript ids')
    return scores.to_numpy()


def _report_errors(split_name, y_true, y_hat):
    """Print and return (mse, mae) of `y_hat` against `y_true`."""
    split_mse = mean_squared_error(y_true, y_hat)
    split_mae = mean_absolute_error(y_true, y_hat)
    print(f'mse for {split_name}: ', split_mse)
    print(f'mae for {split_name}: ', split_mae)
    return split_mse, split_mae


X_train = np.array(bert_features_train)
X_dev = np.array(bert_features_dev)

# The embeddings follow the row order of the transcript tables, so the targets
# are looked up per transcript ID rather than taken in label-file order; the
# two files are not guaranteed to list participants in the same order.
y_train = _phq_scores_for(train_labels, train_transcripts['ID'])
y_dev = _phq_scores_for(dev_labels, dev_transcripts['ID'])

# Guard against stale embeddings left in the kernel from an earlier run.
if len(X_train) != len(y_train) or len(X_dev) != len(y_dev):
    raise ValueError('number of embeddings does not match number of targets')

# train a SVR model on X_train and y_train
svr = SVR(kernel='rbf', C=1e2, gamma=0.1)
svr.fit(X_train, y_train)

# predict on X_train and calculate the mean squared error and mean absolute error
y_pred = svr.predict(X_train)
_report_errors('train', y_train, y_pred)

# predict on X_dev and calculate the mean squared error and mean absolute error
# (y_pred, mse and mae end up holding the dev-split values, as before)
y_pred = svr.predict(X_dev)
mse, mae = _report_errors('dev', y_dev, y_pred)