In [None]:
# This notebook is based on ChatGPT outputs of transcripts using this prompt:
# Can you identify any signs or indications of depression in the following text, which is a transcript of an interview with a person?

In [None]:
import pandas as pd

# Load the ChatGPT outputs from sheet 'Sheet1' of the Excel (.xlsx) workbook.
chatgpt_xlsx_path = '../data/DAIC/DAIC_Chatgpt_text_1.xlsx'
chatgpt_text_df = pd.read_excel(chatgpt_xlsx_path, sheet_name='Sheet1')

# Replace the header positionally: the sheet must hold exactly two columns,
# which become 'id' and 'text' (pandas raises on a length mismatch).
chatgpt_column_names = ['id', 'text']
chatgpt_text_df.columns = chatgpt_column_names

# Preview the first rows as the cell output.
chatgpt_text_df.head()

In [None]:
# Column names applied (by position) to every label split.
LABEL_COLUMNS = ['id', 'Gender', 'PHQ_Binary', 'PHQ_Score', 'PCL-C (PTSD)', 'PTSD Severity']


def load_label_split(csv_path, columns=LABEL_COLUMNS):
    """Read one label-split CSV and rename its columns positionally.

    Args:
        csv_path: path of the CSV file to read.
        columns: new column names, applied in order; the file must contain
            exactly ``len(columns)`` columns.

    Returns:
        pandas.DataFrame with the renamed columns.

    Raises:
        ValueError: if the number of columns in the file differs from
            ``len(columns)`` (the message names the offending file).
    """
    labels = pd.read_csv(csv_path)
    if len(labels.columns) != len(columns):
        raise ValueError(
            f"{csv_path}: expected {len(columns)} columns, found "
            f"{len(labels.columns)}: {list(labels.columns)}"
        )
    # Copy so the shared default list is never aliased by the frame's index.
    labels.columns = list(columns)
    return labels


# Load the three splits with identical column naming.
labels_dev_df = load_label_split('../data/DAIC/labels/dev_split.csv')
labels_train_df = load_label_split('../data/DAIC/labels/train_split.csv')
labels_test_df = load_label_split('../data/DAIC/labels/test_split.csv')

# Only the last expression of a cell is displayed, so preview once here.
labels_test_df.head()

In [None]:
# Attach the ChatGPT text to each label split by joining on 'id'.
# No `how` is given, so pandas performs its default inner join: ids that are
# missing from either frame are dropped from the merged result.
df_dev = labels_dev_df.merge(chatgpt_text_df, on='id')
df_train = labels_train_df.merge(chatgpt_text_df, on='id')
df_test = labels_test_df.merge(chatgpt_text_df, on='id')

# Preview the last merged split as the cell output.
df_test.head()

In [None]:
# extract deproberta features from column df['text']
from sentence_transformers import SentenceTransformer
import numpy as np
import torch

from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Load the DepRoBERTa sequence classifier and its tokenizer by hub identifier.
tokenizer = AutoTokenizer.from_pretrained("rafalposwiata/deproberta-large-depression")
model = AutoModelForSequenceClassification.from_pretrained("rafalposwiata/deproberta-large-depression")

# Text column of each merged split.
X_train = df_train['text']
X_dev = df_dev['text']
X_test = df_test['text']

print(len(X_train))
print(len(X_dev))
print(len(X_test))


def extract_logit_features(texts, log_every=100):
    """Run every text through the classifier and collect its raw output.

    Args:
        texts: iterable of strings (here a pandas Series). It is iterated
            positionally, so the result does not depend on the Series index.
        log_every: print the running position every ``log_every`` items.

    Returns:
        list of numpy arrays, one per text, each holding ``outputs[0]`` of the
        model for a batch of size 1 (presumably the classification logits with
        shape (1, num_labels) -- TODO confirm against the model card).
    """
    features = []
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for position, text in enumerate(texts):
            # truncation=True clips over-long texts to the tokenizer's
            # configured maximum instead of overflowing the model's position
            # embeddings (assumes the tokenizer defines model_max_length --
            # TODO confirm).
            token_ids = tokenizer.encode(text, add_special_tokens=True, truncation=True)
            input_ids = torch.tensor(token_ids).unsqueeze(0)  # Batch size 1
            outputs = model(input_ids)
            features.append(outputs[0].numpy())
            if position % log_every == 0:
                print(position)
    return features


# Extract features for each split with the same procedure.
X_train_features = extract_logit_features(X_train)
X_dev_features = extract_logit_features(X_dev)
X_test_features = extract_logit_features(X_test)




In [None]:
def _drop_batch_axis(stacked):
    """Reshape a 3-D array to 2-D, keeping its first and third dimensions."""
    return stacked.reshape(stacked.shape[0], stacked.shape[2])


# Stack each split's per-text outputs into a single array, then collapse the
# middle axis so every text becomes one row of the feature matrix.
deproberta_features_train = _drop_batch_axis(np.array(X_train_features))
deproberta_features_dev = _drop_batch_axis(np.array(X_dev_features))
deproberta_features_test = _drop_batch_axis(np.array(X_test_features))



In [None]:
# Report the shape of each split's DepRoBERTa feature matrix (train, dev, test).
for split_features in (deproberta_features_train, deproberta_features_dev, deproberta_features_test):
    print(split_features.shape)

# Show the first training feature vector as a sanity check.
print(deproberta_features_train[0])

In [None]:
# Train an SVR on the DepRoBERTa features with PHQ_Score as the regression target.
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

# Feature matrices are the `deproberta_features_*` arrays built earlier in the
# notebook; `bert_features_*` is never defined here and raised NameError.
X_train = np.array(deproberta_features_train)
X_dev = np.array(deproberta_features_dev)
X_test = np.array(deproberta_features_test)

# Regression targets, taken from the same merged frames the texts came from so
# rows stay aligned with the feature matrices.
y_train = np.array(df_train['PHQ_Score'])
y_dev = np.array(df_dev['PHQ_Score'])
y_test = np.array(df_test['PHQ_Score'])

# Fit an RBF-kernel SVR on the training split.
svr = SVR(kernel='rbf', C=3.5, gamma=0.1)
svr.fit(X_train, y_train)

# Report RMSE and MAE on each split in turn (train, dev, test).
for split_name, X_split, y_split in (
    ('train', X_train, y_train),
    ('dev', X_dev, y_dev),
    ('test', X_test, y_test),
):
    y_pred = svr.predict(X_split)
    mse = mean_squared_error(y_split, y_pred)
    mae = mean_absolute_error(y_split, y_pred)
    # RMSE is the square root of the mean squared error.
    print(f'rmse for {split_name}: ', np.sqrt(mse))
    print(f'mae for {split_name}: ', mae)

In [None]:
# Hyper-parameter sweep for the RBF SVR, scored by MAE on the dev split.

# Candidate values for the two hyper-parameters.
C_range = [1, 2, 3, 10, 20, 30, 100, 200, 300, 1000]
gamma_range = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = {'gamma': gamma_range, 'C': C_range}

# Running best: starts at the first grid point with a sentinel MAE of 1000,
# which any smaller dev MAE replaces.
best_c, best_gamma, best_mae = 1, 0.001, 1000

# Visit every (C, gamma) pair, with C varying slowest.
for c, gamma in [(c_value, gamma_value) for c_value in C_range for gamma_value in gamma_range]:
    # Fit on the training split, then score on the dev split.
    svr = SVR(kernel='rbf', C=c, gamma=gamma).fit(X_train, y_train)
    y_pred = svr.predict(X_dev)
    mae = mean_absolute_error(y_dev, y_pred)
    # Strict comparison: on ties the earlier grid point is kept.
    if mae < best_mae:
        best_c, best_gamma, best_mae = c, gamma, mae

print('best c: ', best_c)
print('best gamma: ', best_gamma)
print('best mae: ', best_mae)