# Collect Depression Data

Post notes:

* Triangles-2 is supposedly a follow-up study with the same subjects as Triangles-1, but with only the recovered patients. (@Riccardo is this the case?)
* Removes one row from Triangles-2 that has neither a Subject ID nor a Task ID
* For Triangles-2, the patients had recovered. I added the Recovered column to reflect this. I am unsure if the subject IDs match between Triangles-1 and Triangles-2.


In [None]:
import pandas as pd
import numpy as np
import ntpath
import re
from os.path import join

### Utilities

In [None]:
def path_leaf(path):
    """
    Extracts the filename (the last path component) from a path string

    Forward and backward slashes are both treated as separators,
    because the splitting is done with ``ntpath``.

    :param path: Path string, e.g. ``"raw/DEPR/file.csv"``.
    :return: The last component of the path. When the path ends with a
        separator (e.g. ``"raw/DEPR/"``), the component in front of that
        separator is returned instead of an empty string.
    """
    head, tail = ntpath.split(path)
    # A trailing separator leaves `tail` empty; fall back to the last
    # component of `head` so that "a/b/" yields "b".
    return tail or ntpath.basename(head)


### Paths

In [None]:
# Root folder that contains "raw/" and "preprocessed/".
# The empty string makes every path below relative to the working directory.
data_path = ""

# Output: the combined, preprocessed Triangles data
save_path = join(
    data_path,
    "preprocessed/Depression-All-DK-Triangles.csv",
)

# Input: raw data files.
# NOTE: all three variables are called path_txt_*, but the first two
# point at CSV exports; only the third is a (tab-separated) text file.
path_txt_1 = join(
    data_path,
    "raw/DEPR/Depression-Controls-DK-Triangles-2-Sheet1.csv",
)
path_txt_2 = join(
    data_path,
    "raw/DEPR/Depression-Patients-DK-Triangles-2-Sheet1.csv",
)
path_txt_3 = join(
    data_path,
    "raw/DEPR/Depression-Triangles-1.txt",
)



## Triangles 2

### Load CSV files

In [None]:
# Read the two Triangles-2 exports (controls first, then patients)
# with pandas' default CSV settings.
triangles_2_controls, triangles_2_patients = (
    pd.read_csv(csv_path) for csv_path in (path_txt_1, path_txt_2)
)

In [None]:
triangles_2_controls.head(3)

In [None]:
triangles_2_patients.head(3)

### Add info

In [None]:
# Add diagnoses: the group is known from the file each frame was read from
triangles_2_controls = triangles_2_controls.assign(Diagnosis="Control")
triangles_2_patients = triangles_2_patients.assign(Diagnosis="Depression")

In [None]:
# Add data file: tag every row with the name (not the full path)
# of the CSV file it was read from
triangles_2_controls = triangles_2_controls.assign(File=path_leaf(path_txt_1))
triangles_2_patients = triangles_2_patients.assign(File=path_leaf(path_txt_2))

In [None]:
# Add study
# NOTE(review): the Triangles 1 rows are tagged with Study 1 as well, so
# this column does not tell the two collections apart - confirm that 1
# (and not 2) is the intended value here.
triangles_2_controls["Study"] = triangles_2_patients["Study"] = 1

In [None]:
# Add recovery status: the Triangles-2 patients are flagged as
# recovered (1), the controls are not (0)
triangles_2_controls = triangles_2_controls.assign(Recovered=0)
triangles_2_patients = triangles_2_patients.assign(Recovered=1)

In [None]:
# "Sub File" is left empty for Triangles 2; the column is created so that
# these frames end up with the same columns as the Triangles 1 frame.
triangles_2_controls["Sub File"] = triangles_2_patients["Sub File"] = None

In [None]:
# Number the rows of each subject 1, 2, 3, ... in file order.
# The grouping key is the column literally named "Subject'" (with a
# trailing apostrophe) - presumably the raw CSV header; the columns are
# only relabelled in a later cell.
triangles_2_controls = triangles_2_controls.assign(
    Trial=lambda frame: frame.groupby("Subject'").cumcount() + 1
)
triangles_2_patients = triangles_2_patients.assign(
    Trial=lambda frame: frame.groupby("Subject'").cumcount() + 1
)

In [None]:
triangles_2_patients.columns

In [None]:
# New column labels, applied by position to both Triangles-2 frames.
colnames = [
    # presumably the six columns delivered in the raw CSV files
    "Subject",
    "Task",
    "Start Time",
    "Transcript",
    "End Time",
    "Transcriber",
    # the six columns added in the cells above, in creation order
    "Diagnosis",
    "File",
    "Study",
    "Recovered",
    "Sub File",
    "Trial",
]

In [None]:
# Relabel the columns by position: both frames must hold their columns
# in exactly the order listed in `colnames`.
triangles_2_controls.columns, triangles_2_patients.columns = colnames, colnames

In [None]:
triangles_2_patients.head(3)

### Reorder columns

In [None]:
# Final column layout: provenance first, then subject information,
# then the trial/task content and its timing.
column_order = [
    "File",
    "Sub File",
    "Study",
    "Subject",
    "Diagnosis",
    "Recovered",
    "Trial",
    "Task",
    "Transcript",
    "Start Time",
    "End Time",
    "Transcriber",
]

In [None]:
# Select all rows and put the columns into the order given by `column_order`
triangles_2_controls = triangles_2_controls.loc[:, column_order]
triangles_2_patients = triangles_2_patients.loc[:, column_order]

In [None]:
# Stack the controls on top of the patients. Each frame keeps its own
# row labels, so index values are not unique in the result.
triangles_2 = pd.concat(
    objs=[triangles_2_controls, triangles_2_patients],
    axis=0,
)

In [None]:
triangles_2

### Remove empty tasks

In [None]:
# Drop rows that are mostly empty: a row stays only when at least
# 9 of its cells hold a non-missing value
triangles_2.dropna(
    axis="index",
    thresh=9,
    inplace=True,
)

In [None]:
# One row has neither a Subject ID nor a Task ID - remove every row
# whose Subject is missing
triangles_2.dropna(
    axis="index",
    subset=["Subject"],
    inplace=True,
)

In [None]:
# Store the task number as an integer; only the "Task" column is converted
triangles_2 = triangles_2.astype({"Task": int})

In [None]:
triangles_2

In [None]:
triangles_2.head(10)

## Triangles 1

In [None]:
# Triangles 1 comes as a single tab-separated text file
triangles_1 = pd.read_csv(filepath_or_buffer=path_txt_3, sep="\t")

In [None]:
triangles_1

### Add info

In [None]:
def extract_group_and_id(w):
    """
    Splits a file name into its leading letter prefix and the digits after it

    :param w: File name such as ``"dc12.txt"``. The ``.txt`` extension
        is optional.
    :return: Tuple of two strings ``(prefix, digits)``, e.g.
        ``("dc", "12")``, or ``None`` when the name does not start with
        letters immediately followed by digits.
    """
    # Remove .txt - but only when it is really there. Blindly cutting the
    # last four characters would eat into the ID of a name that has no
    # extension (e.g. "dc12345" would become "dc1").
    if w.lower().endswith(".txt"):
        w = w[:-4]
    # Extract group and id (the letters are matched case-insensitively)
    match = re.match(r"([a-z]+)([0-9]+)", w, re.I)
    return match.groups() if match else None

In [None]:
# Parse every file name into a (prefix, id) tuple; a name that cannot be
# parsed yields None
group_and_id = triangles_1["File"].apply(extract_group_and_id)

In [None]:
# Derive the diagnosis from the file-name prefix: "dc" marks a control,
# every other prefix is treated as a depression patient.
# The prefix is lower-cased before the comparison because
# extract_group_and_id matches the letters case-insensitively, so a file
# named "DC12.txt" must count as a control too.
triangles_1["Diagnosis"] = [
    "Control" if g[0].lower() == "dc" else "Depression" for g in group_and_id
]

In [None]:
# The subject ID is the numeric part of the file name (still a string here)
triangles_1["Subject"] = [subject_id for _prefix, subject_id in group_and_id]

In [None]:
# No Triangles 1 row is flagged as recovered
triangles_1 = triangles_1.assign(Recovered=0)

In [None]:
# Add study
triangles_1 = triangles_1.assign(Study=1)

In [None]:
# Keep the per-row file name from the raw data in "Sub File", then let
# "File" hold the name of the collection file all rows were read from.
triangles_1 = triangles_1.assign(
    **{
        "Sub File": triangles_1["File"],
        "File": path_leaf(path_txt_3),
    }
)

In [None]:
# Trial numbers the rows within each "Sub File" 1, 2, 3, ... in file order;
# Task is filled with NaN for all Triangles 1 rows.
triangles_1 = triangles_1.assign(
    Trial=lambda frame: frame.groupby("Sub File").cumcount() + 1,
    Task=np.nan,
)

In [None]:
# Transcriber is filled with NaN for all Triangles 1 rows
triangles_1 = triangles_1.assign(Transcriber=np.nan)

In [None]:
triangles_1.columns

In [None]:
# Relabel the columns by position. The first three labels rename what are
# presumably the raw timing/transcript columns; the remaining labels repeat
# the names of the columns as they were created in the cells above.
triangles_1.columns = [
    "Start Time",
    "Transcript",
    "End Time",
    "File",
    "Diagnosis",
    "Subject",
    "Recovered",
    "Study",
    "Sub File",
    "Trial",
    "Task",
    "Transcriber",
]

In [None]:
# Put the columns into the same layout that is used for Triangles 2
triangles_1 = triangles_1.loc[
    :,
    [
        "File",
        "Sub File",
        "Study",
        "Subject",
        "Diagnosis",
        "Recovered",
        "Trial",
        "Task",
        "Transcript",
        "Start Time",
        "End Time",
        "Transcriber",
    ],
]

In [None]:
triangles_1.head(20)

## Collect the data from all three source files

In [None]:
# Stack Triangles 1 on top of Triangles 2; both frames were given the
# same column layout beforehand
all_triangle_data = pd.concat(
    objs=[triangles_1, triangles_2],
    axis=0,
)

In [None]:
# Replace the row labels inherited from the source frames with a fresh
# 0..n-1 index; the old labels are discarded, not kept as a column
all_triangle_data = all_triangle_data.reset_index(drop=True)

In [None]:
# Triangles 1 subject IDs were taken from file names as strings;
# store the IDs of all rows as integers
all_triangle_data = all_triangle_data.astype({"Subject": int})

In [None]:
all_triangle_data

In [None]:
# Write the combined data. `index` is left at its default, so the row
# index is written as the first (unnamed) column of the file.
all_triangle_data.to_csv(path_or_buf=save_path)