# BUILDING A PARALLEL DATASET OF SEEKER'S POST, NON-EMPATHETIC AND EMPATHETIC RESPONSE:

In [None]:
#License for the original datasets:

#BSD 3-Clause License

#Copyright (c) 2020, Behavioral Data Science Group
#All rights reserved.

#Redistribution and use in source and binary forms, with or without
#modification, are permitted provided that the following conditions are met:

#1. Redistributions of source code must retain the above copyright notice, this
#   list of conditions and the following disclaimer.

#2. Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.

#3. Neither the name of the copyright holder nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.

#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
#AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
#IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
#DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
#FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
#DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
#SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
#CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
#OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
#OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

In [None]:
#mount Google Drive for easier file management
#(Colab-only: google.colab is not importable outside a Colab runtime)
#The later cells read the three input CSVs from, and write dataset.csv to,
#paths under drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#import statements
import pandas as pd
import string
import re
#remove limits to read all the text in the dataframe
#Option names are spelled out in full: pandas resolves short names by pattern
#matching, and a short name such as 'max_rows' raises OptionError as soon as
#more than one registered option contains it.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

#### Before running the following cells, make sure that you have downloaded the files 'emotional-reactions-reddit.csv', 'explorations-reddit.csv' and 'interpretations-reddit.csv' into the Google Drive account mounted to this notebook.

In [None]:
#Load the three annotation files. The 'rationales' column of each file is
#renamed to a file-specific name (rationales1/2/3) so the three columns stay
#distinguishable once the frames are concatenated.

#emotional reactions -> rationales1
emo_react_df = (
    pd.read_csv('drive/MyDrive/emotional-reactions-reddit.csv') #change this path if necessary
    .rename(columns={'rationales': 'rationales1'})
)

#explorations -> rationales2
explor_df = (
    pd.read_csv('drive/MyDrive/explorations-reddit.csv') #change this path if necessary
    .rename(columns={'rationales': 'rationales2'})
)

#interpretations -> rationales3
interpret_df = (
    pd.read_csv('drive/MyDrive/interpretations-reddit.csv') #change this path if necessary
    .rename(columns={'rationales': 'rationales3'})
)

In [None]:
#keep only the rows whose 'level' equals 2, i.e. the responses that were
#scored as strongly empathetic
high_emo_df = emo_react_df[emo_react_df['level'].eq(2)]
high_explor_df = explor_df[explor_df['level'].eq(2)]
high_interpret_df = interpret_df[interpret_df['level'].eq(2)]

In [None]:
#stack the three high-empathy frames into one, drop rows that are exact
#duplicates across every column, and renumber the index from 0
#NOTE: each frame carries its own rationales column, so rows coming from
#different files have NaN in the other two rationale columns.
high_empathy_frames = [high_emo_df, high_explor_df, high_interpret_df]
concat_df = pd.concat(high_empathy_frames)
concat_df = concat_df.drop_duplicates()
concat_df = concat_df.reset_index(drop=True)
len(concat_df)

1862

In [None]:
#every NaN in the frame becomes the integer 0, so a missing rationale can be
#recognised with a plain comparison when the rationales are removed below
concat_df = concat_df.fillna(value=0)

In [None]:
#add a 'no_empathy_response' column to concat_df:
#the response_post text with every rationale span of that row removed


def _strip_rationales(response, rationale_cells):
    """Return ``response`` with every rationale span deleted.

    response: the full text of the response post.
    rationale_cells: the row's rationales1 / rationales2 / rationales3 values.
        A cell that is not a string (NaN, or the 0 written by fillna) means
        "no rationale of this kind" and is skipped.

    A cell may hold several spans separated by '|' (the dataset appears to
    delimit multiple rationales this way - confirm against its README). Each
    span is removed as literal text, so characters such as '.', '?' or '('
    inside a rationale are not interpreted as regex syntax.
    """
    #start from the untouched response so that a row without any rationale
    #keeps its own text instead of inheriting another row's result
    stripped = response
    for cell in rationale_cells:
        if not isinstance(cell, str):
            continue
        for span in cell.split('|'):
            if span:  #a trailing '|' produces an empty piece
                #work on the running result so removals from all three
                #rationale columns accumulate
                stripped = stripped.replace(span, '')
    return stripped


concat_df['no_empathy_response'] = [
    _strip_rationales(response, rationale_cells)
    for response, *rationale_cells in zip(
        concat_df['response_post'],
        concat_df['rationales1'],
        concat_df['rationales2'],
        concat_df['rationales3'],
    )
]

In [None]:
#rows whose no_empathy_response is the empty string are discarded,
#then the index is renumbered from 0
has_text = concat_df['no_empathy_response'] != ''
concat_df = concat_df[has_text].reset_index(drop=True)
print(len(concat_df))

1408


In [None]:
#drop the id, level and rationale columns; what remains is the seeker's post,
#the original (empathic) response, and the modified response with the
#empathic sentence(s) removed
parallel_df = concat_df.drop(
    columns=['sp_id', 'rp_id', 'level', 'rationales1', 'rationales2', 'rationales3']
)

In [None]:
#remove leftover punctuation and spaces if they appear at the beginning of a string

#define punctuation
#(raw string so that the backslash is a literal character, not the start of
#an escape sequence)
punctuation = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''

#Strip every leading space / punctuation character from every row in one
#vectorised call. This covers the whole frame rather than a fixed number of
#rows, removes the whole leading run however long it is, and cannot fail on
#a string that becomes empty (such a row simply ends up as '').
parallel_df['no_empathy_response'] = (
    parallel_df['no_empathy_response'].str.lstrip(' ' + punctuation)
)

In [None]:
#put the columns in the order post / modified response / original response
#and give the two response columns their final names: source and target
column_order = ['seeker_post', 'no_empathy_response', 'response_post']
final_names = {'no_empathy_response' : 'source', 'response_post' : 'target'}
parallel_df = parallel_df[column_order].rename(columns=final_names)

In [None]:
#show the first five rows of the finished dataset
parallel_df.head(n=5)

Unnamed: 0,seeker_post,source,target
0,"I've had a hard time going back to school. Going back to school has been difficult for me, It's been around five years since I've been in a class room and I ask the teachers and the people that I go to my classes to cut me some slack for being rusty but they tell me that it doesn't matter, and my step mom doesn't want me to use the computer because she doesn't want to share. Its lead me to skipping classes because I feel like theirs no use in going to school, it would be cool to know how you would handle this situation and if you can give me any advice.","but been in a similar situation after taking some time off of school. My best advice is to take the minimum amount of classes possible so you get too overwhelmed. For me essential to plan my assignments in advance, so that I can just do things one by one and not let it all get piled up cuz then I wanna die. Also, if you need a break, take one. School will always be there but good to take care of yourself too","sorry to hear that, but been in a similar situation after taking some time off of school. My best advice is to take the minimum amount of classes possible so you get too overwhelmed. For me essential to plan my assignments in advance, so that I can just do things one by one and not let it all get piled up cuz then I wanna die. Also, if you need a break, take one. School will always be there but good to take care of yourself too"
1,I miss my mum. I'm just crying a lot today and.I'm really depressed because I miss my mum,"I lost my mum too, two years ago. but it'll get better for you, I promise.","I'm sorry, my friend. I lost my mum too, two years ago. I feel your pain but it'll get better for you, I promise."
2,Not going to kill myself but I really want to die. What is the point?? Everything is shit anyway?? I really didn't ask to be born and then my parents put this life responsibility on me. Anybody else feel like this?,"omg lol I 100% blame my parents. After taking an AP Psychology course at my school, it kind of shifted my own perspective on my depression. I know I can't claim to be an expert after reading just one textbook, but even an idiot can realize that your behavior and who you are is just totally as a result of your parents. The genetic aspects aren't their fault of course, but they are perfect human specimens, so you can imagine what 17 years of always being wrong has done to my self esteem and thought process. Strangely, being cognizant of that fact doesn't change it much. Of course, I've directly told my parents I blame them for who I am, and as perfect specimens, they told me that I can't keep blaming them for everything. There's truth to that of course, but it's rather incredible that they can find the gall to abdicate themselves of all guilt and responsibility. I'd better just get on my hands and knees and obey their every whim, since they're the ones who work and pay for my existence, regardless of whether or not I can actually do jack shit about that child labor laws, yo . Lol I had to rant.","omg lol I 100% blame my parents. After taking an AP Psychology course at my school, it kind of shifted my own perspective on my depression. I know I can't claim to be an expert after reading just one textbook, but even an idiot can realize that your behavior and who you are is just totally as a result of your parents. The genetic aspects aren't their fault of course, but they are perfect human specimens, so you can imagine what 17 years of always being wrong has done to my self esteem and thought process. Strangely, being cognizant of that fact doesn't change it much. 
Of course, I've directly told my parents I blame them for who I am, and as perfect specimens, they told me that I can't keep blaming them for everything. There's truth to that of course, but it's rather incredible that they can find the gall to abdicate themselves of all guilt and responsibility. I'd better just get on my hands and knees and obey their every whim, since they're the ones who work and pay for my existence, regardless of whether or not I can actually do jack shit about that child labor laws, yo . Lol sorry your post is just so relatable I had to rant."
3,"Hang myself so much I've moved up to face cutting. There are times where I would like to smash a wine glad into my face, or fantasise about having my head smashed in. Today I started cutting my face.","Why the heck do we hate ourselves?! Man such as crappy lie we have bought from seriously the pit of hell. Nothing else could be so evil as to make us not believe we are worthy and deserving of a good life just as much as everyone else. Sorry to get weird there, Anyway, I have one scar on my arm from one time that I tried that and there forever. It still makes me sad when I see it. . There will be a time in your future where you will not feel so much pain and will hopefully even feel better about yourself &amp; the future you will be so thankful if you permanently scar your face.","Why the heck do we hate ourselves?! Man such as crappy lie we have bought from seriously the pit of hell. Nothing else could be so evil as to make us not believe we are worthy and deserving of a good life just as much as everyone else. Sorry to get weird there, it just makes me mad that anyone has to feel this way about themselves. Anyway, I have one scar on my arm from one time that I tried that and there forever. It still makes me sad when I see it. so sorry for what pain you are in. There will be a time in your future where you will not feel so much pain and will hopefully even feel better about yourself &amp; the future you will be so thankful if you permanently scar your face."
4,Another twelve hour graveyard shift. And I forgot to take my meds.. It's gonna be a long night &lt;/3,Hopefully it goes by quick.,Sorry to hear that. Hopefully it goes by quick.


In [None]:
#write the final dataset to Google Drive as CSV, without the index column
parallel_df.to_csv(
    path_or_buf=r'drive/MyDrive/dataset.csv',  #change this path if necessary
    index=False,
)

In [None]:
#download the 'punkt' tokenizer data
#(presumably what word_tokenize in the token-counting cell relies on)
#NOTE(review): newer NLTK releases may look for a differently named resource
#('punkt_tab') - confirm against the installed NLTK version.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
#count unique tokens in corpus
from nltk.tokenize import word_tokenize
import csv

def get_data(path="drive/MyDrive/dataset.csv"):
    """Yield every row of the CSV file at ``path`` as a list of strings.

    The header row is yielded too; the caller decides whether to skip it.
    """
    #newline='' is what the csv module asks for so that line breaks inside
    #quoted fields are parsed as part of the field; the encoding is given
    #explicitly so it does not depend on the platform default
    with open(path, "r", encoding="utf-8", newline="") as records:
        yield from csv.reader(records)

data = get_data()
next(data, None)  # skip header (no error if the file is empty)

#dict keys deduplicate with a constant-time lookup and keep first-seen order,
#so `tokens` is still a list of the distinct tokens in order of appearance
tokens = list(dict.fromkeys(
    token
    for row in data
    for sent in row
    for token in word_tokenize(sent)
))
len(tokens)

8410