In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import datetime
import matplotlib.pyplot as plt

In [2]:
df_depression = pd.read_csv('depression_posts_scraped.csv')
print(df_depression.shape)
df_depression.head(1)

(3903, 13)


Unnamed: 0,title,score,p_id,subreddit,url,num_comments,body,p_timestamp,c_id,comment,c_timestamp,Post_Reply,Time_to_Comment
0,Our most-broken and least-understood rules is ...,2361,doqwow,depression,https://www.reddit.com/r/depression/comments/d...,180,We understand that most people who reply immed...,2019-10-29 20:22:02,f5pot56,Understood and I apologise if I forget in the ...,2019-10-29 21:23:38,Y,0 days 01:01:36


In [3]:
# Top 10 comments by score
list(df_depression.sort_values(by = ['score'], ascending = False)['comment'][:10])

['Understood and I apologise if I forget in the future and break it',
 'Yes! I’ve called a crisis line three times in my life, all years apart. The one time when I genuinely felt understood and felt calmer at the end of the call was when the person opened by saying that there was only so much that they could do, agreed that others overpromising and underdelivering was a large contributing factor to why I was doing so poorly, and just listened to and validated my feelings without trying to “fix” them or tell me basically that I should “stop feeling that way.” I called while I was in the process of trying to access actual professional support and not finding it. And that’s what I needed in that moment, just another person to hear me out and tell me that I wasn’t crazy or overreacting by feeling the way I did, in order to help me maintain the level of calm that I needed so I could continue the search for more substantial long-term help.\n\n“Ducklings” I like that. I know the type of perso

In [4]:
# Top 10 comments by number of comments
list(df_depression.sort_values(by = ['num_comments'], ascending = False)['comment'][:10])

["I just found out my Mum has skin cancer, although we know it's not the serious one I'm still petrified.",
 'Ahh where do I begin… I lost almost everything 2 months ago the love of my life broke up with me lost my best childhood friend to suicide my father told me he no longer wants to talk to me. I’ve battled with self diagnosed depression and anxiety most of my life feels like. The older generation never understand because I’m “to young” to be depressed or stressed or have anxiety. The one person I want to pour everything out is the reason for some of my pain. It hurts everything. Oh I forgot I got laid off as well all of this happened in about a weeks span. I’m pretty good at hiding my pain behind a smile but I’ve begun to eat a lot less I’ve probably lost some weight but I’m to scared to check. Most nights I just sit in bed wanting yo cry and sometimes I do but most nights I can’t. I just want to feel again. If anyone would like to talk I’d love to',
 'Got a kidney stone and lower

In [5]:
df_depression_post = df_depression[['p_id', 'score', 'num_comments', 'p_timestamp']].drop_duplicates(keep='first')
df_depression_post.head()

Unnamed: 0,p_id,score,num_comments,p_timestamp
0,doqwow,2361,180,2019-10-29 20:22:02
174,tc5fpx,94,309,2022-03-12 07:03:44
481,u7yta2,294,57,2022-04-20 20:45:30
531,u86f8c,16,13,2022-04-21 02:31:17
544,u7ud1o,74,12,2022-04-20 17:01:14


In [6]:
df_depression_post_direct_reply = df_depression[df_depression['Post_Reply']=='Y'].groupby('p_id')['num_comments'].count()
df_depression_post_direct_reply = df_depression_post_direct_reply.to_frame().rename(columns={'num_comments': 'direct_reply_comments'})
df_depression_post = pd.merge(df_depression_post, df_depression_post_direct_reply, how = 'left', on = 'p_id')

df_depression_post['direct_comments_proportion'] = df_depression_post['direct_reply_comments']/df_depression_post['num_comments']
df_depression_post['direct_comments_proportion'] = df_depression_post['direct_comments_proportion'].fillna(0)

df_depression_post.head()

Unnamed: 0,p_id,score,num_comments,p_timestamp,direct_reply_comments,direct_comments_proportion
0,doqwow,2361,180,2019-10-29 20:22:02,70.0,0.388889
1,tc5fpx,94,309,2022-03-12 07:03:44,229.0,0.7411
2,u7yta2,294,57,2022-04-20 20:45:30,27.0,0.473684
3,u86f8c,16,13,2022-04-21 02:31:17,13.0,1.0
4,u7ud1o,74,12,2022-04-20 17:01:14,10.0,0.833333


In [7]:
def calc_minutes(s):
    try:
        s = s.lstrip()
        days = int(s.split(' ', 1)[0])
        s = s.split(' ', 1)[1].lstrip().split(' ', 1)[1] + ' '
        hours = int(s[0:2])
        minutes = int(s[3:5])
        seconds = int(s[6:8])
    
        return days*1440 + hours*60 + minutes + seconds/60
    except:
        return np.NaN

In [8]:
df_depression_post_direct_reply_time = df_depression[df_depression['Post_Reply']=='Y']
df_depression_post_direct_reply_time.head()

df_depression_post_direct_reply_time['time_to_reply'] = 0

for i in range(len(df_depression_post_direct_reply_time)):
    df_depression_post_direct_reply_time['time_to_reply'].iloc[i] = calc_minutes(df_depression_post_direct_reply_time['Time_to_Comment'].iloc[i])


                                                                                      
df_depression_post_direct_reply_time = df_depression_post_direct_reply_time.groupby('p_id')['time_to_reply'].aggregate(['median', 'min'])

df_depression_post = pd.merge(df_depression_post, df_depression_post_direct_reply_time, how = 'left', on = 'p_id')
df_depression_post = df_depression_post.rename(columns={'median': 'median_direct_reply_time', 'min': 'min_reply_time'})

df_depression_post.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_depression_post_direct_reply_time['time_to_reply'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj._check_is_chained_assignment_possible()


Unnamed: 0,p_id,score,num_comments,p_timestamp,direct_reply_comments,direct_comments_proportion,median_direct_reply_time,min_reply_time
0,doqwow,2361,180,2019-10-29 20:22:02,70.0,0.388889,111564.125,51.35
1,tc5fpx,94,309,2022-03-12 07:03:44,229.0,0.7411,27183.483333,44.316667
2,u7yta2,294,57,2022-04-20 20:45:30,27.0,0.473684,308.65,7.85
3,u86f8c,16,13,2022-04-21 02:31:17,13.0,1.0,185.533333,1.7
4,u7ud1o,74,12,2022-04-20 17:01:14,10.0,0.833333,494.883333,13.133333


In [9]:
sw = stopwords.words('english')

df_depression['body'] = df_depression['body'].astype(str).str.replace('\n', '').str.replace('\\', '')

df_depression['body_sw'] = df_depression['body'].apply(lambda x: ' '.join([word for word in x.split() if word not in (sw)])).astype(str)

#remove special characters, make all characters lowercase
df_depression['body_sw_p'] = df_depression['body_sw'].str.replace('\w\s]', '').str.lower()

df_depression['body_length'] = 0

for i in range(len(df_depression)):
    df_depression['body_length'].iloc[i] = len(df_depression['body'].iloc[i].split(' '))

  df_depression['body'] = df_depression['body'].astype(str).str.replace('\n', '').str.replace('\\', '')
  df_depression['body_sw_p'] = df_depression['body_sw'].str.replace('\w\s]', '').str.lower()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [10]:
analyser = SentimentIntensityAnalyzer()

df_depression['Sentiment Scores'] = df_depression['body_sw'].apply(analyser.polarity_scores)
df_depression['P_Sent_Neg'] = np.NaN
df_depression['P_Sent_Neu'] = np.NaN
df_depression['P_Sent_Pos'] = np.NaN
df_depression['P_Sent_Com'] = np.NaN


for i in range(len(df_depression)):
    df_depression['P_Sent_Neg'].iloc[i] = df_depression['Sentiment Scores'].iloc[i]['neg']
    df_depression['P_Sent_Neu'].iloc[i] = df_depression['Sentiment Scores'].iloc[i]['neu']
    df_depression['P_Sent_Pos'].iloc[i] = df_depression['Sentiment Scores'].iloc[i]['pos']
    df_depression['P_Sent_Com'].iloc[i] = df_depression['Sentiment Scores'].iloc[i]['compound']

    

df_depression_post_sentiments = df_depression.groupby('p_id')[['P_Sent_Neg', 'P_Sent_Neu', 'P_Sent_Pos', 'P_Sent_Com']].mean()

df_depression_post = pd.merge(df_depression_post, df_depression_post_sentiments, how = 'left', on = 'p_id')
df_depression_post.head()

Unnamed: 0,p_id,score,num_comments,p_timestamp,direct_reply_comments,direct_comments_proportion,median_direct_reply_time,min_reply_time,P_Sent_Neg,P_Sent_Neu,P_Sent_Pos,P_Sent_Com
0,doqwow,2361,180,2019-10-29 20:22:02,70.0,0.388889,111564.125,51.35,0.119,0.615,0.267,0.9973
1,tc5fpx,94,309,2022-03-12 07:03:44,229.0,0.7411,27183.483333,44.316667,0.164,0.618,0.218,0.9047
2,u7yta2,294,57,2022-04-20 20:45:30,27.0,0.473684,308.65,7.85,0.206,0.627,0.168,-0.3612
3,u86f8c,16,13,2022-04-21 02:31:17,13.0,1.0,185.533333,1.7,0.366,0.223,0.411,-0.25
4,u7ud1o,74,12,2022-04-20 17:01:14,10.0,0.833333,494.883333,13.133333,0.481,0.519,0.0,-0.5719


In [11]:
df_depression['comment'] = df_depression['comment'].astype(str).str.replace('\n', '').str.replace('\\', '')

df_depression['comment_sw'] = df_depression['comment'].apply(lambda x: ' '.join([word for word in x.split() if word not in (sw)])).astype(str)

#remove special characters, make all characters lowercase
df_depression['comment_sw_p'] = df_depression['comment_sw'].str.replace('[^\w\s]', '').str.lower()


#Create boolean for deleted comments
df_depression['deleted'] = np.where(df_depression['comment']=='[deleted]', 'Y', 'N')

df_depression['comment_length'] = 0

for i in range(len(df_depression)):
    df_depression['comment_length'].iloc[i] = len(df_depression['comment'].iloc[i].split(' '))

df_depression['comment_length'] = np.where(df_depression['deleted']=='Y', np.NaN, df_depression['comment_length'])

  df_depression['comment'] = df_depression['comment'].astype(str).str.replace('\n', '').str.replace('\\', '')
  df_depression['comment_sw_p'] = df_depression['comment_sw'].str.replace('[^\w\s]', '').str.lower()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [12]:
# Create series of comment length per user and merge with main dataframe
#Median for calculating average length of a command
df_depression_comment_length = df_depression.groupby('p_id')['comment_length'].median()
df_depression_comment_length = df_depression_comment_length.to_frame().rename(columns={'comment_length': 'median_comment_length'})

df_depression_post = pd.merge(df_depression_post, df_depression_comment_length, how = 'left', on = 'p_id')
df_depression_post.head()

Unnamed: 0,p_id,score,num_comments,p_timestamp,direct_reply_comments,direct_comments_proportion,median_direct_reply_time,min_reply_time,P_Sent_Neg,P_Sent_Neu,P_Sent_Pos,P_Sent_Com,median_comment_length
0,doqwow,2361,180,2019-10-29 20:22:02,70.0,0.388889,111564.125,51.35,0.119,0.615,0.267,0.9973,33.5
1,tc5fpx,94,309,2022-03-12 07:03:44,229.0,0.7411,27183.483333,44.316667,0.164,0.618,0.218,0.9047,41.0
2,u7yta2,294,57,2022-04-20 20:45:30,27.0,0.473684,308.65,7.85,0.206,0.627,0.168,-0.3612,19.0
3,u86f8c,16,13,2022-04-21 02:31:17,13.0,1.0,185.533333,1.7,0.366,0.223,0.411,-0.25,13.0
4,u7ud1o,74,12,2022-04-20 17:01:14,10.0,0.833333,494.883333,13.133333,0.481,0.519,0.0,-0.5719,17.5


In [13]:
#polarity score is the method 
df_depression['Sentiment Scores'] = df_depression['comment_sw'].apply(analyser.polarity_scores)
df_depression['C_Sent_Neg'] = np.NaN
df_depression['C_Sent_Neu'] = np.NaN
df_depression['C_Sent_Pos'] = np.NaN
df_depression['C_Sent_Com'] = np.NaN


for i in range(len(df_depression)):
    df_depression['C_Sent_Neg'].iloc[i] = df_depression['Sentiment Scores'].iloc[i]['neg']
    df_depression['C_Sent_Neu'].iloc[i] = df_depression['Sentiment Scores'].iloc[i]['neu']
    df_depression['C_Sent_Pos'].iloc[i] = df_depression['Sentiment Scores'].iloc[i]['pos']
    df_depression['C_Sent_Com'].iloc[i] = df_depression['Sentiment Scores'].iloc[i]['compound']

df_depression['C_Sent_Neg'] = np.where(df_depression['comment']=='[deleted]', np.NaN, df_depression['C_Sent_Neg'])
df_depression['C_Sent_Neu'] = np.where(df_depression['comment']=='[deleted]', np.NaN, df_depression['C_Sent_Neu'])
df_depression['C_Sent_Pos'] = np.where(df_depression['comment']=='[deleted]', np.NaN, df_depression['C_Sent_Pos'])
df_depression['C_Sent_Com'] = np.where(df_depression['comment']=='[deleted]', np.NaN, df_depression['C_Sent_Com'])
    
    
df_depression_comment_sentiments = df_depression.groupby('p_id')[['C_Sent_Neg', 'C_Sent_Neu', 'C_Sent_Pos', 'C_Sent_Com']].median()

df_depression_post = pd.merge(df_depression_post, df_depression_comment_sentiments, how = 'left', on = 'p_id')
df_depression_post.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,p_id,score,num_comments,p_timestamp,direct_reply_comments,direct_comments_proportion,median_direct_reply_time,min_reply_time,P_Sent_Neg,P_Sent_Neu,P_Sent_Pos,P_Sent_Com,median_comment_length,C_Sent_Neg,C_Sent_Neu,C_Sent_Pos,C_Sent_Com
0,doqwow,2361,180,2019-10-29 20:22:02,70.0,0.388889,111564.125,51.35,0.119,0.615,0.267,0.9973,33.5,0.107,0.626,0.22,0.36045
1,tc5fpx,94,309,2022-03-12 07:03:44,229.0,0.7411,27183.483333,44.316667,0.164,0.618,0.218,0.9047,41.0,0.203,0.569,0.197,0.0
2,u7yta2,294,57,2022-04-20 20:45:30,27.0,0.473684,308.65,7.85,0.206,0.627,0.168,-0.3612,19.0,0.183,0.5485,0.183,0.0
3,u86f8c,16,13,2022-04-21 02:31:17,13.0,1.0,185.533333,1.7,0.366,0.223,0.411,-0.25,13.0,0.067,0.395,0.448,0.4767
4,u7ud1o,74,12,2022-04-20 17:01:14,10.0,0.833333,494.883333,13.133333,0.481,0.519,0.0,-0.5719,17.5,0.244,0.5235,0.202,-0.05135


In [14]:
df_depression_post['Diff_Sent_Neg'] = df_depression_post['C_Sent_Neg'] - df_depression_post['P_Sent_Neg']
df_depression_post['Diff_Sent_Neu'] = df_depression_post['C_Sent_Neu'] - df_depression_post['P_Sent_Neu']
df_depression_post['Diff_Sent_Pos'] = df_depression_post['C_Sent_Pos'] - df_depression_post['P_Sent_Pos']
df_depression_post['Diff_Sent_Com'] = df_depression_post['C_Sent_Com'] - df_depression_post['P_Sent_Com']

df_depression_post.head()

Unnamed: 0,p_id,score,num_comments,p_timestamp,direct_reply_comments,direct_comments_proportion,median_direct_reply_time,min_reply_time,P_Sent_Neg,P_Sent_Neu,...,P_Sent_Com,median_comment_length,C_Sent_Neg,C_Sent_Neu,C_Sent_Pos,C_Sent_Com,Diff_Sent_Neg,Diff_Sent_Neu,Diff_Sent_Pos,Diff_Sent_Com
0,doqwow,2361,180,2019-10-29 20:22:02,70.0,0.388889,111564.125,51.35,0.119,0.615,...,0.9973,33.5,0.107,0.626,0.22,0.36045,-0.012,0.011,-0.047,-0.63685
1,tc5fpx,94,309,2022-03-12 07:03:44,229.0,0.7411,27183.483333,44.316667,0.164,0.618,...,0.9047,41.0,0.203,0.569,0.197,0.0,0.039,-0.049,-0.021,-0.9047
2,u7yta2,294,57,2022-04-20 20:45:30,27.0,0.473684,308.65,7.85,0.206,0.627,...,-0.3612,19.0,0.183,0.5485,0.183,0.0,-0.023,-0.0785,0.015,0.3612
3,u86f8c,16,13,2022-04-21 02:31:17,13.0,1.0,185.533333,1.7,0.366,0.223,...,-0.25,13.0,0.067,0.395,0.448,0.4767,-0.299,0.172,0.037,0.7267
4,u7ud1o,74,12,2022-04-20 17:01:14,10.0,0.833333,494.883333,13.133333,0.481,0.519,...,-0.5719,17.5,0.244,0.5235,0.202,-0.05135,-0.237,0.0045,0.202,0.52055


In [15]:
#removes extra spaces,new lines and divides sentence into words
#lemmatize tells context to the words
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

In [16]:
df_depression['body_lemmatized'] = ''
df_depression['comment_lemmatized'] = ''

for i in range(len(df_depression)):
    df_depression['body_lemmatized'].iloc[i] = lemmatize_text(df_depression['body_sw_p'].iloc[i])
    df_depression['comment_lemmatized'].iloc[i] = lemmatize_text(df_depression['comment_sw_p'].iloc[i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [26]:
neg_feelings = ['sad', 'bitter', 'regret', 'hate', 'hopeless', 'exhausted', 'numb', 'tired',
                'depressed', 'alone', 'lonely', 'isolate', 'shitty', 'failure', 'cry',
                'worthless', 'empty', 'toxic', 'prison', 'torture', 'boring', 'monotonous',
                'sucks', 'pointless', 'nothing', 'unhappy', 'meaningless', 'anxiety', 'invisibile',
                'abusive', 'struggle']

suicide_act = ['kill', 'die', 'rope', 'knife', 'pills', 'hang', 'cut', 'suicide']

goal = ['motivation', 'care', 'achieve', 'happy', 'strong', 'proud', 'socialize', 'hope',
        'excited', 'bath', 'shower', 'family', 'help', 'improvement', 'flush']

medical = ['antidepressant', 'therapy', 'therapy', 'therapist', 'psychiatrist', 'medicate', 'medicine']

fps = ['i', "i'm", 'im', 'me', 'myself', 'my']



neg_feelings = " ".join(neg_feelings)
suicide_act = " ".join(suicide_act)
goal = " ".join(goal)
medical = " ".join(medical)
fps = " ".join(fps)

In [27]:
df_depression['C_Neg_Feelings'] = 0 
df_depression['C_Suicide_Act'] = 0
df_depression['C_Goal'] = 0
df_depression['C_Medical'] = 0
df_depression['C_FPS'] = 0

df_depression['P_Neg_Feelings'] = 0 
df_depression['P_Suicide_Act'] = 0
df_depression['P_Goal'] = 0
df_depression['P_Medical'] = 0
df_depression['P_FPS'] = 0

In [28]:
cols_c = ['C_Neg_Feelings', 'C_Suicide_Act', 'C_Goal', 'C_Medical', 'C_FPS']
cols_p = ['P_Neg_Feelings', 'P_Suicide_Act', 'P_Goal', 'P_Medical', 'P_FPS']

lists = [neg_feelings, suicide_act, goal, medical, fps]


for i in range(len(cols_c)):
    for j in range(len(df_depression['comment_lemmatized'])):
        
        count = 0
        for k in range(len(df_depression['comment_lemmatized'].iloc[j])):
            if(df_depression['comment_lemmatized'].iloc[j][k].lower() in lists[i]):
                count += 1
        if(len(df_depression['comment_lemmatized'].iloc[j])==0):
            df_depression[cols_c[i]].iloc[j] = 0
        else:
            df_depression[cols_c[i]].iloc[j] = count/len(df_depression['comment_lemmatized'].iloc[j])

            
for i in range(len(cols_p)):
    for j in range(len(df_depression['body_lemmatized'])):
        
        count = 0
        for k in range(len(df_depression['body_lemmatized'].iloc[j])):
            if(df_depression['body_lemmatized'].iloc[j][k].lower() in lists[i]):
                count += 1
        if(len(df_depression['body_lemmatized'].iloc[j])==0):
            df_depression[cols_p[i]].iloc[j] = 0
        else:
            df_depression[cols_p[i]].iloc[j] = count/len(df_depression['body_lemmatized'].iloc[j])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [20]:
df_depression_frequencies = df_depression.groupby('p_id')['C_Neg_Feelings', 'C_Suicide_Act', 'C_Goal', 
                                                          'C_Medical', 'C_FPS', 'P_Neg_Feelings', 'P_Suicide_Act', 
                                                          'P_Goal', 'P_Medical', 'P_FPS'].median()

df_depression_post = pd.merge(df_depression_post, df_depression_frequencies, how = 'left', on = 'p_id')

  df_depression_frequencies = df_depression.groupby('p_id')['C_Neg_Feelings', 'C_Suicide_Act', 'C_Goal',


In [21]:
df_depression_post.head(5)

Unnamed: 0,p_id,score,num_comments,p_timestamp,direct_reply_comments,direct_comments_proportion,median_direct_reply_time,min_reply_time,P_Sent_Neg,P_Sent_Neu,...,C_Neg_Feelings,C_Suicide_Act,C_Goal,C_Medical,C_FPS,P_Neg_Feelings,P_Suicide_Act,P_Goal,P_Medical,P_FPS
0,doqwow,2361,180,2019-10-29 20:22:02,70.0,0.388889,111564.125,51.35,0.119,0.615,...,0.1,0.044949,0.086582,0.030303,0.030303,0.016317,0.02331,0.025641,0.002331,0.0
1,tc5fpx,94,309,2022-03-12 07:03:44,229.0,0.7411,27183.483333,44.316667,0.164,0.618,...,0.176471,0.096386,0.153846,0.096774,0.121951,0.008547,0.017094,0.012821,0.008547,0.0
2,u7yta2,294,57,2022-04-20 20:45:30,27.0,0.473684,308.65,7.85,0.206,0.627,...,0.166667,0.048368,0.088178,0.029608,0.0,0.088235,0.029412,0.117647,0.029412,0.088235
3,u86f8c,16,13,2022-04-21 02:31:17,13.0,1.0,185.533333,1.7,0.366,0.223,...,0.142857,0.0,0.111111,0.0,0.0,0.153846,0.230769,0.153846,0.153846,0.153846
4,u7ud1o,74,12,2022-04-20 17:01:14,10.0,0.833333,494.883333,13.133333,0.481,0.519,...,0.168182,0.064394,0.071795,0.0,0.0,0.428571,0.285714,0.285714,0.285714,0.285714


In [22]:
df_depression_post.shape

(983, 31)

In [23]:
df_depression.head(5)

Unnamed: 0,title,score,p_id,subreddit,url,num_comments,body,p_timestamp,c_id,comment,...,C_Neg_Feelings,C_Suicide_Act,C_Goal,C_Medical,C_FPS,P_Neg_Feelings,P_Suicide_Act,P_Goal,P_Medical,P_FPS
0,Our most-broken and least-understood rules is ...,2361,doqwow,depression,https://www.reddit.com/r/depression/comments/d...,180,We understand that most people who reply immed...,2019-10-29 20:22:02,f5pot56,Understood and I apologise if I forget in the ...,...,0.285714,0.285714,0.285714,0.285714,0.285714,0.016317,0.02331,0.025641,0.002331,0.0
1,Our most-broken and least-understood rules is ...,2361,doqwow,depression,https://www.reddit.com/r/depression/comments/d...,180,We understand that most people who reply immed...,2019-10-29 20:22:02,f5pot7j,I agree wholeheartedly. When you're strugglin...,...,0.087591,0.072993,0.072993,0.065693,0.058394,0.016317,0.02331,0.025641,0.002331,0.0
2,Our most-broken and least-understood rules is ...,2361,doqwow,depression,https://www.reddit.com/r/depression/comments/d...,180,We understand that most people who reply immed...,2019-10-29 20:22:02,f647tsy,Biggest Problem on private talks may be that y...,...,0.1,0.033333,0.033333,0.033333,0.0,0.016317,0.02331,0.025641,0.002331,0.0
3,Our most-broken and least-understood rules is ...,2361,doqwow,depression,https://www.reddit.com/r/depression/comments/d...,180,We understand that most people who reply immed...,2019-10-29 20:22:02,f5pnusx,I have to agree with this. I know that people ...,...,0.148936,0.06383,0.106383,0.06383,0.06383,0.016317,0.02331,0.025641,0.002331,0.0
4,Our most-broken and least-understood rules is ...,2361,doqwow,depression,https://www.reddit.com/r/depression/comments/d...,180,We understand that most people who reply immed...,2019-10-29 20:22:02,f5pq8wf,Great rule! I’ve never thought about things yo...,...,0.125,0.046053,0.098684,0.026316,0.026316,0.016317,0.02331,0.025641,0.002331,0.0


In [24]:
df_depression.shape

(3903, 41)

In [25]:
df_depression_post.to_csv('depression_threads.csv')
df_depression.to_csv('depression_comments.csv')