In [None]:
## The dataset is a collection of posts from the "SuicideWatch" and "depression" subreddits of the Reddit platform. The posts were collected using the Pushshift API.
## All posts made to "SuicideWatch" from Dec 16, 2008 (creation) until Jan 2, 2021 were collected, while "depression" posts were collected from Jan 1, 2009 to Jan 2, 2021. All posts collected from SuicideWatch are labeled as suicide, while posts collected from the depression subreddit are labeled as depression.
## Non-suicide posts were collected from r/teenagers.

In [1]:
import pandas as pd

In [16]:
import nltk
import re
from nltk.corpus import stopwords

# English stop words, held in a set so the membership test in
# text_preprocessing is O(1).
# stopwords.words() raises LookupError when the corpus has not been downloaded
# yet (e.g. a fresh environment), so fetch it once and retry instead of
# failing on "Restart & Run All".
try:
    stop_words_nltk = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words_nltk = set(stopwords.words('english'))

In [17]:
import spacy
# The cleaning code in this notebook only reads token.pos_, token.lemma_ and
# the lexical flags is_stop / is_alpha, none of which need the dependency
# parser or the named-entity recogniser. Disabling both makes every nlp(...)
# call noticeably cheaper. `disable` (rather than `exclude`) keeps the
# components loadable via nlp.enable_pipe(...) should they be needed later.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [3]:
# Coarse part-of-speech tags (as reported by spaCy's token.pos_) whose tokens
# are dropped during cleaning.
removal = [
    'DET',    # determiner
    'ADP',    # adposition
    'SPACE',  # whitespace
    'NUM',    # numeral
    'SYM',    # symbol
    'AUX',    # auxiliary verb
    'PRON',   # pronoun
    'CCONJ',  # coordinating conjunction
    'PUNCT',  # punctuation
    'PART',   # particle
]

def prepare_tokens_cleaning(lines):
    """Run ``lines`` through the spaCy pipeline and return its cleaned lemmas.

    A token is kept only if its coarse POS tag is not listed in ``removal``,
    it is not a spaCy stop word, and it is purely alphabetic. The lemma of
    every kept token is lower-cased, and the lemmas are joined with single
    spaces.

    Parameters
    ----------
    lines : str
        Raw text to clean.

    Returns
    -------
    str
        Space-separated lower-cased lemmas (empty string if nothing is kept).
    """
    kept_lemmas = []
    for tok in nlp(lines):
        # Skip anything that fails one of the three keep-conditions.
        if tok.pos_ in removal or tok.is_stop or not tok.is_alpha:
            continue
        kept_lemmas.append(tok.lemma_.lower())
    return ' '.join(kept_lemmas)
def remove_noisy_features(tok):
    """Drop tokens shorter than three characters and join the rest.

    Parameters
    ----------
    tok : iterable of str
        Tokens to filter.

    Returns
    -------
    str
        The surviving tokens, in their original order, separated by single
        spaces (empty string if none survive).
    """
    return ' '.join(word for word in tok if len(word) >= 3)

def text_preprocessing(text):
    """Lower-case, tokenise and verb-lemmatise ``text`` with NLTK.

    Steps, in order:
      1. lower-case the text and replace every ASCII apostrophe with a space;
      2. split it with ``nltk.WordPunctTokenizer``;
      3. discard one-character tokens;
      4. lemmatise each remaining token as a verb (``pos='v'``);
      5. discard lemmas that appear in ``stop_words_nltk``.

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    list of str
        The surviving lemmas, in their original order.
    """
    normalised = text.lower().replace("'", ' ')

    tokens = nltk.WordPunctTokenizer().tokenize(normalised)

    # Build the lemmatizer once per call instead of once per token.
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmas = (lemmatizer.lemmatize(tok, pos='v') for tok in tokens if len(tok) > 1)

    # The stop-word check runs on the lemma, not on the surface form.
    return [lemma for lemma in lemmas if lemma not in stop_words_nltk]

In [7]:
# Directory that holds the raw CSV (joined with the file name in the next cell).
# NOTE(review): absolute, machine-specific Windows path — the notebook will not
# run on another machine without editing this line.
file_path = 'D:/depression_suicide_project/Pretrained_embedding_models/dataset/suicide/CSSRS/reddit_sui_vs_dep'

In [8]:
# Full path of the raw dataset inside `file_path`.
file = f'{file_path}/Suicide_Detection.csv'

In [10]:
# Load the raw dataset; the first CSV column is used as the row index.
df = pd.read_csv(file, index_col=0)

In [13]:
# Class balance check: number of rows per label in the `class` column.
df['class'].value_counts()

class
suicide        116037
non-suicide    116037
Name: count, dtype: int64

In [11]:
# Preview the raw frame (pandas shows the first and last rows of a large frame).
df

Unnamed: 0,text,class
2,Ex Wife Threatening SuicideRecently I left my ...,suicide
3,Am I weird I don't get affected by compliments...,non-suicide
4,Finally 2020 is almost over... So I can never ...,non-suicide
8,i need helpjust help me im crying so hard,suicide
9,"I‚Äôm so lostHello, my name is Adam (16) and I‚Äôv...",suicide
...,...,...
348103,If you don't like rock then your not going to ...,non-suicide
348106,You how you can tell i have so many friends an...,non-suicide
348107,pee probably tastes like salty teaüòèüí¶‚ÄºÔ∏è can som...,non-suicide
348108,The usual stuff you find hereI'm not posting t...,suicide


In [18]:
# Build the cleaned `post` column from the raw `text` column, one stage per pass:
#   1. prepare_tokens_cleaning - spaCy: drop unwanted POS tags, stop words and
#      non-alphabetic tokens; return the lower-cased lemmas as one string
#   2. text_preprocessing      - NLTK: tokenise, verb-lemmatise, drop stop
#      words; returns a list of tokens
#   3. remove_noisy_features   - drop tokens shorter than 3 characters and
#      join the rest with spaces
# Stage 2's token list is handed straight to stage 3: its tokens are non-empty
# and contain no whitespace, so joining them into a string only to split it
# again would reproduce the same list.
df['post'] = (
    df['text']
    .apply(prepare_tokens_cleaning)
    .apply(text_preprocessing)
    .apply(remove_noisy_features)
)

In [19]:
# Persist the frame, including the new `post` column, as CSV. The path is
# relative, so the file lands in the current working directory.
# NOTE(review): the file name says "suicide_vs_depression", but the `class`
# labels counted above are suicide / non-suicide — confirm the name is intended.
# NOTE(review): the reload cell further down reads this file name from an
# absolute path; verify both refer to the same file.
df.to_csv('processed_dataset_reddit_suicide_vs_depression.csv')

In [5]:
# Reload the processed dataset from disk; the first CSV column is used as the
# row index.
# NOTE(review): absolute, machine-specific Windows path — not portable.
# NOTE(review): read_csv parses empty fields as NaN by default, so any row whose
# cleaned `post` was an empty string presumably comes back as NaN (float)
# rather than '' — verify before applying string operations to `post`.
df = pd.read_csv('D:/depression_suicide_project/Pretrained_embedding_models/dataset/suicide/CSSRS/processed_dataset_reddit_suicide_vs_depression.csv', index_col=0)

In [6]:
# Preview the reloaded frame, which now carries the cleaned `post` column.
df

Unnamed: 0,text,class,post
2,Ex Wife Threatening SuicideRecently I left my ...,suicide,wife threaten suiciderecently leave wife good ...
3,Am I weird I don't get affected by compliments...,non-suicide,weird affect compliment come know irl feel goo...
4,Finally 2020 is almost over... So I can never ...,non-suicide,finally hear bad year swear fuck god annoy
8,i need helpjust help me im crying so hard,suicide,need helpjust help cry hard
9,"I‚Äôm so lostHello, my name is Adam (16) and I‚Äôv...",suicide,losthello adam struggle year afraid past year ...
...,...,...,...
348103,If you don't like rock then your not going to ...,non-suicide,like rock
348106,You how you can tell i have so many friends an...,non-suicide,tell friend lonely deprive pre buy little nigh...
348107,pee probably tastes like salty teaüòèüí¶‚ÄºÔ∏è can som...,non-suicide,pee probably taste salty tea drink pee confirm
348108,The usual stuff you find hereI'm not posting t...,suicide,usual stuff find post sympathy pity know far b...
