In [1]:
import pandas as pd
import random

In [2]:
# Dataset locations and constants shared by the cells below.
# Alternative home-relative location, kept for reference (requires `import os`):
# CORPUS_DIR = os.path.expanduser('~') + '/Documents/mental_health/suicide/'
# Raw strings keep the Windows backslashes literal instead of relying on
# unrecognised escape sequences such as '\d' or '\P'.
CORPUS_DIR = r"D:\depression_suicide_project\Pretrained_embedding_models\dataset\suicide"
CORPUS_NAME = 'Suicide_Detection.csv'
# CORPUS_DIR has no trailing separator, so one is inserted explicitly here.
CORPUS_PATH = CORPUS_DIR + '\\' + CORPUS_NAME
PROCESSED_CORPUS = 'Suicide_Detection_processed.csv'
CSSR_DATASET = '500_Reddit_users_posts_labels.csv'
CSSR_DIR = CORPUS_DIR + r'\CSSRS'
BEST_ENTITIES = ['high school', 'mental health', 'best friend', 'feel like', 'really want', 'suicide thought',
                 'friend family']
# CSSR_CAT[i] is the label given to rows generated from CSSR_FILES[i].
CSSR_CAT = ['Indicator', 'Attempt','Behavior','Ideation']
CSSR_FILES = ['suicidal_indicator.csv', 'suicidal_attempt.csv', 'suicidal_behavior.csv', 'suicidal_ideation.csv']


In [3]:
def generate_dataset(classes, res):
    """Turn groups of text fragments into a labelled two-column frame.

    Parameters
    ----------
    classes : str
        Label written into every row of the ``Label`` column.
    res : iterable of iterable of str
        Each inner iterable is concatenated (no separator) into one ``Post``.

    Returns
    -------
    pandas.DataFrame
        One row per element of ``res`` with columns ``Post`` and ``Label``.
    """
    rows = [(''.join(fragments), classes) for fragments in res]
    return pd.DataFrame(rows, columns=['Post', 'Label'])

def generate_samples(sentence_num, size):
    """Build a shuffled, labelled frame of synthetic posts from the CSSR files.

    For every file in ``CSSR_FILES`` the CSV's column names are used as the
    text fragments: they are cut into consecutive chunks of ``sentence_num``
    names, each chunk becomes one post via ``generate_dataset``, and the
    chunking is repeated a fixed number of times per class with the column
    names reshuffled between repetitions.

    Parameters
    ----------
    sentence_num : int
        Number of column names joined into a single post.
    size : int
        Kept for backward compatibility only. The value is ignored: the
        number of repetitions is fixed per class (see ``repetitions`` below).

    Returns
    -------
    pandas.DataFrame
        Columns ``Post`` and ``Label`` with rows in random order. The index
        is not reset, so index values repeat across chunks.

    Notes
    -----
    ``random.shuffle`` and ``DataFrame.sample`` are called without a seed, so
    the output differs between runs unless the caller seeds both generators.
    """
    # Repetitions per file, in CSSR_FILES order; chosen to make the classes
    # evenly distributed. Any file past the third uses the fallback value.
    repetitions = (5 * 2, 48 * 2, 42 * 2)
    fallback_repetitions = 14 * 2

    # Collect the pieces and concatenate once; concatenating inside the loop
    # re-copies every accumulated row on each iteration.
    pieces = []
    for index, file_name in enumerate(CSSR_FILES):
        frame = pd.read_csv(CSSR_DIR + '\\' + file_name, index_col=0)
        col_name = list(frame.columns)
        rounds = repetitions[index] if index < len(repetitions) else fallback_repetitions
        for _ in range(rounds):
            chunks = [col_name[start:start + sentence_num]
                      for start in range(0, len(col_name), sentence_num)]
            pieces.append(generate_dataset(CSSR_CAT[index], chunks))
            # Reshuffle after chunking so the first round keeps file order.
            random.shuffle(col_name)

    if not pieces:
        return pd.DataFrame()
    return pd.concat(pieces).sample(frac=1)


def generate_dataframe(sent_size, size):
    """Return the labelled sample frame built by ``generate_samples``.

    ``sent_size`` and ``size`` are forwarded unchanged.
    """
    # Merging with the main dataset and adding a label-encoded 'code' column
    # were previously sketched here as commented-out code and remain disabled.
    return generate_samples(sent_size, size)

In [4]:
# Generate the synthetic CSSR posts, 20 fragments per post.
f = generate_dataframe(sent_size=20, size=100)

FileNotFoundError: [Errno 2] No such file or directory: 'D:\\depression_suicide_project\\Pretrained_embedding_models\\dataset\\suicide\\CSSRS\\suicidal_indicator.csv'

In [55]:
# Number of generated rows per label.
f['Label'].value_counts()

Label
Indicator    770
Ideation     672
Behavior     672
Attempt      672
Name: count, dtype: int64

In [56]:
# Show the generated frame.
f

Unnamed: 0,Post,Label
20,Epileptic dementia with behavioral disturbanc...,Ideation
58,Fibrous tissue neoplasm of skin Haemophilus i...,Indicator
55,Mechanical complication due to skin graft fai...,Indicator
14,Hysterical blindness Right temporal atrophy v...,Ideation
13,thought about taking an overdose but I never ...,Ideation
...,...,...
0,potential injury or harm Doing risky things I...,Behavior
3,Comatose with reflexes intact Medical hospita...,Attempt
66,Cystic dermoid choristoma of skin of back Ame...,Indicator
4,institutionalized Suicide while incarcerated ...,Attempt


In [57]:
import nltk
import re
from nltk.corpus import stopwords
# English stop words from NLTK, stored as a set; text_preprocessing filters against it.
stop_words_nltk = set(stopwords.words('english'))

In [58]:
import spacy
# spaCy pipeline that prepare_tokens_cleaning uses for POS tags, lemmas and stop-word flags.
nlp = spacy.load('en_core_web_sm')

In [59]:
# Part-of-speech tags whose tokens are discarded during cleaning.
removal = [
    'DET', 'ADP', 'SPACE', 'NUM', 'SYM',
    'AUX', 'PRON', 'CCONJ', 'PUNCT', 'PART',
]

def prepare_tokens_cleaning(lines):
    """Return the lower-cased lemmas of the tokens worth keeping, space-joined.

    A token is kept only if its part-of-speech tag is not listed in
    ``removal``, it is not a stop word, and it is purely alphabetic.
    """
    kept_lemmas = []
    for token in nlp(lines):
        if token.pos_ in removal:
            continue
        if token.is_stop or not token.is_alpha:
            continue
        kept_lemmas.append(token.lemma_.lower())
    return ' '.join(kept_lemmas)
def remove_noisy_features(tok):
    """Join the given tokens with spaces, skipping any shorter than 3 characters."""
    return ' '.join(word for word in tok if len(word) > 2)

def text_preprocessing(text):
    """Lower-case, tokenize and verb-lemmatize ``text``, dropping stop words.

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    list of str
        Lemmas (lemmatized with ``pos='v'``) of the tokens longer than one
        character, excluding any lemma found in ``stop_words_nltk``.
    """
    # Convert words to lower case and replace apostrophes with spaces
    text = text.lower()
    text = re.sub(r'\'', ' ', text)

    # Tokenize each word
    tokens = nltk.WordPunctTokenizer().tokenize(text)

    # Build the lemmatizer once instead of once per token.
    lemmatizer = nltk.stem.WordNetLemmatizer()

    # The length filter applies to the raw token, the stop-word filter to the lemma.
    lemmas = (lemmatizer.lemmatize(word, pos='v') for word in tokens if len(word) > 1)
    return [lemma for lemma in lemmas if lemma not in stop_words_nltk]

In [61]:
# Work on a copy so that column assignments on `df` do not also modify `f`.
df = f.copy()

In [62]:
# Run the three cleaning stages on every raw post: spaCy-based filtering and
# lemmatization, removal of tokens shorter than 3 characters, then NLTK
# lemmatization with stop-word removal.
cleaned_posts = (
    df['Post']
    .apply(prepare_tokens_cleaning)
    .apply(lambda cleaned: remove_noisy_features(cleaned.split()))
    .apply(lambda cleaned: ' '.join(text_preprocessing(cleaned)))
)
# Replace the raw text with the cleaned text without touching the source
# frame; dropping first and re-adding keeps 'Post' as the last column.
df = df.drop(columns=['Post']).assign(Post=cleaned_posts)

In [63]:
# Show the cleaned frame.
df

Unnamed: 0,Label,Post
20,Ideation,epileptic dementia behavioral disturbance comm...
58,Indicator,fibrous tissue neoplasm skin haemophilus infec...
55,Indicator,mechanical complication skin graft failure rej...
14,Ideation,hysterical blindness right temporal atrophy va...
13,Ideation,think take overdose specific plan intend carry...
...,...,...
0,Behavior,potential injury harm risky thing intentional ...
3,Attempt,comatose reflex intact medical hospitalization...
66,Indicator,cystic dermoid choristoma skin amebic infectio...
4,Attempt,institutionalize suicide incarcerate kill aliv...


In [64]:
# Load the previously processed dataset from the CSSR directory, reusing the
# path constants instead of repeating the absolute path.
df_p = pd.read_csv(CSSR_DIR + '\\' + PROCESSED_CORPUS, index_col=0)

In [65]:
# Show the previously processed dataset.
df_p

Unnamed: 0,Label,code,Post
0,Supportive,4,viable option leave wife pain comprehension su...
1,Ideation,2,hard appreciate notion meet happy deeply love ...
2,Behavior,1,night sit ledge window contemplating jump dad ...
3,Attempt,0,try kill self fail badly cause moment realize ...
4,Ideation,2,sort thing personally welcome music suggestion...
...,...,...,...
495,Supportive,4,end feel way entire lifetime fix thing persona...
496,Indicator,3,skype end ventricular dysfunction leave good n...
497,Supportive,4,sound weird maybe distractibility sound hypera...
498,Attempt,0,know dumb sound feel hyperactive behavior dese...


In [66]:
# Stack the generated rows on top of the previously processed dataset.
# Columns present in only one frame are filled with NaN in the other.
new_df = pd.concat(objs=[df, df_p], axis=0)

In [67]:
# Show the combined frame.
new_df

Unnamed: 0,Label,Post,code
20,Ideation,epileptic dementia behavioral disturbance comm...,
58,Indicator,fibrous tissue neoplasm skin haemophilus infec...,
55,Indicator,mechanical complication skin graft failure rej...,
14,Ideation,hysterical blindness right temporal atrophy va...,
13,Ideation,think take overdose specific plan intend carry...,
...,...,...,...
495,Supportive,end feel way entire lifetime fix thing persona...,4.0
496,Indicator,skype end ventricular dysfunction leave good n...,3.0
497,Supportive,sound weird maybe distractibility sound hypera...,4.0
498,Attempt,know dumb sound feel hyperactive behavior dese...,0.0


In [70]:
# Drop the 'code' column. errors='ignore' keeps this cell re-runnable once
# the column has already been removed.
new_df = new_df.drop(columns=['code'], errors='ignore')

In [73]:
# Write the combined frame (index included) into the CSSR directory.
combined_twitter_csv = CSSR_DIR + '\\' + 'combined_CSSR_twitter_dataset.csv'
new_df.to_csv(combined_twitter_csv)

In [74]:
# Show the combined frame after the 'code' column was dropped.
new_df

Unnamed: 0,Label,Post
20,Ideation,epileptic dementia behavioral disturbance comm...
58,Indicator,fibrous tissue neoplasm skin haemophilus infec...
55,Indicator,mechanical complication skin graft failure rej...
14,Ideation,hysterical blindness right temporal atrophy va...
13,Ideation,think take overdose specific plan intend carry...
...,...,...
495,Supportive,end feel way entire lifetime fix thing persona...
496,Indicator,skype end ventricular dysfunction leave good n...
497,Supportive,sound weird maybe distractibility sound hypera...
498,Attempt,know dumb sound feel hyperactive behavior dese...


In [76]:
# Write the same combined frame again under a second file name.
combined_csv = CSSR_DIR + '\\' + 'combined_dataset.csv'
new_df.to_csv(combined_csv)