In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [2]:
df = pd.read_csv('../data/Suicide_Detection.csv')
df.drop(columns=['Unnamed: 0'], inplace=True)
df

Unnamed: 0,text,class
0,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,Am I weird I don't get affected by compliments...,non-suicide
2,Finally 2020 is almost over... So I can never ...,non-suicide
3,i need helpjust help me im crying so hard,suicide
4,"I’m so lostHello, my name is Adam (16) and I’v...",suicide
...,...,...
232069,If you don't like rock then your not going to ...,non-suicide
232070,You how you can tell i have so many friends an...,non-suicide
232071,pee probably tastes like salty tea😏💦‼️ can som...,non-suicide
232072,The usual stuff you find hereI'm not posting t...,suicide


In [3]:
# Text Preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [4]:
# Text cleaning
res = df.text

docs = []

for i in range(0, len(res)):
    # Remove URL
    txt = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))','', str(res[i]))
    # Also remove hashtags since in reddit they are a kind of link
    txt = re.sub(r'#([^\s]+)', '', txt)
    # Remove all the special characters
    txt = re.sub(r'\W', ' ', txt)
    # Remove numeric
    txt = re.sub(r"\d+", "", txt)
    # There are words that are glueTogetherSinceRedditCommentFormattingIsWeird
    match = re.search(r'[a-z][A-Z]', txt)
    if match:
        idx = match.start()
        txt = txt[:idx+1]+' '+txt[idx+1:]
    # remove all single characters
    txt = re.sub(r'\s+[a-zA-Z]\s+', ' ', txt)
    # Remove single characters from the start
    txt = re.sub(r'\^[a-zA-Z]\s+', ' ', txt) 
    # Substituting multiple spaces with single space
    txt = re.sub(r'\s+', ' ', txt, flags=re.I)
    # Converting to Lowercase
    txt = txt.lower()
    # Remove whitespace at begin and end string
    txt = txt.strip()
    
    docs.append(txt)

out = docs
out

['ex wife threatening suicide recently left my wife for good because she has cheated on me twice and lied to me so much that have decided to refuse to go back to her as of few days ago she began threatening suicide have tirelessly spent these paat few days talking her out of it and she keeps hesitating because she wants to believe ll come back know lot of people will threaten this in order to get their way but what happens if she really does what do do and how am supposed to handle her death on my hands still love my wife but cannot deal with getting cheated on again and constantly feeling insecure m worried today may be the day she does it and hope so much it doesn happen',
 'am weird don get affected by compliments if it coming from someone know irl but feel really good when internet strangers do it',
 'finally is almost over so can never hear has been bad year ever again swear to fucking god it so annoying',
 'i need helpjust help me im crying so hard',
 'i so lost hello my name is 

In [5]:
# Remove stopwords
nltk.download('stopwords')
stopwords = stopwords.words('english')

docs1 = []
for i in range(0, len(out)):
    text = out[i].split()
    text = [word for word in text if word not in stopwords]
    text = ' '.join(text)
    docs1.append(text)

out2 = docs1

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\84359\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
out2

['ex wife threatening suicide recently left wife good cheated twice lied much decided refuse go back days ago began threatening suicide tirelessly spent paat days talking keeps hesitating wants believe come back know lot people threaten order get way happens really supposed handle death hands still love wife cannot deal getting cheated constantly feeling insecure worried today may day hope much happen',
 'weird get affected compliments coming someone know irl feel really good internet strangers',
 'finally almost never hear bad year ever swear fucking god annoying',
 'need helpjust help im crying hard',
 'lost hello name adam struggling years afraid past years thoughts suicide fear anxiety close limit quiet long scared come family feelings years ago losing aunt triggered everyday feeling hopeless lost guilty remorseful things done life thoughts like little experienced life time revealed feelings family broke saw cuts watching get worried something portrayed average day made feel absolu

In [7]:
# Stemming
stemmer = PorterStemmer()

docs2 = []
for i in range(0, len(out2)):
    text = out2[i].split()
    text = [stemmer.stem(word) for word in text]
    text = ' '.join(text)
    docs2.append(text)

In [8]:
out3 = docs2
out3

['ex wife threaten suicid recent left wife good cheat twice lie much decid refus go back day ago began threaten suicid tirelessli spent paat day talk keep hesit want believ come back know lot peopl threaten order get way happen realli suppos handl death hand still love wife cannot deal get cheat constantli feel insecur worri today may day hope much happen',
 'weird get affect compliment come someon know irl feel realli good internet stranger',
 'final almost never hear bad year ever swear fuck god annoy',
 'need helpjust help im cri hard',
 'lost hello name adam struggl year afraid past year thought suicid fear anxieti close limit quiet long scare come famili feel year ago lose aunt trigger everyday feel hopeless lost guilti remors thing done life thought like littl experienc life time reveal feel famili broke saw cut watch get worri someth portray averag day made feel absolut dread later found attempt survivor attempt od overdos pill attempt hang happen blackout pill never went noos s

In [9]:
# Export clean text
df['clean_text'] = out3
df.to_csv('../data/clean_text.csv',index=False)