## Data General Exploration 

In [1]:
import pandas as pd

# load data
df = pd.read_csv('SuicideAndDepression_Detection.csv')

In [2]:
# show first 10 rows
df.head(10)

Unnamed: 0,text,class
0,Does life actually work for most / non-depress...,depression
1,I found my friend's bodyIt was almost nine yea...,depression
2,Ex Wife Threatening SuicideRecently I left my ...,SuicideWatch
3,Am I weird I don't get affected by compliments...,teenagers
4,Finally 2020 is almost over... So I can never ...,teenagers
5,"Reddit, I've never opened up to anyone with my...",depression
6,Somebody help me.I just had a terrible episode...,depression
7,I can't do this anymoreI've hidden away all su...,depression
8,i need helpjust help me im crying so hard,SuicideWatch
9,"I’m so lostHello, my name is Adam (16) and I’v...",SuicideWatch


In [3]:
# show last 10 rows
df.tail(10)

Unnamed: 0,text,class
348114,I did something today I went sledding with my ...,teenagers
348115,when you just need a hugI've been really depre...,depression
348116,If you don't like rock then your not going to ...,teenagers
348117,I drink... A lot. And I like itI'm only 19 yea...,depression
348118,Sleep cycle does not existOn my days off I usu...,depression
348119,You how you can tell i have so many friends an...,teenagers
348120,pee probably tastes like salty tea😏💦‼️ can som...,teenagers
348121,The usual stuff you find hereI'm not posting t...,SuicideWatch
348122,"I confronted my mother. Extremely isolated, wi...",depression
348123,I still haven't beaten the first boss in Hollo...,teenagers


In [4]:
# show data dimensions
df.shape

(348124, 2)

In [5]:
# show data types
df.dtypes

text     object
class    object
dtype: object

In [6]:
# show overall data description
df.describe()

Unnamed: 0,text,class
count,348123,348110
unique,348123,3
top,Does life actually work for most / non-depress...,SuicideWatch
freq,1,116037


In [7]:
# check value counts of 3 classes
df['class'].value_counts()

SuicideWatch    116037
teenagers       116037
depression      116036
Name: class, dtype: int64

In [8]:
# check missing values
df.isnull().sum()

text      1
class    14
dtype: int64

In [9]:
# check duplicates
df.duplicated().sum()

0

## Data Cleaning

In [10]:
# change class label 
df['class'] = df['class'].replace({'teenagers' : 'normal', 'SuicideWatch' : 'suicide'})
df['class'].value_counts()

suicide       116037
normal        116037
depression    116036
Name: class, dtype: int64

In [11]:
# drop missing values
df = df.dropna()
df.isnull().sum()

text     0
class    0
dtype: int64

In [12]:
# drop duplicated although detected 0 duplicate (double check)
df = df.drop_duplicates()
df.duplicated().sum()

0

## Text Cleaning (Text Column)

In [13]:
import pandas as pd
import re
import contractions
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ChokJoe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ChokJoe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
# change apostrophe 
def replace_apostrophe(text):
    return text.replace("’", "'")

# lowercase text 
def lowercase_text(text):
    return text.lower()

df['text'] = df['text'].apply(lowercase_text)
df['text'] = df['text'].apply(replace_apostrophe)
df.head(10)


Unnamed: 0,text,class
0,does life actually work for most / non-depress...,depression
1,i found my friend's bodyit was almost nine yea...,depression
2,ex wife threatening suiciderecently i left my ...,suicide
3,am i weird i don't get affected by compliments...,normal
4,finally 2020 is almost over... so i can never ...,normal
5,"reddit, i've never opened up to anyone with my...",depression
6,somebody help me.i just had a terrible episode...,depression
7,i can't do this anymorei've hidden away all su...,depression
8,i need helpjust help me im crying so hard,suicide
9,"i'm so losthello, my name is adam (16) and i'v...",suicide


In [15]:
import pandas as pd
import wordninja
import re

# Apply word segmentation tot he 'text' column in the DataFrame
df['text'] = df['text'].apply(lambda x: ' '.join(wordninja.split(x)))
df.head(10)

Unnamed: 0,text,class
0,does life actually work for most non depressed...,depression
1,i found my friend's body it was almost nine ye...,depression
2,ex wife threatening suicide recently i left my...,suicide
3,am i weird i don't get affected by compliments...,normal
4,finally 2020 is almost over so i can never hea...,normal
5,reddit i've never opened up to anyone with my ...,depression
6,somebody help me i just had a terrible episode...,depression
7,i can't do this anymore i've hidden away all s...,depression
8,i need help just help me im crying so hard,suicide
9,i'm so lost hello my name is adam 16 and i've ...,suicide


In [16]:
# expand the words like cnnt to cannot / ve to have and so on
def expand_contractions(text):

    expanded_text = contractions.fix(text)
    return expanded_text

df['text'] = df['text'].apply(expand_contractions)
df.head(10)


Unnamed: 0,text,class
0,does life actually work for most non depressed...,depression
1,i found my friend's body it was almost nine ye...,depression
2,ex wife threatening suicide recently i left my...,suicide
3,am i weird i do not get affected by compliment...,normal
4,finally 2020 is almost over so i can never hea...,normal
5,reddit i have never opened up to anyone with m...,depression
6,somebody help me i just had a terrible episode...,depression
7,i cannot do this anymore i have hidden away al...,depression
8,i need help just help me i am crying so hard,suicide
9,i am so lost hello my name is adam 16 and i ha...,suicide


In [17]:
# Remove words equal to or less than 2 characters in length
def remove_short_words(text):
    return ' '.join([word for word in text.split() if len(word) > 2])

# remove symbols
def remove_symbols(text):
    symbol_pattern = re.compile(r'[\(\)\[\]:]')
    return symbol_pattern.sub('', text)

# remove symbols and digits
def remove_symbols_digits(text):
    return re.sub('[^a-zA-Z\s]', ' ', text)

# remove emoji
def remove_emoji(text):
    return re.sub(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\U0001FB00-\U0001FBFF\U0001FE00-\U0001FE0F\U0001F004]+', '', text)

# remove URLs
def remove_urls(text):
    return re.sub(r'http\S+', '', text)

# remove HTML tags
def remove_html_tags(text):
    return re.sub(r'<[^>]+>', '', text)

# remove extra whitespace
def remove_whitespace(text):
    return ' '.join(text.split())

# remove punctuation
def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)

# remove stopwords
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    return ' '.join([token for token in text.split() if token.lower() not in stop_words])

# lemmatizing text
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(token) for token in text.split()])

# Apply preprocessing techniques sequentially
df['text'] = df['text'].apply(remove_short_words)
df['text'] = df['text'].apply(remove_html_tags)
df['text'] = df['text'].apply(remove_urls)
df['text'] = df['text'].apply(remove_emoji)
df['text'] = df['text'].apply(remove_symbols_digits)
df['text'] = df['text'].apply(remove_symbols)
df['text'] = df['text'].apply(remove_punctuation)
df['text'] = df['text'].apply(remove_stopwords)
df['text'] = df['text'].apply(lemmatize_text)
df['text'] = df['text'].apply(remove_whitespace)


In [18]:
df.head(10)

Unnamed: 0,text,class
0,life actually work non depressed people seem p...,depression
1,found friend body almost nine year ago still t...,depression
2,wife threatening suicide recently left wife go...,suicide
3,weird get affected compliment coming someone k...,normal
4,finally almost never hear bad year ever swear ...,normal
5,reddit never opened anyone life problem much h...,depression
6,somebody help terrible episode tonight feel ho...,depression
7,cannot anymore hidden away summer room cannot ...,depression
8,need help help cry hard,suicide
9,lost hello name adam struggling year afraid pa...,suicide


## Tokenize 

In [19]:
from nltk.tokenize import word_tokenize
nltk.download('punkt')

# tokenize text
def tokenize_text(text):
    tokens = word_tokenize(text)
    return tokens

# Apply tokenization to the text column in the DataFrame
df['text'] = df['text'].apply(tokenize_text)

df.head(10)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ChokJoe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,text,class
0,"[life, actually, work, non, depressed, people,...",depression
1,"[found, friend, body, almost, nine, year, ago,...",depression
2,"[wife, threatening, suicide, recently, left, w...",suicide
3,"[weird, get, affected, compliment, coming, som...",normal
4,"[finally, almost, never, hear, bad, year, ever...",normal
5,"[reddit, never, opened, anyone, life, problem,...",depression
6,"[somebody, help, terrible, episode, tonight, f...",depression
7,"[can, not, anymore, hidden, away, summer, room...",depression
8,"[need, help, help, cry, hard]",suicide
9,"[lost, hello, name, adam, struggling, year, af...",suicide


In [20]:
from nltk.corpus import words
import nltk

# Download the list of English words (if not already downloaded)
nltk.download('words')

# Load the set of English words
english_words = set(words.words())

# List of words to exclude from removal
words_to_exclude = {'fuck'}  # Add your specific words here

# Function to remove non-English words from a list of tokens
def remove_non_english(tokens):
    english_tokens = [
        token if (token in english_words or token in words_to_exclude) else ''
        for token in tokens]
    return [token for token in english_tokens if token != '']

# Apply the function to the 'text' column in the DataFrame
df['text'] = df['text'].apply(remove_non_english)
df.head(10)

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\ChokJoe\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


Unnamed: 0,text,class
0,"[life, actually, work, non, depressed, people,...",depression
1,"[found, friend, body, almost, nine, year, ago,...",depression
2,"[wife, threatening, suicide, recently, left, w...",suicide
3,"[weird, get, affected, compliment, coming, som...",normal
4,"[finally, almost, never, hear, bad, year, ever...",normal
5,"[never, anyone, life, problem, much, help, lit...",depression
6,"[somebody, help, terrible, episode, tonight, f...",depression
7,"[can, not, hidden, away, summer, room, can, no...",depression
8,"[need, help, help, cry, hard]",suicide
9,"[lost, hello, name, struggling, year, afraid, ...",suicide


In [21]:
# join the tokens data back to detoken then check duplicated as tokens data is list cant check duplicated 
df['text'] = df['text'].apply(lambda tokens: ' '.join(tokens))
df.duplicated().sum()

1497

In [22]:
# review duplicated
df[df.duplicated()]

Unnamed: 0,text,class
1662,,normal
4716,anyone want talk,normal
5511,,normal
5880,,normal
6768,,normal
...,...,...
346904,,normal
346920,,normal
347112,horny,normal
347507,anyone want chat want chat,normal


In [23]:
# review one of the duplicate row
df[df['text'] == 'horny']

Unnamed: 0,text,class
139453,horny,normal
347112,horny,normal


In [24]:
#drop duplicates
df = df.drop_duplicates()

In [36]:
# doublecheck missing values as after text cleaning might have blank data
df.isnull().sum()

text     0
class    0
dtype: int64

In [37]:
#check empty string
df[df['text'] == '']

Unnamed: 0,text,class
608,,normal
10377,,depression
11019,,suicide


In [38]:
#filter empty string
df = df[df['text'] != '']
df.shape

(346609, 2)

In [40]:
# save cleaned csv (WITHOUT TOKENIZE)
df.to_csv("cleaned_SuicideAndDepression_detection_without_Token.csv", index=False)

In [41]:
#last check without token data
df.isnull().sum()

text     0
class    0
dtype: int64

In [42]:
#last check without token data
df.duplicated().sum()

0

In [None]:
# tokens the detoken dataset
# df_token = df.copy()
# df_token['text'] = df_token['text'].apply(tokenize_text)

In [31]:
#save cleaned csv (WITH TOKENIZE FOR MODEL BUILDING)
# df_token.to_csv("cleaned_SuicideAndDepression_detection_with_Token.csv", index=False)

## EDA

In [47]:
import pandas as pd

# # load data
df = pd.read_csv('cleaned_SuicideAndDepression_detection_without_Token.csv')