# 1. Data Cleaning
This notebook cleans the raw text of the Suicide Detection dataset and exports a preprocessed copy.

In [None]:
# Mount Google Drive at /content/drive so the project folder used later is reachable.
# force_remount=True is passed so that re-running this cell mounts the drive again.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
!pip install -qqq unidecode contractions pyspellchecker wordninja symspellpy emoji

In [None]:
# imports data analysis packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# NLP Imports
import nltk
# Fetch only the NLTK resources this notebook uses instead of the whole 'all'
# collection: 'stopwords' for the stop-word list, and 'wordnet' (plus 'omw-1.4',
# which newer NLTK releases require alongside it) for WordNetLemmatizer.
# quiet=True keeps the download log out of the notebook output.
for _nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource, quiet=True)
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
import emoji

from sklearn.feature_extraction.text import CountVectorizer

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
import wordninja

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import tensorflow as tf

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    |   Package bcp47 is already up-to-dat

In [None]:
# Switch the working directory to the project folder on the mounted drive so
# that the relative file names used later resolve inside it.
# Edit project_dir to point at your own copy of the project.
project_dir = "/content/drive/MyDrive/MyProject_SIDetection"
try:
    os.chdir(project_dir)
except OSError:
    print("Error: Can't change the Current Working Directory")
else:
    print("Directory changed")

Directory changed


## Load dataset

In [None]:
# Read the raw CSV, using its first column as the index on load, then swap
# that index for a fresh 0..n-1 one and preview the first rows.
data = pd.read_csv('Suicide_Detection.csv', index_col=0)
data = data.reset_index(drop=True)
data.head()

Unnamed: 0,text,class
0,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,Am I weird I don't get affected by compliments...,non-suicide
2,Finally 2020 is almost over... So I can never ...,non-suicide
3,i need helpjust help me im crying so hard,suicide
4,"I’m so lostHello, my name is Adam (16) and I’v...",suicide


In [None]:
# Inspect column dtypes and non-null counts to check for missing data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232074 entries, 0 to 232073
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    232074 non-null  object
 1   class   232074 non-null  object
dtypes: object(2)
memory usage: 3.5+ MB


In [None]:
# Cast every entry of 'text' to str so the .str-based cleaning below applies to all rows
data['text'] = data['text'].astype(str)

In [None]:
# Strip social-media artefacts from the raw text. The patterns are applied in
# order and BEFORE punctuation is removed, because they rely on characters such
# as '/', '@', '#' and '.' still being present.
noise_patterns = [
    r'http\S+',      # URLs (this also matches https://...)
    r'\bwww\.\S+',   # scheme-less URLs; anchored so words like "awwww" are left alone
    r'@\w+',         # mentions
    r'#\w+',         # hashtags
    r'\br/\w+',      # subreddit tags; \b keeps text such as "her/his" intact
    r'\bu/\w+',      # user tags; \b keeps text such as "you/me" intact
]
for pattern in noise_patterns:
    data['text'] = data['text'].str.replace(pattern, '', regex=True)

# Replace every run of non-alphanumeric characters with a single space.
# NOTE(review): this also removes emojis (and periods/commas), so the emoji
# handling applied later in processing_text receives text without any emojis.
# Confirm whether emojis / sentence punctuation are meant to survive this step.
data['text'] = data['text'].str.replace(r'[^0-9a-zA-Z]+', ' ', regex=True)

# Collapse any remaining whitespace runs to one space and trim both ends.
data['text'] = data['text'].str.replace(r'\s+', ' ', regex=True).str.strip()

In [None]:
def remove_emojis(text):
    """Return `text` with every span reported by `emoji.emoji_list` cut out."""
    kept_parts = []
    cursor = 0
    # Walk the matches from left to right, keeping only the text between them.
    for hit in sorted(emoji.emoji_list(text), key=lambda m: m['match_start']):
        kept_parts.append(text[cursor:hit['match_start']])
        cursor = hit['match_end']
    # Whatever follows the last match (or the whole string when there is none).
    kept_parts.append(text[cursor:])
    return ''.join(kept_parts)

# Lazily-built (tokenizer, lemmatizer, stop-word set) shared by every call of
# processing_text, so the stop-word corpus is read once instead of once per row.
_NLP_RESOURCES = None


def _get_nlp_resources():
    """Return the shared (tokenizer, lemmatizer, stop_words) triple, building it on first use."""
    global _NLP_RESOURCES
    if _NLP_RESOURCES is None:
        _NLP_RESOURCES = (
            RegexpTokenizer(r'\w+'),
            WordNetLemmatizer(),
            frozenset(stopwords.words("english")),
        )
    return _NLP_RESOURCES


def processing_text(text):
    """Produce three cleaned variants of `text`.

    Parameters
    ----------
    text : str
        The text to process.

    Returns
    -------
    tuple of (str, str, str)
        clean_text_with_emojis
            Emojis converted to words via `emoji.demojize`, lower-cased,
            tokenized on word characters, English stop words removed and the
            remaining tokens lemmatized; tokens joined with single spaces.
        clean_text_without_emojis
            Same pipeline, but emojis are removed with `remove_emojis`
            instead of being converted to words.
        lightclean_text_with_emojis
            Demojized text that is only tokenized and re-joined: original
            case kept, no stop-word removal, no lemmatization.
    """
    tokenizer, lemmatizer, stop_words_set = _get_nlp_resources()

    # Two views of the input: emojis spelled out as words vs. emojis dropped.
    text_with_emojis = emoji.demojize(text)
    text_without_emojis = remove_emojis(text)

    # Tokenize the lower-cased text for the fully cleaned variants.
    words_with_emojis = tokenizer.tokenize(text_with_emojis.lower())
    words_without_emojis = tokenizer.tokenize(text_without_emojis.lower())

    # Light clean: tokenize only, keeping the original case.
    lightclean = tokenizer.tokenize(text_with_emojis)

    # Drop stop words first, then lemmatize what is left.
    clean_words_with_emojis = [
        lemmatizer.lemmatize(word) for word in words_with_emojis if word not in stop_words_set
    ]
    clean_words_without_emojis = [
        lemmatizer.lemmatize(word) for word in words_without_emojis if word not in stop_words_set
    ]

    clean_text_with_emojis = ' '.join(clean_words_with_emojis)
    clean_text_without_emojis = ' '.join(clean_words_without_emojis)
    lightclean_text_with_emojis = ' '.join(lightclean)

    return clean_text_with_emojis, clean_text_without_emojis, lightclean_text_with_emojis

# Apply processing_text to each row of the 'text' column; every result is a 3-tuple.
results = data['text'].apply(processing_text)

# Unpack the tuples into three separate columns, one per cleaned variant.
(
    data['text_clean_with_emojis'],
    data['text_clean_without_emojis'],
    data['lightclean_text_with_emojis'],
) = zip(*results)

# Preview the input next to the three cleaned variants. The frame is the cell's
# last expression so the notebook renders it as a table rather than printed text.
pd.set_option("display.max_colwidth", 100)
data[['text', 'text_clean_with_emojis', 'text_clean_without_emojis', 'lightclean_text_with_emojis']].head(8)

                                                                                                  text  \
0  Ex Wife Threatening SuicideRecently I left my wife for good because she has cheated on me twice ...   
1  Am I weird I don t get affected by compliments if it s coming from someone I know irl but I feel...   
2  Finally 2020 is almost over So I can never hear 2020 has been a bad year ever again I swear to f...   
3                                                            i need helpjust help me im crying so hard   
4  I m so lostHello my name is Adam 16 and I ve been struggling for years and I m afraid Through th...   
5  Honetly idkI dont know what im even doing here I just feel like there is nothing and nowhere for...   
7                                                         It ends tonight I can t do it anymore I quit   

                                                                                text_clean_with_emojis  \
0  ex wife threatening suiciderecently left w

In [None]:
# EDA showed "filler" to be a meaningless word, so it is removed from both
# fully cleaned columns. Every occurrence of the substring is dropped
# (regex=False makes the literal match explicit). Whitespace is then
# re-normalised, because the removal can leave doubled or leading/trailing spaces.
for column in ('text_clean_without_emojis', 'text_clean_with_emojis'):
    data[column] = (
        data[column]
        .str.replace('filler', '', regex=False)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )

In [None]:
# Raise the column display width so whole posts are visible, then compare the
# input text with 'text_clean_with_emojis' and 'text_clean_without_emojis' for the first two rows
pd.set_option("display.max_colwidth", 1000)
data[['text','text_clean_with_emojis','text_clean_without_emojis']].head(2)

Unnamed: 0,text,text_clean_with_emojis,text_clean_without_emojis
0,Ex Wife Threatening SuicideRecently I left my wife for good because she has cheated on me twice and lied to me so much that I have decided to refuse to go back to her As of a few days ago she began threatening suicide I have tirelessly spent these paat few days talking her out of it and she keeps hesitating because she wants to believe I ll come back I know a lot of people will threaten this in order to get their way but what happens if she really does What do I do and how am I supposed to handle her death on my hands I still love my wife but I cannot deal with getting cheated on again and constantly feeling insecure I m worried today may be the day she does it and I hope so much it doesn t happen,ex wife threatening suiciderecently left wife good cheated twice lied much decided refuse go back day ago began threatening suicide tirelessly spent paat day talking keep hesitating want believe come back know lot people threaten order get way happens really supposed handle death hand still love wife cannot deal getting cheated constantly feeling insecure worried today may day hope much happen,ex wife threatening suiciderecently left wife good cheated twice lied much decided refuse go back day ago began threatening suicide tirelessly spent paat day talking keep hesitating want believe come back know lot people threaten order get way happens really supposed handle death hand still love wife cannot deal getting cheated constantly feeling insecure worried today may day hope much happen
1,Am I weird I don t get affected by compliments if it s coming from someone I know irl but I feel really good when internet strangers do it,weird get affected compliment coming someone know irl feel really good internet stranger,weird get affected compliment coming someone know irl feel really good internet stranger


In [None]:
# Compare the input text with 'lightclean_text_with_emojis' for the first two rows
pd.set_option("display.max_colwidth", 1000)
data[['text','lightclean_text_with_emojis']].head(2)

Unnamed: 0,text,lightclean_text_with_emojis
0,Ex Wife Threatening SuicideRecently I left my wife for good because she has cheated on me twice and lied to me so much that I have decided to refuse to go back to her As of a few days ago she began threatening suicide I have tirelessly spent these paat few days talking her out of it and she keeps hesitating because she wants to believe I ll come back I know a lot of people will threaten this in order to get their way but what happens if she really does What do I do and how am I supposed to handle her death on my hands I still love my wife but I cannot deal with getting cheated on again and constantly feeling insecure I m worried today may be the day she does it and I hope so much it doesn t happen,Ex Wife Threatening SuicideRecently I left my wife for good because she has cheated on me twice and lied to me so much that I have decided to refuse to go back to her As of a few days ago she began threatening suicide I have tirelessly spent these paat few days talking her out of it and she keeps hesitating because she wants to believe I ll come back I know a lot of people will threaten this in order to get their way but what happens if she really does What do I do and how am I supposed to handle her death on my hands I still love my wife but I cannot deal with getting cheated on again and constantly feeling insecure I m worried today may be the day she does it and I hope so much it doesn t happen
1,Am I weird I don t get affected by compliments if it s coming from someone I know irl but I feel really good when internet strangers do it,Am I weird I don t get affected by compliments if it s coming from someone I know irl but I feel really good when internet strangers do it


In [None]:
# Add a numeric 'label' column derived from 'class': 'suicide' -> 1, 'non-suicide' -> 0
data['label'] = data['class'].map({'suicide': 1, 'non-suicide': 0})

In [None]:
# Check whether the processing steps introduced missing data into the new columns
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232074 entries, 0 to 232073
Data columns (total 6 columns):
 #   Column                       Non-Null Count   Dtype 
---  ------                       --------------   ----- 
 0   text                         232074 non-null  object
 1   class                        232074 non-null  object
 2   text_clean_with_emojis       232074 non-null  object
 3   text_clean_without_emojis    232074 non-null  object
 4   lightclean_text_with_emojis  232074 non-null  object
 5   label                        232074 non-null  int64 
dtypes: int64(1), object(5)
memory usage: 10.6+ MB


In [None]:
# Double-check: count null values per column after processing
# NOTE(review): isnull() does not flag empty strings; presumably a row whose tokens
# were all removed holds '' in the cleaned columns. Confirm whether such rows need handling.
data.isnull().sum()

text                           0
class                          0
text_clean_with_emojis         0
text_clean_without_emojis      0
lightclean_text_with_emojis    0
label                          0
dtype: int64

In [None]:
# Export the preprocessed dataset to CSV, leaving out the DataFrame index
data.to_csv('Data_preprocessed.csv', index=False)