In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'mental-health-corpus:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F2782228%2F4805127%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240403%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240403T120552Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Df3946dd3cff7845f439aee3e4ab5265e645d59539b39c3fa5613644dc7d21f149635bc320a163c3d09211a0de2c56534ca91e63de1c43dc8a19f133354bce319e96305a4925365aee1bf9430c80c745b8119840ca4bfdab7cde37f23426a54af8628efd9ee8b4c8a0fd73ba26f29773610f151c4a6848968ddb9c82e3923f3375dd9dcfb9ad9e3c9985b5cbc9175130708bd4b5d2f80aad99d721dd7d72517e43772833332208e4e91e765015e06aa661ce3c1a577a63c2bfa10b1b0f97eb3ca2379f630fb1e8079876ccb05da594cf07c4fc2739d6ca34d5cad41da9e5e5633b83d341766bfb8c66413d65f3c248935adeedcdf8d261b6f453c659e387344d5'

# Filesystem locations used by the rest of this cell.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): KAGGLE_SYMLINK is not referenced anywhere in the visible notebook.
KAGGLE_SYMLINK='kaggle'

# Unmount whatever is mounted at the input path (errors are discarded), remove
# the directory tree, then recreate empty input and working directories.
!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Expose both directories as ../input and ../working; a link that already
# exists is left untouched.
try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING
# and unpack it below KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # The URL part is percent-encoded, so the first ':' is the separator.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header may be missing; in that case no progress bar can be
            # computed and the bar simply stays empty.
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # The tar branch reopens the file by name, so buffered bytes must
            # reach the disk first.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here would
                # replace the module and break the next iteration.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is handled before the more general OSError below.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
pip install unidecode

In [None]:
import pandas as pd
import numpy as np
import re
from unidecode import unidecode

import matplotlib.pyplot as plt
from wordcloud import WordCloud
import collections

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import backend as K
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam

from google.colab import files
import copy

import nltk
# Resources needed by word_tokenize and stopwords.words('english').
nltk.download('punkt')
# Recent NLTK releases load the tokenizer data from 'punkt_tab' instead of
# 'punkt'; fetching both keeps word_tokenize working on old and new versions.
nltk.download('punkt_tab')
nltk.download('stopwords')

In [None]:
def cleanData(df, textLabel = 'text'):
    """Add a normalised ``clean_text`` column to ``df`` and return ``df``.

    Each text is lower-cased, stripped of every character that is not a-z or
    whitespace, reduced to words of at least three characters, tokenised,
    filtered against the English stop-word list, Snowball-stemmed and joined
    back into a single space-separated string.

    Args:
        df: DataFrame holding the raw text. It is modified in place.
        textLabel: Name of the column that contains the raw text.

    Returns:
        The same DataFrame, now carrying the ``clean_text`` column.
    """
    # Raw string so that ``\s`` is passed to the regex engine untouched
    # instead of being an invalid string escape.
    non_letter = re.compile(r'[^a-z\s]')
    stop_words = set(stopwords.words('english'))
    stemmer = SnowballStemmer('english')

    def _normalise(text):
        # Keep only letters and whitespace, then drop words shorter than 3 characters.
        text = non_letter.sub('', text)
        text = ' '.join(w for w in text.split() if len(w) > 2)
        # Tokenise, remove stop words, stem, and join back into one string.
        tokens = [tok for tok in word_tokenize(text) if tok not in stop_words]
        return ' '.join(stemmer.stem(tok) for tok in tokens)

    # Missing values become empty strings so that one absent text does not
    # abort the whole cleaning pass.
    lowered = df[textLabel].fillna('').astype(str).str.lower()
    df['clean_text'] = lowered.apply(_normalise)

    return df

In [None]:
def PLOTFun(top_words):
    """Draw a bar chart of word frequencies on a new 10x6 figure.

    Args:
        top_words: Mapping of word -> count. Bars are drawn in the mapping's
            iteration order and labelled with the words.
    """
    # Apply the style before creating the figure: figure-level settings such
    # as the background colour are taken from rcParams at creation time, so
    # styling afterwards leaves the figure and its contents inconsistent.
    plt.style.use('dark_background')
    plt.figure(figsize = (10, 6))

    # One bar per word
    positions = range(len(top_words))
    plt.bar(positions, list(top_words.values()), align = 'center')

    # Label the x axis ticks with the words
    plt.xticks(positions, list(top_words.keys()))

    # Semi-transparent grid
    plt.grid(alpha = 0.5)
    # Title and labels
    plt.title('Top 10 most used words', fontsize = 18)
    plt.xlabel('Words')
    plt.ylabel('Frequency')

In [None]:
def DataLabelFormation(df, max_words = 10000, max_len = 200, textLabel = 'text', labelLABEL = 'label'):
    """Encode a text column as padded integer sequences and a label column as one-hot rows.

    A new Tokenizer is fitted on ``df[textLabel]`` on every call, so the
    word-to-index mapping is specific to the DataFrame passed in.

    Args:
        df: DataFrame containing the text and label columns.
        max_words: Vocabulary size handed to the Tokenizer (``num_words``).
        max_len: Length every sequence is padded/truncated to.
        textLabel: Name of the text column.
        labelLABEL: Name of the label column.

    Returns:
        Tuple ``(data, labels)``: the padded sequences and the result of
        ``to_categorical`` applied to the label column.
    """
    # Fit the vocabulary on the texts and map each text to a list of word indices
    tokenizer = Tokenizer(num_words = max_words)
    tokenizer.fit_on_texts(df[textLabel])
    sequences = tokenizer.texts_to_sequences(df[textLabel])

    # Bring all sequences to the same length
    data = pad_sequences(sequences, maxlen = max_len)

    # Convert the labels to one-hot (categorical) format
    labels = tf.keras.utils.to_categorical(df[labelLABEL])

    return data, labels


In [None]:
from keras.optimizers import SGD
# SGD optimizer with an explicit learning rate. This instance is the default
# value of buildModel's `optimizer` parameter and is also passed explicitly
# when the model is built.
custom_learning_rate = 0.001
optimizer = SGD(learning_rate=custom_learning_rate)

In [None]:
from keras import backend as K

In [None]:
from keras.saving import register_keras_serializable

In [None]:
@register_keras_serializable()
def specificity(y_true, y_pred):
    """Share of zero-valued target entries whose rounded prediction is also zero.

    The counts run over every element of the tensors; epsilon in the
    denominator guards against division by zero.
    """
    is_negative = tf.equal(y_true, 0)
    predicted_negative = tf.equal(tf.round(y_pred), 0)
    correct_negative = tf.math.logical_and(is_negative, predicted_negative)
    true_negatives = tf.reduce_sum(tf.cast(correct_negative, dtype=tf.float32))
    possible_negatives = tf.reduce_sum(tf.cast(is_negative, dtype=tf.float32))
    denominator = possible_negatives + tf.keras.backend.epsilon()
    return true_negatives / denominator
@register_keras_serializable()
def sensitivity(y_true, y_pred):
    """Share of one-valued target entries whose rounded prediction is also one.

    The counts run over every element of the tensors; epsilon in the
    denominator guards against division by zero.
    """
    is_positive = tf.equal(y_true, 1)
    predicted_positive = tf.equal(tf.round(y_pred), 1)
    correct_positive = tf.math.logical_and(is_positive, predicted_positive)
    true_positives = tf.reduce_sum(tf.cast(correct_positive, dtype=tf.float32))
    possible_positives = tf.reduce_sum(tf.cast(is_positive, dtype=tf.float32))
    denominator = possible_positives + tf.keras.backend.epsilon()
    return true_positives / denominator
@register_keras_serializable()
def recall_m(y_true, y_pred):
    """Recall over all tensor elements: true positives / actual positives.

    Values are clipped to [0, 1] and rounded before counting; epsilon in the
    denominator guards against division by zero.

    tf.* ops are used, as in specificity/sensitivity, instead of the legacy
    K.sum/K.round/K.clip helpers, which newer Keras releases no longer expose
    on `keras.backend`.
    """
    true_positives = tf.reduce_sum(tf.round(tf.clip_by_value(y_true * y_pred, 0, 1)))
    possible_positives = tf.reduce_sum(tf.round(tf.clip_by_value(y_true, 0, 1)))
    recall = true_positives / (possible_positives + tf.keras.backend.epsilon())
    return recall
@register_keras_serializable()
def precisionM(y_true, y_pred):
    """Precision over all tensor elements: true positives / predicted positives.

    Values are clipped to [0, 1] and rounded before counting; epsilon in the
    denominator guards against division by zero.

    tf.* ops are used, as in specificity/sensitivity, instead of the legacy
    K.sum/K.round/K.clip helpers, which newer Keras releases no longer expose
    on `keras.backend`.
    """
    true_positives = tf.reduce_sum(tf.round(tf.clip_by_value(y_true * y_pred, 0, 1)))
    predicted_positives = tf.reduce_sum(tf.round(tf.clip_by_value(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + tf.keras.backend.epsilon())
    return precision
@register_keras_serializable()
def f1_S(y_true, y_pred):
    """F1 score: harmonic mean of precisionM and recall_m, stabilised with epsilon."""
    p = precisionM(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    numerator = p * r
    denominator = p + r + K.epsilon()
    return 2*(numerator/denominator)

In [None]:
def buildModel(max_words, max_len, embedLayerNum = 128, LSTMNum = 64, reluNum = 32, softmaxNum = 2, dropout = 0.3, recurrent_dropout=0.2, loss = 'categorical_crossentropy', optimizer = optimizer, metrics = ['accuracy',sensitivity, specificity,f1_S]):
    """Build and compile the Embedding -> Bi-LSTM -> Dense(relu) -> Dense(softmax) classifier.

    Args:
        max_words: Input dimension of the embedding layer.
        max_len: Sequence length passed to the embedding layer.
        embedLayerNum: Embedding output dimension.
        LSTMNum: Units of the LSTM wrapped by the Bidirectional layer.
        reluNum: Units of the ReLU dense layer.
        softmaxNum: Units of the softmax output layer.
        dropout, recurrent_dropout: Dropout rates handed to the LSTM.
        loss, optimizer, metrics: Forwarded unchanged to ``model.compile``.

    Returns:
        The compiled Sequential model.
    """
    # Layers in the order they are stacked
    layer_stack = [
        Embedding(max_words, embedLayerNum, input_length = max_len),
        Bidirectional(LSTM(LSTMNum, dropout = dropout, recurrent_dropout = recurrent_dropout)),
        Dense(reluNum, activation = 'relu'),
        Dense(softmaxNum, activation = 'softmax'),
    ]

    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    # Compile the model
    model.compile(loss = loss, optimizer = optimizer, metrics = metrics)

    return model

In [None]:
def printWeight(model):
    """Print the shape and values of the first weight array of every layer.

    Args:
        model: Object exposing ``layers``; each layer must provide ``name``
            and ``get_weights()``.

    Layers without any weights are reported and skipped instead of raising
    an IndexError.
    """
    for layer in model.layers:
        layer_weights = layer.get_weights()
        if not layer_weights:
            print(f"Layer {layer.name} has no weights")
            continue
        # Only the first array is shown; any further arrays (e.g. biases)
        # are not printed.
        weights = layer_weights[0]
        print(f"Layer {layer.name} weights shape: {weights.shape}")
        print(weights)

In [None]:
# Read the corpus CSV from the Kaggle input directory and show how many
# rows carry each value of the 'label' column.
df = pd.read_csv('/kaggle/input/mental-health-corpus/mental_health.csv')

df['label'].value_counts()

In [None]:
# Clean the corpus, then count how often each cleaned word occurs

df = cleanData(df)
# Flatten all cleaned texts into one list of words
words = [token for text in df['clean_text'] for token in text.split()]
word_count = collections.Counter(words)
# Keep the ten most frequent words as {word: count}
top_words = dict(word_count.most_common(10))

In [None]:
# Plot to visualize the frequencies of the most common words
PLOTFun(top_words)

In [None]:
# Encode the corpus: texts become padded integer sequences, labels become one-hot rows.
# NOTE(review): no textLabel is passed, so DataLabelFormation tokenizes its
# default 'text' column rather than 'clean_text' — confirm this is intended.
max_words = 10000
max_len = 200
data, labels = DataLabelFormation(df, max_words, max_len)

In [None]:
# Build and compile the classifier; the keyword values repeat buildModel's defaults.
model = buildModel(max_words, max_len, embedLayerNum = 128, LSTMNum = 64, reluNum = 32, softmaxNum = 2, dropout = 0.3, recurrent_dropout=0.2, loss = 'categorical_crossentropy', optimizer = optimizer, metrics = ['accuracy',sensitivity, specificity,f1_S])

In [None]:
# Train on the corpus for 5 epochs with 20% of the samples held out for validation.
history = model.fit(data, labels, validation_split = 0.2, epochs = 5, batch_size = 32)

In [None]:
import copy

In [None]:
# Keep an independent copy of the current model state.
backupmodel = copy.deepcopy(model)

In [None]:
# Replace `model` with a fresh copy of the backup
# (presumably so the cells below can be re-run from the same starting point — confirm).
model = copy.deepcopy(backupmodel)

In [None]:
# Colab file-upload helper.
# NOTE(review): presumably used to provide 'Survey.csv', which is read by a later cell — confirm.
uploaded = files.upload()

In [None]:
dfOurs.head()

In [None]:
dfOurs = pd.read_csv('Survey.csv')

LABEL = 'Label'
TEXT = 'Text'

In [None]:
# Rows without a label are assigned 0.
dfOurs[LABEL] = dfOurs[LABEL].fillna(0)

In [None]:
# Clean the survey texts and chart their ten most frequent words
dfOurs = cleanData(dfOurs, textLabel=TEXT)

# Flatten all cleaned texts into one list of words
wordsOurs = [token for text in dfOurs['clean_text'] for token in text.split()]
word_count_ours = collections.Counter(wordsOurs)
top_words_ours = dict(word_count_ours.most_common(10))

PLOTFun(top_words_ours)

In [None]:
# Maximum number of words to be considered in the vocabulary
max_words_Ours = 200
# Maximum number of tokens in a sequence
max_len_ours = 200
# Encode the survey texts and labels.
# NOTE(review): DataLabelFormation fits a new Tokenizer on dfOurs, so the word
# indices produced here come from a different vocabulary than the one fitted
# on the first dataset — confirm this is intended before reusing the model.
dataOurs, labelsOurs = DataLabelFormation(dfOurs, max_words_Ours, max_len_ours, textLabel = TEXT, labelLABEL = LABEL)

In [None]:
# Inspect the one-hot encoded survey labels.
print(labelsOurs)

In [None]:
# Continue training the same model on the survey data (30% held out for validation).
# This reassigns `history`, so the plots below show this run only.
history = model.fit(dataOurs, labelsOurs, validation_split = 0.3, epochs = 5, batch_size = 8)

In [None]:
# Print the survey labels again (same array as before the fit).
print(labelsOurs)

In [None]:
#ACCURACY GRAPH
# The 'seaborn' style was renamed to 'seaborn-v0_8' in Matplotlib 3.6 and the
# old name no longer exists in later releases; use whichever is available.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
# Per-epoch training and validation accuracy of the most recent fit
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
#Specificity
# The 'seaborn' style was renamed to 'seaborn-v0_8' in Matplotlib 3.6 and the
# old name no longer exists in later releases; use whichever is available.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
# Per-epoch training and validation specificity of the most recent fit
plt.plot(history.history['specificity'], label='Training Specificity')
plt.plot(history.history['val_specificity'], label='Validation Specificity')
plt.title('Model Specificity')
plt.xlabel('Epoch')
plt.ylabel('Specificity')
plt.legend()
plt.show()

In [None]:
#Sensitivity
# The 'seaborn' style was renamed to 'seaborn-v0_8' in Matplotlib 3.6 and the
# old name no longer exists in later releases; use whichever is available.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
# Per-epoch training and validation sensitivity of the most recent fit
plt.plot(history.history['sensitivity'], label='Training Sensitivity')
plt.plot(history.history['val_sensitivity'], label='Validation Sensitivity')
plt.title('Model Sensitivity')
plt.xlabel('Epoch')
plt.ylabel('Sensitivity')
plt.legend()
plt.show()

In [None]:
#F1_Score
# The 'seaborn' style was renamed to 'seaborn-v0_8' in Matplotlib 3.6 and the
# old name no longer exists in later releases; use whichever is available.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
# Per-epoch training and validation F1 score of the most recent fit
plt.plot(history.history['f1_S'], label='Training f1_S')
plt.plot(history.history['val_f1_S'], label='Validation f1_S')
plt.title('Model F1_Score')
plt.xlabel('Epoch')
plt.ylabel('f1_S')
plt.legend()
plt.show()

In [None]:
#Loss
# The 'seaborn' style was renamed to 'seaborn-v0_8' in Matplotlib 3.6 and the
# old name no longer exists in later releases; use whichever is available.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
# Per-epoch training and validation loss of the most recent fit
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])

plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Training', 'Validation'], loc = 'upper right')
plt.show()