In [None]:
pip install unidecode

In [None]:
import pandas as pd
import numpy as np
import re
from unidecode import unidecode

import matplotlib.pyplot as plt
from wordcloud import WordCloud
import collections

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Colab-only step: open the browser file picker and keep the result in `uploaded`.
# Presumably 'Survey.csv', read in the next cell, is one of the uploaded files — confirm.
from google.colab import files
uploaded = files.upload()

In [None]:
# Load the survey data; later cells read its 'Text' and 'Label' columns.
df = pd.read_csv('Survey.csv')

In [None]:
# Make every entry a string. Missing values are replaced with '' first, because
# astype(str) on its own turns NaN into the literal word 'nan', which would then be
# treated as a real token by the text pipeline.
df['Text'] = df['Text'].fillna('').astype(str)

In [None]:
# Rows with a missing label are assigned 0.
# NOTE(review): this silently treats unlabeled rows as class 0 — confirm this is intended.
df['Label'] = df['Label'].fillna(0)

In [None]:
# Round before the integer cast: astype(int) alone truncates toward zero (0.9 -> 0),
# whereas the label conversion later in the notebook rounds to the nearest class.
df['Label'] = df['Label'].round().astype(int)

In [None]:
# Preview the first rows after the type fixes above.
df.head()

In [None]:
import re

#Pre-Processing

In [None]:
# Pattern matching every character that is not a lowercase ASCII letter or whitespace.
# Written as a raw string: '\s' inside a normal string literal is an invalid escape
# sequence and triggers a warning on current Python versions.
regex = re.compile(r'[^a-z\s]')
# Lowercase the text
df['clean_text'] = df['Text'].str.lower()

# Remove every character that is not a letter or whitespace
df['clean_text'] = df['clean_text'].apply(lambda x: regex.sub('', x))
# Drop words shorter than 3 characters
df['clean_text'] = df['clean_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 2]))

In [None]:
# Inspect the cleaned text column.
print(df['clean_text'])

In [None]:
import nltk
# Resources needed by word_tokenize and stopwords in the next cell.
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt';
# requesting both keeps the notebook working on old and new versions
# (a resource unknown to the installed version is only reported, not raised).
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

In [None]:
# English stopword set and stemmer shared by every row
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')


def _normalize(text):
    """Tokenize `text`, drop English stopwords, stem what is left and re-join with spaces."""
    kept = (token for token in word_tokenize(text) if token not in stop_words)
    return ' '.join(stemmer.stem(token) for token in kept)


# Tokenization -> stopword removal -> stemming -> back to a single string, in one pass
df['clean_text'] = df['clean_text'].apply(_normalize)

In [None]:
# Count how often each word occurs across all cleaned texts and keep the 10 most frequent
words = [word for text in df['clean_text'] for word in text.split()]
word_count = collections.Counter(words)
top_words = dict(word_count.most_common(10))

# Select the style BEFORE creating the figure: a figure created first keeps the
# previous (light) face colour while titles and tick labels switch to the dark theme's
# light text, which makes them hard to read.
plt.style.use('dark_background')
plt.figure(figsize = (10, 6))

# Create the bar plot (one bar per word, in descending frequency)
positions = range(len(top_words))
plt.bar(positions, list(top_words.values()), align = 'center')

# Label the x-axis ticks with the words
plt.xticks(positions, list(top_words.keys()))

# Grid opacity
plt.grid(alpha = 0.5)
# Title and labels
plt.title('Top 10 most used words', fontsize = 18)
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Maximum number of words to be considered in the vocabulary
max_words = 200
# Maximum number of tokens in a sequence
max_len = 200
# Tokenizer restricted to the max_words most frequent words
tokenizer = Tokenizer(num_words = max_words)
# Fit the tokenizer on the pre-processed text. Fitting on the raw 'Text' column would
# bypass the cleaning, stopword removal and stemming done above, so the model would
# never see the result of that pipeline.
tokenizer.fit_on_texts(df['clean_text'])
# Convert each text into a sequence of integer word indexes
sequences = tokenizer.texts_to_sequences(df['clean_text'])
# Mapping of words to indexes
word_index = tokenizer.word_index

In [None]:
# Sequence padding
# Pad or truncate every sequence to exactly max_len tokens so they form one 2-D array
data = pad_sequences(sequences, maxlen = max_len)

In [None]:
# Round labels to the nearest integer and cast to int.
# NOTE(review): 'Label' was already cast to int in an earlier cell, so this looks like a
# no-op at this point — confirm whether it can be dropped.
df['Label'] = df['Label'].round().astype(int)

In [None]:
# Converting labels to numeric format
# One-hot encode the integer labels. num_classes is pinned to 2 to match the model's
# 2-unit softmax output; without it the width is inferred from the largest label seen,
# so data containing only class 0 would produce a single column.
labels = tf.keras.utils.to_categorical(df['Label'], num_classes = 2)

Model

In [None]:
# Import the optimizer from tensorflow.keras like the layers and model used in this
# notebook: combining the standalone `keras` package with `tensorflow.keras` objects can
# fail when the two are different versions.
from tensorflow.keras.optimizers import Nadam
custom_learning_rate = 0.01
optimizer = Nadam(learning_rate=custom_learning_rate)

In [None]:
from keras import backend as K

In [None]:
def _to_class_ids(tensor):
    """Reduce labels or predictions to integer class ids.

    Tensors with a trailing class axis wider than 1 (one-hot labels, softmax outputs)
    are reduced with argmax; anything else (a single 0/1 value or probability per
    sample) is rounded, which matches the original element-wise behaviour.
    """
    tensor = tf.convert_to_tensor(tensor)
    rank = tensor.shape.rank
    if rank is not None and rank >= 2 and (tensor.shape[-1] or 0) > 1:
        return tf.argmax(tensor, axis=-1)
    return tf.cast(tf.round(tf.cast(tensor, tf.float32)), tf.int64)


def specificity(y_true, y_pred):
    """True-negative rate: share of class-0 samples that are predicted as class 0.

    Comparing one-hot labels and softmax outputs element by element counts one
    "negative" entry per sample regardless of its class, which makes the result equal
    to plain accuracy. Both tensors are therefore reduced to class ids first.
    """
    true_ids = _to_class_ids(y_true)
    pred_ids = _to_class_ids(y_pred)
    is_negative = tf.equal(true_ids, 0)
    true_negatives = tf.reduce_sum(tf.cast(tf.math.logical_and(is_negative, tf.equal(pred_ids, 0)), dtype=tf.float32))
    possible_negatives = tf.reduce_sum(tf.cast(is_negative, dtype=tf.float32))
    # epsilon avoids division by zero when the batch has no negative samples
    return true_negatives / (possible_negatives + tf.keras.backend.epsilon())

def sensitivity(y_true, y_pred):
    """True-positive rate: share of class-1 samples that are predicted as class 1.

    Works on class ids for the same reason as `specificity`.
    """
    true_ids = _to_class_ids(y_true)
    pred_ids = _to_class_ids(y_pred)
    is_positive = tf.equal(true_ids, 1)
    true_positives = tf.reduce_sum(tf.cast(tf.math.logical_and(is_positive, tf.equal(pred_ids, 1)), dtype=tf.float32))
    possible_positives = tf.reduce_sum(tf.cast(is_positive, dtype=tf.float32))
    # epsilon avoids division by zero when the batch has no positive samples
    return true_positives / (possible_positives + tf.keras.backend.epsilon())
def recall_m(y_true, y_pred):
    """Recall over all entries of the batch: matched positives / actual positives.

    Uses plain TensorFlow ops, like the sensitivity/specificity metrics in this file,
    instead of the legacy K.sum / K.round / K.clip helpers, which are not part of the
    Keras 3 backend API. The arithmetic is unchanged.
    """
    # An entry counts as a true positive when label * prediction rounds to 1
    true_positives = tf.reduce_sum(tf.round(tf.clip_by_value(y_true * y_pred, 0, 1)))
    possible_positives = tf.reduce_sum(tf.round(tf.clip_by_value(y_true, 0, 1)))
    # epsilon avoids division by zero when the batch has no positives
    recall = true_positives / (possible_positives + tf.keras.backend.epsilon())
    return recall
def precisionM(y_true, y_pred):
    """Precision over all entries of the batch: matched positives / predicted positives.

    Uses plain TensorFlow ops, like the sensitivity/specificity metrics in this file,
    instead of the legacy K.sum / K.round / K.clip helpers, which are not part of the
    Keras 3 backend API. The arithmetic is unchanged.
    """
    # An entry counts as a true positive when label * prediction rounds to 1
    true_positives = tf.reduce_sum(tf.round(tf.clip_by_value(y_true * y_pred, 0, 1)))
    predicted_positives = tf.reduce_sum(tf.round(tf.clip_by_value(y_pred, 0, 1)))
    # epsilon avoids division by zero when nothing is predicted positive
    precision = true_positives / (predicted_positives + tf.keras.backend.epsilon())
    return precision

def f1_S(y_true, y_pred):
    """F1 score: harmonic mean of precisionM and recall_m for the given batch."""
    p = precisionM(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    numerator = p * r
    # epsilon keeps the ratio defined when precision and recall are both zero
    denominator = p + r + K.epsilon()
    return 2 * (numerator / denominator)

In [None]:
# Model: embedding -> bidirectional LSTM -> dense hidden layer -> 2-way softmax
model = Sequential([
    Embedding(max_words, 128, input_length = max_len),
    Bidirectional(LSTM(64, dropout = 0.3, recurrent_dropout = 0.2)),
    Dense(32, activation = 'relu'),
    Dense(2, activation = 'softmax'),
])

# Compile with cross-entropy on one-hot labels and track the custom metrics per epoch
tracked_metrics = ['accuracy', sensitivity, specificity, f1_S]
model.compile(loss = 'categorical_crossentropy', optimizer = optimizer, metrics = tracked_metrics)

In [None]:
# Checking summary
# Print the model summary (layers and parameter counts)
model.summary()

In [None]:
# Fit model_Learning
# Train for 5 epochs in mini-batches of 4, holding out 30% of the samples for validation;
# the per-epoch metrics are kept in `history` for the plots below.
# NOTE(review): per the Keras docs, validation_split takes the held-out samples from the
# end of the arrays before shuffling — confirm the CSV rows are not ordered by label.
history = model.fit(data, labels, validation_split = 0.3, epochs = 5, batch_size = 4)

In [None]:
#ACCURACY GRAPH
# matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8' and later
# releases dropped the old name, so use whichever name this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
#Specificity
# matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8' and later
# releases dropped the old name, so use whichever name this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
plt.plot(history.history['specificity'], label='Training Specificity')
plt.plot(history.history['val_specificity'], label='Validation Specificity')
plt.title('Model Specificity')
plt.xlabel('Epoch')
plt.ylabel('Specificity')
plt.legend()
plt.show()

In [None]:
#Sensitivity
# matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8' and later
# releases dropped the old name, so use whichever name this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
plt.plot(history.history['sensitivity'], label='Training Sensitivity')
plt.plot(history.history['val_sensitivity'], label='Validation Sensitivity')
plt.title('Model Sensitivity')
plt.xlabel('Epoch')
plt.ylabel('Sensitivity')
plt.legend()
plt.show()

In [None]:
#F1_Score
# matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8' and later
# releases dropped the old name, so use whichever name this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
plt.plot(history.history['f1_S'], label='Training f1_S')
plt.plot(history.history['val_f1_S'], label='Validation f1_S')
plt.title('Model F1_Score')
plt.xlabel('Epoch')
plt.ylabel('f1_S')
plt.legend()
plt.show()

In [None]:
#Loss
# matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8' and later
# releases dropped the old name, so use whichever name this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])

plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
# Legend entries follow the order of the two plot calls above
plt.legend(['Training', 'Validation'], loc = 'upper right')
plt.show()