In [5]:
!pip install -q spacy
!pip install -q tensorflow
!pip install -q transformers

In [6]:
%env TF_CPP_MIN_LOG_LEVEL = 3

env: TF_CPP_MIN_LOG_LEVEL=3


In [7]:
import math
import nltk
import spacy
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import transformers
from tokenizers import BertWordPieceTokenizer
from tqdm import tqdm
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from tensorflow import keras
from keras.utils import to_categorical
from tensorflow.keras import Sequential
from keras.preprocessing.text import Tokenizer
from keras.metrics import Precision, Recall, AUC
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from keras.callbacks import EarlyStopping, LearningRateScheduler, CallbackList, ReduceLROnPlateau
from tensorflow.keras.optimizers.experimental import Adam

# **Loading Data**

In [8]:
import os
from google.colab import drive

# Make Google Drive visible inside the Colab file system
drive.mount('/content/drive')

# Drive folder that holds the train/test text files read below
path = '/content/drive/MyDrive/Colab Notebooks/Cap7'

Mounted at /content/drive


In [9]:
# Both files are header-less and ';'-separated
train_data = pd.read_csv(
    os.path.join(path, 'dados_treino.txt'),
    sep = ';',
    header = None,
)

test_data = pd.read_csv(
    os.path.join(path, 'dados_teste.txt'),
    sep = ';',
    header = None,
)

In [10]:
# Give the positional columns meaningful names (same mapping for both splits)
column_names = {0: 'text', 1: 'sentiment'}

train_data = train_data.rename(columns = column_names)
test_data = test_data.rename(columns = column_names)

In [11]:
# (rows, columns) of the training frame
train_data.shape

(16000, 2)

In [12]:
# (rows, columns) of the test frame
test_data.shape

(2000, 2)

In [13]:
# Preview the first rows of the training frame
train_data.head()

Unnamed: 0,text,sentiment
0,i am feeling completely overwhelmed i have two...,fear
1,i have the feeling she was amused and delighted,joy
2,i was able to help chai lifeline with your sup...,joy
3,i already feel like i fucked up though because...,anger
4,i still love my so and wish the best for him i...,sadness


In [14]:
# Number of training examples per sentiment label
train_data['sentiment'].value_counts()

sentiment
joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: count, dtype: int64

In [15]:
# Number of test examples per sentiment label
test_data['sentiment'].value_counts()

sentiment
joy         695
sadness     581
anger       275
fear        224
love        159
surprise     66
Name: count, dtype: int64

# **Preprocessing with spaCy**

In [16]:
# Download the spaCy English pipeline package `en_core_web_md`
!python -m spacy download en_core_web_md -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [17]:
# Load the downloaded `en_core_web_md` pipeline; `preprocessing_text` runs it on each text
nlp_dict = spacy.load('en_core_web_md')

In [20]:
def preprocessing_text(text):
  """Lemmatize `text` with the spaCy pipeline and remove stop words.

  Args:
    text: Raw sentence to normalize.

  Returns:
    The lower-cased lemmas of the non-stop-word tokens, joined by single
    spaces. Tokens whose lemma is empty after stripping (e.g. whitespace
    tokens) are skipped, so the result has no doubled, leading or trailing
    spaces.
  """
  # Run the text through the loaded spaCy pipeline
  doc = nlp_dict(text)

  # Lower-cased, trimmed lemma of every token that is not a stop word
  lemmas = (token.lemma_.lower().strip() for token in doc if not token.is_stop)

  # Drop lemmas that became empty so join() cannot emit consecutive spaces
  return ' '.join(lemma for lemma in lemmas if lemma)

In [21]:
# Add the cleaned, lemmatized text as a new column on both splits
for frame in (train_data, test_data):
    frame['Processed_text'] = frame['text'].map(preprocessing_text)

In [22]:
# Preview the training frame, now including the `Processed_text` column
train_data.head()

Unnamed: 0,text,sentiment,Processed_text
0,i am feeling completely overwhelmed i have two...,fear,feel completely overwhelmed strategy help feel...
1,i have the feeling she was amused and delighted,joy,feeling amuse delight
2,i was able to help chai lifeline with your sup...,joy,able help chai lifeline support encouragement ...
3,i already feel like i fucked up though because...,anger,feel like fuck not usually eat morning
4,i still love my so and wish the best for him i...,sadness,love wish good long tolerate effect bm life fa...


In [None]:
# Fit the Keras tokenizer on the processed training texts only
lstm_tokenizer = Tokenizer()

lstm_tokenizer.fit_on_texts(train_data['Processed_text'])

In [None]:
# Word -> integer index mapping built by the fitted tokenizer
word_index = lstm_tokenizer.word_index

In [None]:
# Show the first ten (word, index) entries of the vocabulary
for word, index in list(word_index.items())[:10]:
    print(word, index)

In [None]:
# Convert each processed training text into a list of vocabulary indices
# (the Keras Tokenizer method is `texts_to_sequences`, plural)
train_seq = lstm_tokenizer.texts_to_sequences(train_data['Processed_text'])

In [None]:
# Length every token sequence is padded or truncated to
max_length = 100

In [None]:
# Bring every training sequence to `max_length` tokens: shorter ones are padded
# at the front (the Keras default, spelled out here), longer ones lose their tail.
train_seq_pad = pad_sequences(
    train_seq,
    maxlen = max_length,
    padding = 'pre',
    truncating = 'post',
)

In [None]:
# Encode the test texts with the vocabulary fitted on the training data
# (the Keras Tokenizer method is `texts_to_sequences`, plural)
test_seq = lstm_tokenizer.texts_to_sequences(test_data['Processed_text'])

# Same length and truncation settings as the training sequences
test_seq_pad = pad_sequences(test_seq, maxlen = max_length, truncating = 'post')

In [None]:
# Map sentiment names to integer ids; the encoder is fitted on the training labels only
label_encoder = LabelEncoder()

y_train_le = label_encoder.fit_transform(train_data['sentiment'])

y_test_le = label_encoder.transform(test_data['sentiment'])

# One-hot encode the ids. The class count is passed explicitly so both matrices
# always have one column per known label; without it, to_categorical infers the
# width from the largest id present, which could differ between the two splits.
num_classes = len(label_encoder.classes_)

y_train_encoded = to_categorical(y_train_le, num_classes = num_classes)
y_test_encoded = to_categorical(y_test_le, num_classes = num_classes)

In [None]:
# Vocabulary size: one entry per word known to the tokenizer, plus one extra slot
# (presumably for index 0, used as the padding value — confirm against the Embedding layer)
vocab_size = len(lstm_tokenizer.word_index) + 1

In [None]:
# NOTE(review): the embedding width is tied to the sequence length here; these look like
# independent hyper-parameters, so confirm the coupling is intended.
embedding_dim = max_length