## <font color='Blue'>NLP EMOTIONS CLASSIFIER REACHING MORE THAN 97% ACCURACY ON VALIDATION DATASET
</font>

### For further information about the notebook and a clear explanation of each step, see the README.md file at https://github.com/Iron486/NLP_emotions_classifier.

#### Upvote if you find this notebook useful.

### <font color='violet'>Import libraries</font>

In [2]:
# Setup cell: imports, NLTK corpus downloads and shell installs for the whole notebook.
import nltk

# Corpora read by the preprocessing function (English stop-word list and WordNet).
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import layers
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
import statistics
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
# NOTE(review): the next three imports repeat lines from earlier in this cell.
from tensorflow.keras import layers
import tensorflow as tf
from tensorflow import keras


# Shell installs; they must run before the `contractions` / `transformers` imports below.
# NOTE(review): `contractions` and `transformers` are each installed twice, only some of
# the packages are version-pinned, and spacy / beautifulsoup4 / textblob are not imported
# anywhere in the visible notebook -- confirm they are still needed.
!pip install transformers # install a library not present in Colab
!pip install contractions
!pip install spacy==2.2.3
!python -m spacy download en_core_web_sm
!pip install beautifulsoup4==4.9.1
!pip install textblob==0.15.3
!pip install contractions
!pip install transformers

import contractions
import re
import transformers
from transformers import BertTokenizer, TFBertForSequenceClassification

### <font color='violet'>Preprocessing the datasets</font>

In [3]:
# Each split is a ';'-separated text file holding one "sentence;emotion" pair per line.
train, val, test = [
    pd.read_csv(split_path, names=['sentences', 'emotion'], sep=';')
    for split_path in (
        r'../input/emotions-dataset-for-nlp/train.txt',
        r'../input/emotions-dataset-for-nlp/val.txt',
        r'../input/emotions-dataset-for-nlp/test.txt',
    )
]
train.head()

In [4]:
def Lemmatizer_stop_word(sentence):
    """Normalise one sentence for the classifier.

    Steps, in order:
      1. expand contractions while the apostrophes are still present,
      2. replace every character that is not an ASCII letter with a space,
      3. lower-case and split on whitespace,
      4. drop English stop words, except the negation/contrast words below,
      5. lemmatize the remaining tokens with WordNet.

    Parameters
    ----------
    sentence : str
        Raw input sentence.

    Returns
    -------
    str
        The kept lemmas joined by single spaces ('' when nothing is left).
    """
    # Words that flip or qualify the sentiment of a sentence; they are kept even
    # when NLTK lists them as stop words. ('even though' can never equal a single
    # token; it is retained only so the list matches the original.)
    negative = {'not', 'neither', 'nor', 'but', 'however',
                'although', 'nonetheless', 'despite', 'except',
                'even though', 'yet', 'unless'}
    # A set gives constant-time membership tests in the token filter below.
    stop_words = set(stopwords.words('english')) - negative
    lemmatizer = WordNetLemmatizer()

    # Contractions must be expanded before punctuation is stripped: otherwise
    # "don't" turns into the two stop words "don" and "t" and the negation is lost.
    expanded = contractions.fix(sentence)
    # '[^A-z]' would also keep the characters between 'Z' and 'a' in ASCII
    # ([ \ ] ^ _ `), so the two letter ranges are spelled out explicitly.
    letters_only = re.sub('[^A-Za-z]', ' ', expanded)

    # Lower-case before the stop-word test so capitalised stop words are removed too.
    kept = [lemmatizer.lemmatize(word)
            for word in letters_only.lower().split()
            if word not in stop_words]
    return ' '.join(kept)

In [5]:
# WordNet's multilingual index; fetched here, just before the lemmatizer is first used.
nltk.download('omw-1.4')

# Clean the sentence column of every split with the same preprocessing function.
train['sentences'] = train['sentences'].apply(Lemmatizer_stop_word)
val['sentences'] = val['sentences'].apply(Lemmatizer_stop_word)
test['sentences'] = test['sentences'].apply(Lemmatizer_stop_word)
train.head()

### <font color='violet'>Tokenize the datasets</font>

In [7]:
max_length=43
from transformers import AutoTokenizer, TFBertModel
from tensorflow.keras.layers import Input, Dense

# NOTE(review): the preprocessing step lower-cases the tokens while this checkpoint is
# the cased variant -- confirm that 'bert-base-cased' is the intended model.
tokenizer=AutoTokenizer.from_pretrained('bert-base-cased')
bert=TFBertModel.from_pretrained('bert-base-cased')


def encode_sentences(sentences):
    """Tokenize preprocessed sentences into fixed-width BERT inputs.

    Parameters
    ----------
    sentences : iterable of str
        Whitespace-separated, already preprocessed sentences.

    Returns
    -------
    The tokenizer output holding 'input_ids' and 'attention_mask' as TensorFlow
    tensors; every row is truncated or padded to exactly `max_length` tokens.
    """
    return tokenizer(
        [sentence.split() for sentence in sentences],
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        # 'max_length' instead of True: padding=True only pads to the longest
        # sentence of the current call, so the tensor width would depend on the
        # data and could differ from the model's Input(shape=(max_length,)).
        padding='max_length',
        return_tensors='tf',
        return_token_type_ids=False,
        return_attention_mask=True,
        is_split_into_words=True,
        verbose=True)


x_train = encode_sentences(train['sentences'])
x_val = encode_sentences(val['sentences'])

In [8]:
# Fit the encoder on the training labels only, then reuse that mapping for every split.
lb = LabelEncoder()
train_emotions = train.loc[:, 'emotion'].to_list()
lb.fit(train_emotions)

labels_train = lb.transform(train_emotions)
labels_val = lb.transform(val.loc[:, 'emotion'].to_list())
labels_test = lb.transform(test.loc[:, 'emotion'].to_list())

### <font color='violet'>Train the model</font>

In [9]:
# Fix TensorFlow's random seed before any layer is created.
tf.random.set_seed(79)
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical


# Two integer inputs of width max_length; their names match the keys of the
# dictionaries passed to fit()/evaluate()/predict().
input_ids = Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")

# Index 0 of the BERT output is the last hidden state (index 1 is the pooler output).
embeddings = bert(input_ids, attention_mask=input_mask)[0]
# Collapse the token axis with a max over positions, then classify with a small dense head.
x = tf.keras.layers.GlobalMaxPool1D()(embeddings)
x = Dense(138, activation='elu', kernel_initializer='GlorotNormal')(x)
x = tf.keras.layers.Dropout(0.1)(x)
x = Dense(28, activation='elu', kernel_initializer='GlorotNormal')(x)

# One softmax unit per emotion class.
output = Dense(6, activation='softmax')(x)

model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=output)
# Fine-tune the encoder together with the head. The encoder is addressed through
# its own variable instead of a positional lookup in model.layers, which would
# silently point at a different layer if the architecture changed.
bert.trainable = True

# NOTE(review): newer Keras optimizer implementations reject the `decay`
# argument; if this raises on the installed TensorFlow, use a learning-rate
# schedule or the legacy optimizer instead -- confirm against the runtime.
opt = Adam(
    learning_rate=5e-05,  # works well with BERTs
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)

# The sparse loss is used because the labels are integer class ids, not one-hot vectors.
model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

# Stop once the monitored metric has not improved for 2 epochs and restore the best weights.
early_stopping_cb = EarlyStopping(patience=2, restore_best_weights=True)

history = model.fit(
    x={'input_ids': x_train['input_ids'], 'attention_mask': x_train['attention_mask']},
    y=labels_train,
    validation_data=(
        {'input_ids': x_val['input_ids'], 'attention_mask': x_val['attention_mask']},
        labels_val,
    ),
    epochs=3,
    batch_size=12,
    callbacks=[early_stopping_cb],
)


### <font color='violet'> History plot </font>

In [10]:
# Persist the fine-tuned weights so the model can be rebuilt without retraining.
model.save_weights('Bert_stepword_lemmatizer.h5')

# Training curves: one line per metric recorded by fit(), y-axis clipped to [0, 1].
history_ax = pd.DataFrame(history.history).plot(figsize=(8, 5))
history_ax.grid(True)
history_ax.set_ylim(0, 1)

# Score the trained model on the validation split.
loss, accuracy = model.evaluate(
    x={name: x_val[name] for name in ('input_ids', 'attention_mask')},
    y=labels_val,
)
print('Accuracy: %f' % (accuracy*100))

### <font color='violet'>Load the weights</font>

In [11]:
# Rebuild the same architecture as the trained model so the saved weights can be
# loaded into it. The layers are declared in the same order as in the training
# cell; keep the two definitions in sync.

# Same two named integer inputs of width max_length as the trained model.
input_ids = Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")


# Reuses the `bert` object created earlier rather than loading a fresh pretrained encoder.
embeddings = bert(input_ids,attention_mask = input_mask)[0] #(0 is the last hidden states,1 is the pooler_output)
x = tf.keras.layers.GlobalMaxPool1D()(embeddings)
x = Dense(138, activation='elu',kernel_initializer='GlorotNormal')(x)
x = tf.keras.layers.Dropout(0.1)(x)
x = Dense(28,activation = 'elu',kernel_initializer='GlorotNormal')(x)

# Six-way softmax output, one unit per emotion class.
output = Dense(6,activation = 'softmax')(x)
    
model_saved = tf.keras.Model(inputs=[input_ids, input_mask], outputs=output)
# NOTE(review): index 2 is presumably the BERT layer (after the two inputs) -- confirm.
model_saved.layers[2].trainable = True


# Load the weights written by the save_weights() call after training.
model_saved.load_weights('Bert_stepword_lemmatizer.h5')

### <font color='violet'>More details about the model</font>

In [12]:
model_saved.summary()

# List every layer of the trained model with its activation function, when it has one.
for i, layer in enumerate(model.layers):
    print(i, layer)
    if hasattr(layer, 'activation'):
        print("    ", layer.activation)
    else:
        print('   no activation attribute')

# Full configuration dictionary of each layer.
for i, layer in enumerate(model.layers):
    print(f'{i}   {layer}: \n{layer.get_config()} \n')

# Optimizer hyper-parameters (displayed as the cell output).
model.optimizer.get_config()

### <font color='violet'>Tokenize the test dataset</font>

In [13]:
# Tokenize the test split with the same settings as the training data.
x_test = tokenizer(
    [sentence.split() for sentence in test['sentences']],
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    # 'max_length' instead of True: padding=True only pads to the longest test
    # sentence, so the tensor width would depend on the data and could differ
    # from the max_length-wide inputs the model was built with.
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    is_split_into_words=True,
    verbose=True)

### <font color='violet'>Predict on test dataset</font>

In [14]:
# Score the trained model on the held-out test split.
loss, accuracy = model.evaluate(
    x={name: x_test[name] for name in ('input_ids', 'attention_mask')},
    y=labels_test,
)
print('Accuracy: %f' % (accuracy*100))

In [15]:
# Show the GPU devices visible to TensorFlow (an empty list means CPU only).
tf.config.list_physical_devices('GPU')

In [16]:
#class-label conversion
fin_labels=[i.replace("\n", "") for i in train.loc[:,'emotion'].to_list()]
dict(zip(labels_train,fin_labels))

### <font color='violet'>Predict a new sentence</font>

In [17]:
# Read the sentence to classify from the user.
y = input()

In [18]:
# Run the user's sentence through the same preprocessing as the datasets.
y_s = pd.Series([y])
y_lemm = y_s.apply(Lemmatizer_stop_word)

# Pad to the full max_length so the single sentence matches the model's input width.
y_tok = tokenizer(
    [words.split() for words in y_lemm],
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    is_split_into_words=True,
    verbose=True)

# Class probabilities expressed as percentages.
y_prob = 100 * model.predict(
    {name: y_tok[name] for name in ('input_ids', 'attention_mask')}
)
# Most likely class id, translated back to its emotion name.
class_label = np.argmax(y_prob, axis=-1)
lb.inverse_transform(class_label)

In [19]:
# The two tensors fed to BERT for the sentence above.
{name: y_tok[name] for name in ('input_ids', 'attention_mask')}