# **Importando as bibliotecas**

In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from sklearn.preprocessing import LabelEncoder

# **Leitura da base de Dados**

In [None]:
# Load the corpus CSV: fields are separated by ';' and the file is decoded
# as latin-1.
dataset = pd.read_csv(
    'Corpus_processado2.csv',
    sep=';',
    encoding='latin-1',
)
# Print the column / dtype / non-null summary as a sanity check of the load.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 399 entries, 0 to 398
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   abstract  399 non-null    object
 1   classe    399 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 6.4+ KB


In [None]:
# A single sample abstract, kept as a one-element list because the
# vectorizer calls that consume it expect an iterable of documents.
# The adjacent string literals are concatenated by Python into one string.
example = [
    'antitumorpromot principl angelica keiskei potent antitumor promot activ '
    'nonpolar extract root ashitaba angelica keiskei koidz umbellifera eaten '
    'veget japan activ fraction angular furanocoumarin archangelicin '
    'sr angeloyloxydihydrooroselol linear furanocoumarin '
    'psoralen bergapten xanthotoxin chalcon hydroxyderricin xanthoangelol novel '
    'chalcon name ashitabachalcon isol compound angular type furanocoumarin '
    'chalcon suppress otetradecanoylphorbolacet tpa piincorpor phospholipid '
    'cultur cell coumarin less effect addit chalcon prove '
    'antitumorpromot activ mous skin carcinogenesi induc '
    'dimethylbenz anthracen dmba plus tpa chalcon calmodulininteract '
    'properti chalcon reveal antitumorpromot activ via modul '
    'calmodulin involv system chalcon effect prevent'
]

# **Count Vectorizer**


O CountVectorizer do scikit-learn é utilizado para converter uma coleção de documentos de texto em vetores de contagem de termos/tokens. Ele também permite o pré-processamento dos dados de texto antes de gerar a representação vetorial. Essa funcionalidade faz dele um módulo de representação de características altamente flexível para texto.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary of the sample document. `fit` returns the fitted
# estimator itself, so construction and fitting are chained.
examplevectorizer = CountVectorizer().fit(example)

# Show the learned mapping from each term to its column index.
examplevectorizer.vocabulary_

{'activ': 0,
 'addit': 1,
 'angelica': 2,
 'angeloyloxydihydrooroselol': 3,
 'angular': 4,
 'anthracen': 5,
 'antitumor': 6,
 'antitumorpromot': 7,
 'archangelicin': 8,
 'ashitaba': 9,
 'ashitabachalcon': 10,
 'bergapten': 11,
 'calmodulin': 12,
 'calmodulininteract': 13,
 'carcinogenesi': 14,
 'cell': 15,
 'chalcon': 16,
 'compound': 17,
 'coumarin': 18,
 'cultur': 19,
 'dimethylbenz': 20,
 'dmba': 21,
 'eaten': 22,
 'effect': 23,
 'extract': 24,
 'fraction': 25,
 'furanocoumarin': 26,
 'hydroxyderricin': 27,
 'induc': 28,
 'involv': 29,
 'isol': 30,
 'japan': 31,
 'keiskei': 32,
 'koidz': 33,
 'less': 34,
 'linear': 35,
 'modul': 36,
 'mous': 37,
 'name': 38,
 'nonpolar': 39,
 'novel': 40,
 'otetradecanoylphorbolacet': 41,
 'phospholipid': 42,
 'piincorpor': 43,
 'plus': 44,
 'potent': 45,
 'prevent': 46,
 'principl': 47,
 'promot': 48,
 'properti': 49,
 'prove': 50,
 'psoralen': 51,
 'reveal': 52,
 'root': 53,
 'skin': 54,
 'sr': 55,
 'suppress': 56,
 'system': 57,
 'tpa': 58,
 'typ

# **Transformação em Array**

In [None]:
# Encode the sample document as term counts using the fitted vocabulary,
# then densify the result so the individual counts are displayed.
example_counts = examplevectorizer.transform(example)
example_counts.toarray()

array([[4, 1, 2, 1, 2, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 7, 1, 1, 1, 1, 1,
        1, 2, 1, 1, 3, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1]])

# **Divisão de Treino e Teste**

In [None]:
from sklearn.model_selection import train_test_split

# Inputs are the abstract texts; targets are the values of the 'classe' column.
review = dataset['abstract'].values
label = dataset['classe'].values

# Hold out 25% of the rows for testing. The fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
(review_train, review_test,
 label_train, label_test) = train_test_split(
    review,
    label,
    test_size=0.25,
    random_state=1000,
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# The vocabulary is learned from the training texts only; the test texts
# are merely encoded with it.
review_vectorizer = CountVectorizer().fit(review_train)

# Encode both splits as term-count matrices with the same fitted vocabulary.
Xlr_train, Xlr_test = (
    review_vectorizer.transform(texts)
    for texts in (review_train, review_test)
)

# Display the training matrix summary.
Xlr_train

<299x5803 sparse matrix of type '<class 'numpy.int64'>'
	with 20604 stored elements in Compressed Sparse Row format>

# **Regressão Logística**

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier: logistic regression with default hyper-parameters,
# trained on the term-count features. `fit` returns the fitted estimator.
LRmodel = LogisticRegression().fit(Xlr_train, label_train)

# `score` is evaluated on the held-out split and printed as the accuracy.
score = LRmodel.score(Xlr_test, label_test)
print("Accuracy:", score)

Accuracy: 0.97


In [None]:
# `Tokenizer` is the tensorflow.keras class imported in the notebook's first
# cell. It is deliberately NOT re-imported from the standalone `keras`
# package here: that would rebind the name to a second package and mix two
# Keras namespaces in one notebook.

# Restrict the encoded vocabulary to the most frequent words (num_words cap).
tokenizer = Tokenizer(num_words=5000)
# The word index is learned from the training texts only.
tokenizer.fit_on_texts(review_train)

# Replace every text by the list of integer ids of its (kept) words.
Xcnn_train = tokenizer.texts_to_sequences(review_train)
Xcnn_test = tokenizer.texts_to_sequences(review_test)

# +1 because id 0 is reserved by the tokenizer and never assigned to a word.
# NOTE(review): `word_index` holds every word seen during fitting, not just
# the `num_words` most frequent ones, so this value can be larger than the
# highest id that actually occurs in the sequences. That is harmless, but it
# sizes the embedding table slightly larger than needed — confirm whether
# this is intended.
vocab_size = len(tokenizer.word_index) + 1

# Show one training text next to its integer encoding.
print(review_train[1])
print(Xcnn_train[1])

periop anesthesia care narrat review discuss uptod focus avail best clinic practic periop anesthesia care bundl effect main object apprais literatur local anesthet region outcom opioid nonsteroid antiinflammatori nsaid abil decreas recurr patient undergo surgeri brief discuss addit topic periop relev anesthesiologist volatil intraven anesthet periop anxieti nutrit result publish systemat review look associ recurr region anesthesia yield inconclus insuffici evid definit benefit region anesthesia basic scienc anti effect induc local anesthet new refin anim model opioid safe periop pain manag preliminari evid nsaid essenti multimod analgesia volatil anesthet format preclin clinic propofol indic protect qualiti periop period patient uniqu environ where surgic mediat lead suppress region anesthesia techniqu indic multimod analgesia nsaid opioid local anesthet prevent pathophysiolog effect pain neuroendocrin view essenti balanc anesthesia
[702, 128, 504, 2332, 92, 284, 2333, 503, 206, 636, 8

In [None]:
# Uses `sequence` (tensorflow.keras.preprocessing.sequence), imported in the
# notebook's first cell, instead of importing `pad_sequences` from the
# standalone `keras` package, so a single Keras namespace is used throughout.

# Fixed length of every encoded text fed to the network.
maxlen = 100

# Shorter sequences are zero-padded at the end (padding='post'). Longer ones
# are cut to `maxlen`; `truncating` is left at the library default, which
# drops tokens from the FRONT of the sequence ('pre').
Xcnn_train = sequence.pad_sequences(Xcnn_train, padding='post', maxlen=maxlen)
Xcnn_test = sequence.pad_sequences(Xcnn_test, padding='post', maxlen=maxlen)

# Show the first padded/truncated training sequence.
print(Xcnn_train[0, :])

[ 189  564   92  565  907 1471   53    2    7  299 1248  908  375   53
    7 1471 3334   65  146   43  283   11  223   13  435  221  102 2330
  223 2329 1472   93 1818 1473  564   92  565    3 1472  102   20 2331
    1  138   41 3335  701 1249 3336   53   83  160    1  189    1   12
    3 1050 1819 1051  181  153    3 1472  189    1 1819  325  351  503
  376  102 1819  325  351    3  283   11 1472  102    3 2331    1 1049
  222   53    7  166  907  131  268 1820   19   84  635  462  352 3337
 1474  224]


In [None]:
# Import the model-building API from `tensorflow.keras`, the same namespace
# the notebook's first cell uses, rather than from the standalone `keras`
# package, so `Sequential` is not rebound to a class from a second package.
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

# **Criando o Modelo da CNN**

In [None]:
# Size of the learned vector for each token id.
embedding_dim = 200

# 1-D convolutional text classifier, declared as an ordered list of layers:
#   token ids -> embeddings -> Conv1D (128 filters, window of 5 tokens)
#   -> max over the sequence axis -> Dense(10) -> one sigmoid output unit.
textcnnmodel = Sequential([
    layers.Embedding(vocab_size, embedding_dim, input_length=maxlen),
    layers.Conv1D(128, 5, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

# A single sigmoid output is paired with binary cross-entropy; accuracy is
# tracked as the reported metric.
textcnnmodel.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Print the layer-by-layer shape and parameter summary.
textcnnmodel.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 200)          1161400   
                                                                 
 conv1d (Conv1D)             (None, 96, 128)           128128    
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 10)                1290      
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
Total params: 1,290,829
Trainable params: 1,290,829
Non-trainable params: 0
______________________________________________

# **Treinamento e Acurácia**

In [None]:
# Train silently for 10 epochs in mini-batches of 10 samples. The held-out
# split is passed as validation data, so it is evaluated after every epoch.
textcnnmodel.fit(
    Xcnn_train,
    label_train,
    batch_size=10,
    epochs=10,
    validation_data=(Xcnn_test, label_test),
    verbose=False,
)

# Report accuracy on the training split first, then on the held-out split.
# After the loop, `loss` and `accuracy` hold the held-out values.
for report_line, inputs, targets in (
    ("Training Accuracy: {:.4f}", Xcnn_train, label_train),
    ("Testing Accuracy:  {:.4f}", Xcnn_test, label_test),
):
    loss, accuracy = textcnnmodel.evaluate(inputs, targets, verbose=False)
    print(report_line.format(accuracy))

Training Accuracy: 1.0000
Testing Accuracy:  0.9500
