In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from keras.models import Sequential
from keras.layers import Dense,Dropout,Flatten,Embedding
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

In [8]:
spam = pd.read_csv('spam.csv')
spam.head(2)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...


In [9]:
lber = LabelEncoder()
y = lber.fit_transform(spam['Category'])
y

array([0, 0, 1, ..., 0, 0, 0])

In [13]:
msg = spam['Message'].values
x_train,xtest,y_train,ytest = train_test_split(msg,y,test_size = 0.3,random_state = 0)

In [14]:
token = Tokenizer(num_words = 1000)
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
xtest = token.texts_to_sequences(xtest)

In [15]:
x_train

[[48, 22, 3, 32, 849, 44, 18, 3],
 [295, 31, 29, 11, 117, 737, 8],
 [51, 79, 25, 6, 274, 10, 46, 448, 42, 6],
 [215, 20, 64, 1, 514, 167, 64],
 [1, 789, 6, 145, 95, 20, 94, 248, 565],
 [38, 7, 566, 113, 1, 62, 641, 175],
 [488, 515, 28, 17, 2, 924, 18, 2],
 [30, 87, 47, 179, 11, 567, 75, 84, 344],
 [103, 54, 167, 20, 64, 1, 62, 44],
 [137, 90, 1, 102, 466, 15, 7],
 [52, 88, 677, 677, 9, 254, 23, 38, 158],
 [345, 49, 541, 12, 168, 242],
 [51, 1, 568, 13, 199, 48, 678],
 [53, 131, 569, 50, 570, 183, 679, 191],
 [136, 31, 489, 23, 31, 680, 15, 6, 23, 150, 86, 6, 159],
 [850, 255, 10, 164, 36, 28],
 [275,
  8,
  11,
  308,
  3,
  27,
  27,
  276,
  27,
  27,
  385,
  27,
  242,
  7,
  354,
  27,
  681,
  571,
  54,
  167,
  103],
 [370, 42, 5, 60, 58, 27, 11, 467, 7, 47, 1, 68, 9, 6],
 [49],
 [1, 59, 12, 82, 28, 3, 17, 106, 8, 400],
 [87, 256],
 [6, 2, 133, 58, 26, 925, 216, 490, 58, 5],
 [21, 23, 52, 28, 3, 59, 13, 12, 333, 1, 62, 12, 333, 1],
 [682, 334, 572, 64, 371, 19],
 [103,
  683,


In [16]:
x_train = pad_sequences(x_train,padding = 'post',maxlen = 500)
xtest = pad_sequences(xtest,padding = 'post',maxlen = 500)

In [17]:
x_train

array([[ 48,  22,   3, ...,   0,   0,   0],
       [295,  31,  29, ...,   0,   0,   0],
       [ 51,  79,  25, ...,   0,   0,   0],
       ...,
       [ 13,  27, 278, ...,   0,   0,   0],
       [ 86,   6,   8, ...,   0,   0,   0],
       [490,  88,   1, ...,   0,   0,   0]])

In [18]:
len(token.word_index)

7551

In [23]:
modelo = Sequential()

modelo.add(Embedding(input_dim = len(token.word_index),output_dim = 50,input_length = 500))
modelo.add(Flatten())
modelo.add(Dense(units = 10,activation = 'relu'))
modelo.add(Dropout(0.1))
modelo.add(Dense(units=1,activation = 'sigmoid'))
modelo.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 500, 50)           377550    
                                                                 
 flatten_2 (Flatten)         (None, 25000)             0         
                                                                 
 dense_3 (Dense)             (None, 10)                250010    
                                                                 
 dropout_1 (Dropout)         (None, 10)                0         
                                                                 
 dense_4 (Dense)             (None, 1)                 11        
                                                                 
Total params: 627,571
Trainable params: 627,571
Non-trainable params: 0
_________________________________________________________________


In [24]:
modelo.compile(loss = 'mean_squared_error',optimizer='adam',metrics = ['accuracy'])

In [26]:
modelo.fit(x_train,y_train, epochs = 20, batch_size = 10, verbose = True, validation_data = (xtest,ytest))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x200590be190>

In [27]:
loss, accuracy = modelo.evaluate(xtest,ytest)
print('loss: ',loss,'\n','acuracia: ',accuracy)

loss:  0.014415722340345383 
 acuracia:  0.9844497442245483


In [29]:
newprev = modelo.predict(xtest)
newprev

array([[3.7726443e-11],
       [9.9946940e-01],
       [2.9394934e-08],
       ...,
       [6.0024452e-09],
       [4.1820099e-11],
       [4.3546162e-09]], dtype=float32)

In [30]:
prev = (newprev>0.5)
prev

array([[False],
       [ True],
       [False],
       ...,
       [False],
       [False],
       [False]])

In [31]:
cm = confusion_matrix(ytest,prev)
cm

array([[1445,    6],
       [  20,  201]], dtype=int64)

# Atividade 10 - RN2

In [32]:
from keras.layers import Conv1D, MaxPool1D, GlobalMaxPooling1D, GlobalAveragePooling1D

In [84]:
model = Sequential()


model = Sequential()
model.add(Embedding(input_dim=len(token.word_index), output_dim=50, input_length=500))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 500, 50)           377550    
                                                                 
 conv1d_5 (Conv1D)           (None, 496, 128)          32128     
                                                                 
 global_max_pooling1d_4 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_14 (Dense)            (None, 10)                1290      
                                                                 
 dense_15 (Dense)            (None, 1)                 11        
                                                                 
Total params: 410,979
Trainable params: 410,979
Non-trainable params: 0
_______________________________________________

In [85]:
model.compile(loss='binary_crossentropy', optimizer="adam", metrics=['accuracy'])

In [86]:
model.fit(x_train, y_train, batch_size = 20, epochs=10, validation_data = (xtest,ytest), verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2005f90b280>

In [87]:
loss, accuracy = model.evaluate(xtest,ytest)
print('loss: ',loss,'\n','acuracia: ',accuracy)

loss:  0.07506866753101349 
 acuracia:  0.9874401688575745


In [88]:
newprev = model.predict(xtest)
prev = (newprev>0.5)


In [89]:
cm = confusion_matrix(ytest,prev)
cm

array([[1444,    7],
       [  14,  207]], dtype=int64)