# Classificador de spam via modelo de ML

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
spam = pd.read_csv("spam.csv")
spam.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
spam.shape

(5572, 2)

In [4]:
spam['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [5]:
previ = spam['Message']
classe = spam['Category']

In [6]:
vectorizer_tfidf = TfidfVectorizer()
previsor = vectorizer_tfidf.fit_transform(previ)
previsor.shape

(5572, 8709)

In [7]:
x_train, x_test, y_train, y_test = train_test_split(previsor, classe, test_size=0.3)

In [8]:
forest = RandomForestClassifier(n_estimators=500)
forest.fit(x_train, y_train)

In [9]:
previsoes = forest.predict(x_test)
print(confusion_matrix(y_test, previsoes))

[[1443    2]
 [  37  190]]


In [10]:
print(accuracy_score(y_test, previsoes))
print(metrics.classification_report(y_test, previsoes))

0.9766746411483254
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99      1445
        spam       0.99      0.84      0.91       227

    accuracy                           0.98      1672
   macro avg       0.98      0.92      0.95      1672
weighted avg       0.98      0.98      0.98      1672



# Classificador de Spam via RNA

## funções de ativação: relu e sigmoid

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from keras.models import Sequential
from keras.layers import Dense, Dropout

In [12]:
spam = pd.read_csv("spam.csv")
spam.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(spam['Category'])
display(y)

array([0, 0, 1, ..., 0, 0, 0])

In [14]:
mensagens = spam['Message'].values
x_train, x_test, y_train, y_test = train_test_split(mensagens, y, test_size=0.3)
print(x_train)

['Hope you are having a good week. Just checking in'
 'Dear Voucher holder Have your next meal on us. Use the following link on your pc 2 enjoy a 2 4 1 dining experiencehttp://www.vouch4me.com/etlp/dining.asp'
 'Okey dokey, i‘ll be over in a bit just sorting some stuff out.' ...
 'Fuck cedar key and fuck her (come over anyway tho)'
 'Yo, you around? Just got my car back' 'Ok i msg u b4 i leave my house.']


In [15]:
vectorizer = CountVectorizer()
vectorizer.fit(x_train)
x_train = vectorizer.transform(x_train)
x_test = vectorizer.transform(x_test)
display(x_train.toarray())

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [16]:
input_dim = x_train.shape[1]
output_dim =1
print(input_dim)
print(output_dim)
model = Sequential()
model.add(Dense(units=10, activation='relu', input_dim=input_dim))
model.add(Dropout(0.1))
model.add(Dense(units=8, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(units=output_dim, activation='sigmoid'))

7118
1


In [17]:
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 10)                71190     
                                                                 
 dropout (Dropout)           (None, 10)                0         
                                                                 
 dense_1 (Dense)             (None, 8)                 88        
                                                                 
 dropout_1 (Dropout)         (None, 8)                 0         
                                                                 
 dense_2 (Dense)             (None, 1)                 9         
                                                                 
Total params: 71287 (278.46 KB)
Trainable params: 71287 (278.46 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [18]:
model.fit(x_train, y_train, epochs=20, batch_size=10, verbose=True, validation_data=(x_test, y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7f4494972190>

In [19]:
loss, accuracy = model.evaluate(x_test, y_test)
print('loss: ', loss)
print('Acuracia: ',accuracy)

loss:  0.013948073610663414
Acuracia:  0.9844497442245483


In [20]:
nova_previsao  = model.predict(x_test)
prev = (nova_previsao > 0.5)
print(prev[:10])

[[False]
 [False]
 [ True]
 [False]
 [ True]
 [False]
 [False]
 [False]
 [False]
 [ True]]


In [21]:
cm = confusion_matrix(y_test, prev)
print(cm)

[[1452    0]
 [  26  194]]


## Criando o proprio Embedding

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Embedding

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [23]:
spam = pd.read_csv("spam.csv")
spam.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [24]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(spam['Category'])
display(y[:10])

array([0, 0, 1, 0, 0, 1, 0, 0, 1, 1])

In [25]:
mensagens = spam['Message'].values
x_train, x_test, y_train, y_test = train_test_split(mensagens, y, test_size=0.3)
print(x_train[:10])

["What's the significance?" "I'm good. Have you registered to vote?"
 'Thanks. It was only from tescos but quite nice. All gone now. Speak soon'
 'S:)no competition for him.' "I don't think he has spatula hands!"
 'Was gr8 to see that message. So when r u leaving? Congrats dear. What school and wat r ur plans.'
 "For ur chance to win £250 cash every wk TXT: PLAY to 83370. T's&C's www.music-trivia.net custcare 08715705022, 1x150p/wk."
 'Reason is if the team budget is available at last they buy the unsold players for at base rate..'
 'Yes ammae....life takes lot of turns you can only sit and try to hold the steering...'
 'I donno if they are scorable']


In [26]:
token = Tokenizer(num_words=1000)
token.fit_on_texts(x_train)

x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

In [27]:
print(x_train[:10])

[[478, 5], [28, 58, 17, 3, 2], [169, 14, 65, 62, 43, 25, 372, 220, 52, 837, 21, 418, 247], [209, 38, 11, 122], [1, 96, 106, 63, 117], [65, 928, 2, 83, 20, 157, 23, 47, 72, 6, 561, 643, 113, 53, 460, 7, 146, 72, 35, 929], [11, 35, 329, 2, 193, 775, 182, 230, 308, 74, 321, 2, 604, 140, 508, 537, 930, 308], [838, 9, 33, 5, 9, 644, 29, 168, 114, 183, 5, 11, 29, 373], [143, 164, 309, 15, 3, 26, 62, 7, 269, 2, 839, 5], [1, 33, 114, 22]]


In [28]:
x_train = pad_sequences(x_train, padding='post', maxlen=500)
x_test = pad_sequences(x_test, padding='post', maxlen=500)

In [29]:
print(x_train[0])

[478   5   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   

In [30]:
input_dim = x_train.shape[1]
output_dim =1
vocab = len(token.word_index)
print(input_dim)
print(output_dim)
print(vocab)

500
1
7447


In [31]:
model = Sequential()
model.add(Embedding(input_dim=vocab, output_dim=50, input_length=500))
model.add(Flatten())
model.add(Dense(units=10, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(units=output_dim, activation='sigmoid'))

In [32]:
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 50)           372350    
                                                                 
 flatten (Flatten)           (None, 25000)             0         
                                                                 
 dense_3 (Dense)             (None, 10)                250010    
                                                                 
 dropout_2 (Dropout)         (None, 10)                0         
                                                                 
 dense_4 (Dense)             (None, 1)                 11        
                                                                 
Total params: 622371 (2.37 MB)
Trainable params: 622371 (2.37 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [33]:
model.fit(x_train, y_train, epochs=20, batch_size=10, verbose=True, validation_data=(x_test, y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7f444423fd10>

In [34]:
loss, accuracy = model.evaluate(x_test, y_test)
print('loss: ', loss)
print('Acuracia: ',accuracy)

loss:  0.013001588173210621
Acuracia:  0.9862440228462219


In [35]:
nova_previsao  = model.predict(x_test)
prev = (nova_previsao > 0.5)
print(prev[:20])

[[False]
 [False]
 [ True]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [ True]]


In [36]:
cm = confusion_matrix(y_test, prev)
print(cm)

[[1449    1]
 [  22  200]]
