In [10]:
import pathlib
import pandas as pd
import random 
import pickle

# Project layout. Every location is derived from the current working directory,
# so the notebook carries no machine-specific absolute path.
#
#   <BASE_DIR>/                      parent of the current working directory
#       Datasets/
#           Zips/
#           Spam-Classifier/
#               Sms-Spam/
#               Youtube-Spam/
#           Exports/
BASE_DIR = pathlib.Path().resolve().parent
DATASETS_DIR = BASE_DIR / 'Datasets'

ZIPS_DIR = DATASETS_DIR / 'Zips'
ZIPS_DIR.mkdir(exist_ok = True, parents = True)

# Spam-Classifier folder: START
SPAM_CLASSIFIER_DIR = DATASETS_DIR / 'Spam-Classifier'

# SMS_SPAM_DIR is the SMS dataset folder under Datasets/Spam-Classifier,
# not the folder the notebook itself runs from.
SMS_SPAM_DIR = SPAM_CLASSIFIER_DIR / 'Sms-Spam'
SMS_SPAM_DIR.mkdir(exist_ok = True, parents = True)

YOUTUBE_SPAM_DIR = SPAM_CLASSIFIER_DIR / 'Youtube-Spam'
YOUTUBE_SPAM_DIR.mkdir(exist_ok = True, parents = True)
# Spam-Classifier folder: END

# Exports folder: START
EXPORT_DIR = DATASETS_DIR / 'Exports'
EXPORT_DIR.mkdir(exist_ok = True, parents = True)

# The three names below are file paths inside EXPORT_DIR; none of them is
# created here. NOTE: SPAM_DATASETS_DIR points at a CSV file despite its
# "_DIR" suffix; the name is kept so code that already uses it keeps working.
SPAM_DATASETS_DIR = EXPORT_DIR / 'Spam_Dataset.csv'
METADATA_EXPORT_PATH = EXPORT_DIR / 'Spam-Metadata.pkl'
TOKENIZER_EXPORT_PATH = EXPORT_DIR / 'Spam-Tokenizer.json'
# Exports folder: END

In [11]:
# `data` starts as an empty dictionary and is then replaced by whatever object
# was pickled into the metadata export file (METADATA_EXPORT_PATH,
# i.e. Spam-Metadata.pkl).
data = {}

# NOTE(review): unpickling can execute arbitrary code, so only load a metadata
# file that this project wrote itself.
data = pickle.loads(pathlib.Path(METADATA_EXPORT_PATH).read_bytes())


In [12]:
# Bind every entry of the loaded metadata dictionary to a notebook-level name.
# The target names on the left mirror the dictionary keys on the right, in the
# same order; a missing key raises KeyError.
(
    X_train,
    X_test,
    y_train,
    y_test,
    max_words,
    max_seq_len,
    label_legend,
    label_legend_inverted,
    tokenizer,
) = (
    data[key]
    for key in (
        'X_train',
        'X_test',
        'y_train',
        'y_test',
        'max_words',
        'max_seq_len',
        'label_legend',
        'label_legend_inverted',
        'tokenizer',
    )
)

In [13]:
import json

In [14]:
# `data_json` starts as an empty dictionary and is then replaced by the parsed
# content of the tokenizer export file (TOKENIZER_EXPORT_PATH,
# i.e. Spam-Tokenizer.json).
data_json = {}

# The file is read as raw bytes; json detects the text encoding itself.
data_json = json.loads(pathlib.Path(TOKENIZER_EXPORT_PATH).read_bytes())


In [15]:
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.models import Model, Sequential 

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [16]:
# Network hyper-parameters.
embed_dim = 128   # size of each learned token embedding vector
lstm_out = 196    # number of units in the LSTM layer

# Architecture: token ids -> embeddings -> spatial dropout -> LSTM -> softmax
# over two output classes.
model = Sequential()
# Vocabulary size comes from the saved metadata (max_words); the sequence
# length is taken from the second dimension of the training matrix.
# NOTE(review): newer Keras releases deprecate `input_length` - confirm against
# the installed version before upgrading.
model.add(Embedding(max_words, embed_dim, input_length = X_train.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(2, activation='softmax'))
# NOTE(review): categorical_crossentropy presumes y_train / y_test hold one
# column per class - confirm against the preprocessing notebook.
model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to the output.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 280, 128)          35840     
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 280, 128)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 394       
Total params: 291,034
Trainable params: 291,034
Non-trainable params: 0
_________________________________________________________________
None


In [17]:
# Train the network.
# NOTE(review): the test split doubles as validation data here, so the
# validation metrics are not an independent estimate - confirm this is intended.

batch_size = 32   # samples per gradient update
epochs = 5        # full passes over the training data

# Left as the last expression so the returned History object is displayed.
model.fit(
    X_train,
    y_train,
    batch_size = batch_size,
    epochs = epochs,
    validation_data = (X_test, y_test),
    verbose = 1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x134026580>

In [18]:
# Write the trained model to Spam_Model.h5 inside the exports folder.
# The path is converted to a plain string before being handed to model.save.
MODEL_EXPORT_PATH = EXPORT_DIR.joinpath('Spam_Model.h5')
model.save(str(MODEL_EXPORT_PATH))

In [19]:
import numpy as np
#Predicting data: 

def predict(text_str, max_words = 280, max_sequences = 280, tokenizer = None):
    """Score a single text with the notebook's trained model.

    Relies on the notebook-level names ``model``, ``pad_sequences`` and
    ``label_legend_inverted``.

    Parameters
    ----------
    text_str : the text to classify.
    max_words : accepted for call compatibility; not used by this function.
    max_sequences : length the encoded text is padded to (passed as ``maxlen``).
    tokenizer : object providing ``texts_to_sequences``; when it is missing or
        falsy, nothing is scored.

    Returns
    -------
    ``None`` when no tokenizer is supplied, otherwise a list with one
    single-entry dict per model output, e.g.
    ``[{'ham': 0.06}, {'spam': 0.94}]``. Each key is looked up in
    ``label_legend_inverted`` by the output index converted to a string.
    """
    # Without a tokenizer the text cannot be encoded, so there is nothing to score.
    if not tokenizer:
        return None

    # Encode the text as a one-item batch and pad it to the requested length.
    encoded_batch = tokenizer.texts_to_sequences([text_str])
    padded_batch = pad_sequences(encoded_batch, maxlen = max_sequences)

    # Only one text was submitted, so only the first prediction row is kept.
    class_scores = model.predict(padded_batch)[0]

    labeled_preds = []
    for class_idx, score in enumerate(class_scores):
        class_name = f'{label_legend_inverted[str(class_idx)]}'
        labeled_preds.append({class_name: score})
    return labeled_preds

In [32]:
# Try the helper on a hand-written, spam-like message, using the vocabulary
# size, sequence length and tokenizer restored from the metadata file.
predict(
    'Get a huge discount on TV by calling this number 93718738, and visit this webpage https://stackoverflow.com/questions/19537520/attributeerror-nonetype-object-has-no-attribute-lower-python',
    max_words = max_words,
    max_sequences = max_seq_len,
    tokenizer = tokenizer,
)

[{'ham': 0.05917783}, {'spam': 0.94082224}]