In [202]:
import pathlib
import pandas as pd
import random 

# Project layout. Every location is derived from the current working
# directory: the notebook is expected to be run from a folder that sits
# directly under the project root, so the root is the parent of the cwd.
BASE_DIR = pathlib.Path().resolve().parent
DATASETS_DIR = BASE_DIR / 'Datasets'  # <project root>/Datasets

ZIPS_DIR = DATASETS_DIR / 'Zips'  # <project root>/Datasets/Zips
ZIPS_DIR.mkdir(exist_ok=True, parents=True)

# Spam-Classifier folder: START
SPAM_CLASSIFIER_DIR = DATASETS_DIR / 'Spam-Classifier'

# SMS_SPAM_DIR is bound exactly once, to the dataset folder. An earlier
# assignment to the working directory was always overwritten here before
# being read, so it has been dropped to avoid confusion.
SMS_SPAM_DIR = SPAM_CLASSIFIER_DIR / 'Sms-Spam'
SMS_SPAM_DIR.mkdir(exist_ok=True, parents=True)

YOUTUBE_SPAM_DIR = SPAM_CLASSIFIER_DIR / 'Youtube-Spam'
YOUTUBE_SPAM_DIR.mkdir(exist_ok=True, parents=True)
# Spam-Classifier folder: END

# Exports folder: START
EXPORT_DIR = DATASETS_DIR / 'Exports'
EXPORT_DIR.mkdir(exist_ok=True, parents=True)
# The three names below are file paths inside EXPORT_DIR (SPAM_DATASETS_DIR
# is a CSV file despite its _DIR suffix; the name is kept for compatibility).
SPAM_DATASETS_DIR = EXPORT_DIR / 'Spam_Dataset.csv'
METADATA_EXPORT_PATH = EXPORT_DIR / 'Spam-Metadata.pkl'
TOKENIZER_EXPORT_PATH = EXPORT_DIR / 'Spam-Tokenizer.json'
# Exports folder: END

In [203]:
# Load the exported spam dataset from the CSV path configured above.
# The bare name on the last line makes the notebook render the DataFrame.
df = pd.read_csv(filepath_or_buffer=SPAM_DATASETS_DIR)
df

Unnamed: 0,CONTENT,CLASS,LABEL,SOURCE
0,"Go until jurong point, crazy.. Available only ...",0,ham,sms-spam
1,Ok lar... Joking wif u oni...,0,ham,sms-spam
2,Free entry in 2 a wkly comp to win FA Cup fina...,1,spam,sms-spam
3,U dun say so early hor... U c already then say...,0,ham,sms-spam
4,"Nah I don't think he goes to usf, he lives aro...",0,ham,sms-spam
...,...,...,...,...
7523,How can this have 2 billion views when there's...,0,ham,Spam-youtube
7524,I don't now why I'm watching this in 2014﻿,0,ham,Spam-youtube
7525,subscribe to me for call of duty vids and give...,1,spam,Spam-youtube
7526,hi guys please my android photo editor downloa...,1,spam,Spam-youtube


In [204]:
# Pull the CONTENT, CLASS and LABEL columns out of the DataFrame as plain
# Python lists (same row order as df).
CONTENT, CLASS, LABEL = (
    df[column].tolist() for column in ('CONTENT', 'CLASS', 'LABEL')
)

# Label name -> class id, plus the reverse lookup whose keys are the class
# ids rendered as strings ('0', '1').
label_legend = dict(ham=0, spam=1)
label_legend_inverted = {
    str(class_id): name for name, class_id in label_legend.items()
}


In [205]:
# Spot-check that the extracted lists line up row-for-row with the DataFrame.
#
# random.randrange(n) draws from 0..n-1. The previous random.randint(0, n)
# is inclusive at both ends, so it could return n itself, one past the last
# valid position, and make the lookups below raise IndexError.
random_idx = random.randrange(len(CLASS))

# Fetch the row once and compare each extracted list against it.
sampled_row = df.iloc[random_idx]
assert CONTENT[random_idx] == sampled_row.CONTENT
assert LABEL[random_idx] == sampled_row.LABEL
assert CLASS[random_idx] == sampled_row.CLASS


In [206]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [207]:
# Vocabulary cap: passed to the Keras Tokenizer as `num_words` and stored in
# the exported metadata under 'max_words'.
# NOTE(review): this has the same value as MAX_SEQ_LEN (280). A vocabulary
# limit and a sequence length are independent quantities — confirm that 280
# is really the intended vocabulary size.
MAX_NUM_WORDS: int = 280

In [208]:
# Fit the tokenizer on every message, then convert each message into its
# sequence of integer token ids.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(CONTENT)
sequences = tokenizer.texts_to_sequences(CONTENT)

# Pick one message to inspect. randrange(n) yields 0..n-1; the previous
# randint(0, n) could also return n and index past the end of the lists.
random_idx = random.randrange(len(sequences))
list_1 = sequences[random_idx]  # kept for the decoding cell further down

# Show the raw text next to its token-id encoding.
print(CONTENT[random_idx])
print(list_1)


Check out this video on YouTube:﻿
[23, 17, 11, 57, 16, 73, 19]


In [209]:
# Word -> integer id mapping held by the tokenizer after fit_on_texts;
# used below to translate token ids back into words.
word_index = tokenizer.word_index

In [210]:

# Translate the sampled token-id sequence (list_1) back into words.
# word_index maps word -> id, so invert it once and do one dict lookup per
# token instead of scanning the whole vocabulary for every token.
# Assumes ids in word_index are unique (one word per id) — the original
# nested loop gives the same result under that assumption.
index_to_word = {index: word for word, index in word_index.items()}
listOfKeys = [
    index_to_word[token_id] for token_id in list_1 if token_id in index_to_word
]
print(listOfKeys)


['check', 'out', 'this', 'video', 'on', 'youtube', '\ufeff']


In [211]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [212]:
# Fixed length every token sequence is brought to: passed as `maxlen` to
# pad_sequences and stored in the exported metadata under 'max_seq_len'.
MAX_SEQ_LEN: int = 280

In [213]:
# Build the feature matrix X: every token-id sequence is brought to the same
# length (MAX_SEQ_LEN) so that all rows match.
# NOTE(review): padding/truncation side is left at the library default —
# confirm it matches what the model expects.
X = pad_sequences(
    sequences,
    maxlen=MAX_SEQ_LEN,
)  # Features

In [214]:
from tensorflow.keras.utils import to_categorical
import numpy as np

In [215]:
# One-hot encode the integer class ids to form the model targets.
CLASS_INT = np.asarray(CLASS)
# Pin the number of columns to the number of known labels instead of letting
# to_categorical infer it from the largest id present in the data, so the
# target width does not depend on which classes happen to appear.
y = to_categorical(CLASS_INT, num_classes=len(label_legend))  # Outputs

In [216]:
from sklearn.model_selection import train_test_split

In [217]:
# Hold out 33% of the samples for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [218]:
import pickle

In [220]:
# Everything a downstream training notebook needs, bundled in one dict:
# the train/test split, the preprocessing limits, the label lookups and the
# fitted tokenizer object itself.
training_data = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
    'max_words': MAX_NUM_WORDS,
    'max_seq_len': MAX_SEQ_LEN,
    'label_legend': label_legend,
    'label_legend_inverted': label_legend_inverted,
    'tokenizer': tokenizer,
}

# Also export the tokenizer as standalone JSON. The encoding is stated
# explicitly so the file does not depend on the platform's default encoding.
# write_text returns the number of characters written, which the notebook
# displays as the cell output.
tokenizer_json = tokenizer.to_json()
TOKENIZER_EXPORT_PATH.write_text(tokenizer_json, encoding='utf-8')

1090335

In [221]:
# Serialize the bundled training data to the metadata export file.
# NOTE: pickle files must only be loaded from trusted sources.
with METADATA_EXPORT_PATH.open('wb') as metadata_file:
    pickle.dump(training_data, metadata_file)
