In [164]:
import pathlib
import pandas as pd
import random 

# Directory layout, derived from the kernel's working directory so that the
# notebook does not depend on any machine-specific absolute path.
NOTEBOOK_DIR = pathlib.Path().resolve()  # directory the notebook is run from
BASE_DIR = NOTEBOOK_DIR.parent           # parent of the notebook directory
DATASETS_DIR = BASE_DIR / 'Datasets'

ZIPS_DIR = DATASETS_DIR / 'Zips'
ZIPS_DIR.mkdir(exist_ok=True, parents=True)

#Spam-Classifier folder: START
SPAM_CLASSIFIER_DIR = DATASETS_DIR / 'Spam-Classifier'

# NOTE: SMS_SPAM_DIR used to be assigned the notebook directory first and was
# then overwritten here; only this dataset folder is its intended value.
SMS_SPAM_DIR = SPAM_CLASSIFIER_DIR / 'Sms-Spam'
SMS_SPAM_DIR.mkdir(exist_ok=True, parents=True)

YOUTUBE_SPAM_DIR = SPAM_CLASSIFIER_DIR / 'Youtube-Spam'
YOUTUBE_SPAM_DIR.mkdir(exist_ok=True, parents=True)
#Spam-Classifier folder: END

#Exports folder: START
EXPORT_DIR = DATASETS_DIR / 'Exports'
EXPORT_DIR.mkdir(exist_ok=True, parents=True)
# The names below are file paths inside EXPORT_DIR (SPAM_DATASETS_DIR is a CSV
# file despite its "_DIR" suffix; the name is kept because later cells use it).
SPAM_DATASETS_DIR = EXPORT_DIR / 'Spam_Dataset.csv'
METADATA_EXPORT_PATH = EXPORT_DIR / 'Spam-Metadata.json'
METADATA_EXPORT_PATH_pkl = EXPORT_DIR / 'Spam-Metadata.pkl'
TOKENIZER_EXPORT_PATH = EXPORT_DIR / 'Spam-Tokenizer.json'
#Exports folder: END

In [165]:
# Load the exported spam dataset into a dataframe. Despite its "_DIR" suffix,
# SPAM_DATASETS_DIR is the path of the CSV file 'Spam_Dataset.csv' in EXPORT_DIR.
df = pd.read_csv(SPAM_DATASETS_DIR)


In [166]:
# Pull the three dataframe columns out as plain Python lists so they can be fed
# to the tokenizer / label encoder: CONTENT holds the message texts and CLASS
# is later one-hot encoded; LABEL is presumably the class name (verify in data).
CONTENT, CLASS, LABEL = (
    df[column].tolist() for column in ('CONTENT', 'CLASS', 'LABEL')
)

# Class name -> integer id. Stored in the exported metadata for later use.
label_legend = dict(ham=0, spam=1)
# Reverse lookup (id -> class name). The ids are turned into strings here,
# which is also the form they take as JSON object keys.
label_legend_inverted = {
    str(class_id): class_name for class_name, class_id in label_legend.items()
}


In [167]:
# Pick a random valid row position. random.randint() includes BOTH endpoints,
# so randint(0, len(CLASS)) could return len(CLASS) and raise IndexError below;
# randrange() excludes the upper bound.
random_idx = random.randrange(len(CLASS))

#Sanity check: the extracted lists must line up row-for-row with the dataframe.
#Checks CONTENT, LABEL and CLASS at the same random position.
assert CONTENT[random_idx] == df.iloc[random_idx].CONTENT, "CONTENT list is misaligned with df"
assert LABEL[random_idx]   == df.iloc[random_idx].LABEL, "LABEL list is misaligned with df"
assert CLASS[random_idx]   == df.iloc[random_idx].CLASS, "CLASS list is misaligned with df"

In [168]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [169]:
# Vocabulary cap handed to the Keras Tokenizer as `num_words`; also exported in
# the metadata under the key 'max_words'.
MAX_NUM_WORDS = 1000

In [170]:
# Fit the tokenizer's vocabulary on every message, then turn each message into
# a list of integer word indices.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(CONTENT)
sequences = tokenizer.texts_to_sequences(CONTENT)

# Spot-check one random message against its encoded form.
# randrange() excludes the upper bound; the previous randint(0, len(CLASS))
# could yield len(CLASS), which is one past the last valid index.
random_idx = random.randrange(len(CLASS))
list_1 = sequences[random_idx]  # kept under this name: the decode cell reads it

print(CONTENT[random_idx])
print(list_1)

Hey what how about your project. Started aha da.
[94, 61, 53, 89, 18, 863, 403, 130]


In [171]:
# The fitted tokenizer's word -> integer index mapping (used below to decode a
# sequence back into words).
word_index = tokenizer.word_index

In [172]:

# Decode the sampled sequence back into words to sanity-check the tokenizer.
# The reverse (index -> word) mapping is built once, instead of scanning the
# whole vocabulary for every token in the sequence.
index_to_word = {index: word for word, index in word_index.items()}
# Indices with no vocabulary entry are skipped, as in the original nested loop.
listOfKeys = [index_to_word[i] for i in list_1 if i in index_to_word]
print(listOfKeys)


['hey', 'what', 'how', 'about', 'your', 'project', 'started', 'da']


In [173]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [174]:
# Fixed sequence length handed to pad_sequences as `maxlen`; also exported in
# the metadata under the key 'max_seq_len'.
MAX_SEQ_LEN = 500

In [175]:
#Build the feature matrix X: every encoded message is brought to exactly
#MAX_SEQ_LEN entries. The padding/truncating arguments spell out the library
#defaults (pad and cut at the front of each sequence), so behaviour is unchanged.
X = pad_sequences(
    sequences,
    maxlen=MAX_SEQ_LEN,
    padding='pre',
    truncating='pre',
)  #Features


In [176]:
from tensorflow.keras.utils import to_categorical
import numpy as np

In [177]:
# Convert the class ids to a NumPy array and one-hot encode them as the targets.
# NOTE(review): num_classes is not passed, so the number of output columns is
# presumably inferred from the largest id present in CLASS — confirm this
# matches len(label_legend) for every dataset export.
CLASS_INT = np.asarray(CLASS)
y = to_categorical(CLASS_INT) #Outputs (targets)

In [178]:
from sklearn.model_selection import train_test_split

In [179]:
# Hold out 33% of the samples for evaluation. The fixed random_state keeps the
# split identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [180]:
import pickle

In [182]:
# Bundle written to the pickle export in the final cell: the train/test arrays,
# the preprocessing constants, the label mappings and the fitted tokenizer.
training_data = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
    'max_words': MAX_NUM_WORDS,
    'max_seq_len': MAX_SEQ_LEN,
    'label_legend': label_legend,
    'label_legend_inverted': label_legend_inverted,
    'tokenizer': tokenizer,
}

# Save the fitted tokenizer as JSON. The encoding is set explicitly so the file
# does not depend on the platform default and matches the metadata JSON export.
# write_text() returns the number of characters written, which the notebook
# shows as this cell's output.
tokenizer_json = tokenizer.to_json()
TOKENIZER_EXPORT_PATH.write_text(tokenizer_json, encoding="utf8")

1090336

In [183]:
# JSON-serialisable description of the preprocessing: label mappings plus the
# vocabulary and sequence-length limits. Key order matches the exported file.
METADATA = dict(
    label_legend_inverted=label_legend_inverted,
    label_legend=label_legend,
    max_words=MAX_NUM_WORDS,
    max_seq_len=MAX_SEQ_LEN,
)
METADATA

{'label_legend_inverted': {'0': 'ham', '1': 'spam'},
 'label_legend': {'ham': 0, 'spam': 1},
 'max_words': 1000,
 'max_seq_len': 500}

In [184]:
import json

In [189]:
# Write the metadata as a JSON text file.
with METADATA_EXPORT_PATH.open("w", encoding="utf8") as metadata_file:
    json.dump(METADATA, metadata_file)

# Write the full training bundle (arrays + fitted tokenizer) as a binary pickle.
# Pickle files can execute code when loaded, so only load this file from a
# trusted location.
with METADATA_EXPORT_PATH_pkl.open('wb') as bundle_file:
    pickle.dump(training_data, bundle_file)

