In [24]:
import pathlib
import pandas as pd
import random 

BASE_DIR = pathlib.Path().resolve().parent #Check: /Users/kristoffervarslott/Documents/Python.py/AI_API
SMS_SPAM_DIR = pathlib.Path().resolve()    #Check: /Users/kristoffervarslott/Documents/Python.py/AI_API/SMS-SPAM
DATASETS_DIR = BASE_DIR / 'Datasets' #Check: /Users/kristoffervarslott/Documents/Python.py/AI_API/Datasets

ZIPS_DIR = DATASETS_DIR / 'Zips'     #Check: /Users/kristoffervarslott/Documents/Python.py/AI_API/Datasets/Zips
ZIPS_DIR.mkdir(exist_ok = True, parents = True)

#Spam-Classifier folder: START
SPAM_CLASSIFIER_DIR = DATASETS_DIR / 'Spam-Classifier'

SMS_SPAM_DIR = SPAM_CLASSIFIER_DIR / 'Sms-Spam'
SMS_SPAM_DIR.mkdir(exist_ok = True, parents = True)

YOUTUBE_SPAM_DIR = SPAM_CLASSIFIER_DIR / 'Youtube-Spam'
YOUTUBE_SPAM_DIR.mkdir(exist_ok = True, parents = True)
#Spam-Classifier folder : END

#Exports folder: START
EXPORT_DIR = DATASETS_DIR / 'Exports'
EXPORT_DIR.mkdir(exist_ok = True, parents = True)
SPAM_DATASETS_DIR = EXPORT_DIR / 'Spam_Dataset.csv'
#Exports folder: END

In [25]:
df = pd.read_csv(SPAM_DATASETS_DIR)
df

Unnamed: 0,CONTENT,CLASS,LABEL,SOURCE
0,"Go until jurong point, crazy.. Available only ...",0,ham,sms-spam
1,Ok lar... Joking wif u oni...,0,ham,sms-spam
2,Free entry in 2 a wkly comp to win FA Cup fina...,1,spam,sms-spam
3,U dun say so early hor... U c already then say...,0,ham,sms-spam
4,"Nah I don't think he goes to usf, he lives aro...",0,ham,sms-spam
...,...,...,...,...
7523,How can this have 2 billion views when there's...,0,ham,Spam-youtube
7524,I don't now why I'm watching this in 2014﻿,0,ham,Spam-youtube
7525,subscribe to me for call of duty vids and give...,1,spam,Spam-youtube
7526,hi guys please my android photo editor downloa...,1,spam,Spam-youtube


In [48]:
#Declaring both CONTENT and CLASS to list for easy interpreting with AI
CONTENT = df['CONTENT'].tolist() 
CLASS = df['CLASS'].tolist()
LABEL = df['LABEL'].tolist()

label_legend = {'ham': 0, 'spam': 1}
label_legend_inverted = {f"{v}": k for k, v in label_legend.items()}


In [62]:
random_idx = random.randint(0, len(CLASS))


#Assert test to see if indexing is correct displaced through df dataframe. 
#Testing both CONTENT and LABEL\CLASS settings. 
assert CONTENT[random_idx] == df.iloc[random_idx].CONTENT
assert LABEL[random_idx] == df.iloc[random_idx].LABEL
assert CLASS[random_idx] == df.iloc[random_idx].CLASS


In [63]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [68]:
MAX_NUM_WORDS = 280

In [91]:
tokenizer = Tokenizer (num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(CONTENT)
sequences = tokenizer.texts_to_sequences(CONTENT)

In [71]:
word_index = tokenizer.word_index

{'i': 1,
 'to': 2,
 'you': 3,
 'a': 4,
 'the': 5,
 'and': 6,
 'my': 7,
 'u': 8,
 'is': 9,
 'in': 10,
 'this': 11,
 'me': 12,
 'it': 13,
 'for': 14,
 'of': 15,
 'on': 16,
 'out': 17,
 'your': 18,
 '\ufeff': 19,
 'have': 20,
 'so': 21,
 'that': 22,
 'check': 23,
 'are': 24,
 '2': 25,
 'call': 26,
 'if': 27,
 'but': 28,
 'can': 29,
 'just': 30,
 'now': 31,
 'not': 32,
 'be': 33,
 'at': 34,
 'will': 35,
 'do': 36,
 'or': 37,
 'like': 38,
 'get': 39,
 'with': 40,
 'up': 41,
 "i'm": 42,
 'we': 43,
 'no': 44,
 'love': 45,
 'ur': 46,
 'from': 47,
 'please': 48,
 'all': 49,
 'com': 50,
 'lt': 51,
 'gt': 52,
 'how': 53,
 'when': 54,
 'go': 55,
 '4': 56,
 'video': 57,
 'know': 58,
 'free': 59,
 'am': 60,
 'what': 61,
 'good': 62,
 'was': 63,
 'ok': 64,
 'time': 65,
 'only': 66,
 'then': 67,
 'got': 68,
 'its': 69,
 'song': 70,
 'come': 71,
 '39': 72,
 'youtube': 73,
 'new': 74,
 'br': 75,
 'as': 76,
 'there': 77,
 'day': 78,
 'want': 79,
 'he': 80,
 'one': 81,
 'www': 82,
 'by': 83,
 'amp': 84,
 

In [72]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [73]:
MAX_SEQ_LEN = 300

In [78]:
X = pad_sequences(sequences, maxlen = MAX_SEQ_LEN)

In [82]:
print(X)

[[  0   0   0 ...  77  68 187]
 [  0   0   0 ...   0  64   8]
 [  0   0   0 ...   2 110 104]
 ...
 [  0   0   0 ...  15   6 137]
 [  0   0   0 ... 180  50  50]
 [  0   0   0 ... 190 241  19]]


In [84]:
from tensorflow.keras.utils import to_categorical
import numpy as np

In [88]:
CLASS_ARRAY = np.asarray(CLASS)
CLASS_ARRAY

array([0, 0, 1, ..., 1, 1, 0])

In [89]:
y = to_categorical(CLASS_ARRAY)

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [1., 0.]], dtype=float32)