In [41]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import tensorflow as tf
from tensorflow.keras.layers import Flatten, Dense, Conv2D
from tensorflow.keras.models import Sequential

import sklearn
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer

import random
import string

In [42]:
# load the dataset
def load_dataset(filename):
    df = pd.read_csv(filename)
    
    y = df['Category'] # y: 'Category' column
    y = np.array(y)
    
    x = df['Message'] #x: 'Message' column
    x = np.array(x)
    
    return df, x, y

In [43]:
df, x, y = load_dataset('SPAM text message 20170820 - Data.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Pre-processing the raw data

In [44]:
# check NAs
df.isna().sum()

Category    0
Message     0
dtype: int64

In [45]:
# split the dataset into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [46]:
# check the split results
print(len(y_train))
print(len(y_test))

3900
1672


Machine learning algorithms and deep learning neural networks require that input and output variables are numbers.
This means that categorical data must be encoded to numbers before we can use it to fit and evaluate a model.

In [47]:
# convert string in "Category" column to numerical binary encoding
# reference: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html

def label_encoder(data):
    le = preprocessing.LabelEncoder()
    le.fit(data) # first fit lable encoder
    enc = le.transform(data) # then transform labels to normalized encoding
    return enc

# y_test_enc = le.transform(y_test)

In [48]:
y_train_enc = label_encoder(y_train)
y_test_enc = label_encoder(y_test)

In [49]:
# check the encoding results
print(y_train[:20])
print(y_train_enc[:20])

['ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'ham' 'spam' 'ham' 'ham' 'ham' 'ham' 'ham']
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]


## Tokenizing the cleaned data

In [50]:
# apply tfidf vectorization

tfidfvectorizer = TfidfVectorizer(analyzer='word', stop_words='english')
x_train_vect = tfidfvectorizer.fit_transform(x_train)
x_test_vect = tfidfvectorizer.fit_transform(x_test)

# tfidfvectorizer = TfidfVectorizer(analyzer='word', stop_words='english').fit(x_train)
# x_train_vect = tfidfvectorizer.transform(x_train)

In [93]:
print(x_train[5])

Think I could stop by in like an hour or so? My roommate's looking to stock up for a trip


In [51]:
print(x_train_vect[:5])

  (0, 2221)	0.4595576461719437
  (0, 6907)	0.42057032296489166
  (0, 260)	0.3977642008852706
  (0, 943)	0.35285068394560465
  (0, 3629)	0.33392519134134463
  (0, 3634)	0.31674844532481095
  (0, 5025)	0.3425956765507876
  (1, 5087)	0.564752315585174
  (1, 4329)	0.4556948018194525
  (1, 6399)	0.6880385669683886
  (2, 5909)	0.4598907748288949
  (2, 1882)	0.5577039025555312
  (2, 5320)	0.407962240134966
  (2, 1557)	0.5577039025555312
  (3, 2032)	0.5517950786382567
  (3, 1762)	0.4550182724182232
  (3, 6157)	0.39464721758363247
  (3, 889)	0.4179976347325351
  (3, 5392)	0.3975073759914889
  (4, 3168)	0.517613431846439
  (4, 5675)	0.6128580677760653
  (4, 2864)	0.40046755880208973
  (4, 6864)	0.44283976592107693


In [52]:
print(x_train.shape)
print(x_train_vect.shape)

(3900,)
(3900, 7003)


In [53]:
print(y_train.shape)
print(y_train_enc.shape)

(3900,)
(3900,)


## Build the model

In [54]:
input_dim = x_train_vect.shape[1]
batch_size = 100

In [55]:
# def build_sequential_model():
#     seq_model = tf.keras.Sequential([
#         Dense(64, input_dim=(1,), activation='relu'),
#         Dense(32, activation='relu'),
#         Dense(16, activation='relu'),
#         Dense(1, activation='sigmoid')
#     ])
#     return seq_model

In [56]:
model=tf.keras.Sequential()
model.add(Dense(16, input_dim=input_dim, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [57]:
model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 16)                112064    
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 112,081
Trainable params: 112,081
Non-trainable params: 0
_________________________________________________________________


In [58]:
history = model.fit(x_train_vect, y_train_enc,
                    epochs=5,
                    verbose=True,
                    validation_data=(x_test_vect, y_test_enc),
                    batch_size=100)

ValueError: Error when checking input: expected dense_2_input to have shape (7003,) but got array with shape (4183,)