<a href="https://colab.research.google.com/github/KeremAydin98/machine-learning-with-python-projects/blob/main/sms_text_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

--2022-04-27 10:35:36--  https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 104.26.2.33, 172.67.70.149, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 358233 (350K) [text/tab-separated-values]
Saving to: ‘train-data.tsv’


2022-04-27 10:35:37 (5.97 MB/s) - ‘train-data.tsv’ saved [358233/358233]

--2022-04-27 10:35:37--  https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 104.26.2.33, 172.67.70.149, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 118774 (116K) [text/tab-separated-values]
Saving to: ‘valid-data.tsv’


2022-04-27 10:35:37 (7.91 MB/s) - ‘valid-data.tsv’ saved [118774/118774]



In [8]:
import pandas as pd

In [28]:
train_df = pd.read_csv(train_file_path, sep='\t',header=1).rename(columns={"ham":"target","you can never do nothing":"sentence"})
test_df = pd.read_csv(test_file_path, sep='\t',header=1).rename(columns={"ham":"target","you can never do nothing":"sentence"})

In [29]:
train_df

Unnamed: 0,target,sentence
0,ham,"now u sound like manky scouse boy steve,like! ..."
1,ham,mum say we wan to go then go... then she can s...
2,ham,never y lei... i v lazy... got wat? dat day ü ...
3,ham,in xam hall boy asked girl tell me the startin...
4,ham,genius what's up. how your brother. pls send h...
...,...,...
4172,ham,just woke up. yeesh its late. but i didn't fal...
4173,ham,what do u reckon as need 2 arrange transport i...
4174,spam,free entry into our £250 weekly competition ju...
4175,spam,-pls stop bootydelious (32/f) is inviting you ...


In [30]:
train_df.columns

Index(['target', 'sentence'], dtype='object')

### Train test split

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
features = train_df["sentence"]
labels = train_df["target"]

x_train, x_val, y_train, y_val = train_test_split(features, labels,test_size=0.3,random_state=42)

In [49]:
y_train = y_train.map({"ham":0,"spam":1})
y_val = y_val.map({"ham":0,"spam":1})

In [33]:
x_train[:5]

2830    i cant pick the phone right now. pls send a me...
925     hey mate. spoke to the mag people. we‘re on.  ...
3845                 will ü b going to esplanade fr home?
547     but really quite funny lor wat... then u shd h...
2259    free unlimited hardcore porn direct 2 your mob...
Name: sentence, dtype: object

### Tokenization

In [37]:
max_vocab_length = 1000
max_length = 15

In [38]:
import tensorflow as tf

In [39]:
text_vectorizer = tf.keras.layers.TextVectorization(
 max_tokens=max_vocab_length, #Maximum size of the vocabulary for this layer.
 output_mode='int',
 output_sequence_length=max_length)

In [40]:
text_vectorizer.adapt(x_train)

In [41]:
text_vectorizer("Hello darling")

<tf.Tensor: shape=(15,), dtype=int64, numpy=
array([336,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0])>

### Embedding

* input_dim = the size of our vocabulary
* output_dim = the size of the output embedding vector
* input_length = length of the sequences being passed to the embedding layer

In [42]:
embedding = tf.keras.layers.Embedding(input_dim = max_vocab_length,output_dim = 128, input_length = max_length) # Turns positive integers (indexes) into dense vectors of fixed size.

### Create the model

In [44]:

inputs = tf.keras.layers.Input(shape=(1,),dtype=tf.string)

x = text_vectorizer(inputs)

x = embedding(x)

x = tf.keras.layers.LSTM(64, return_sequences=True)(x)

x = tf.keras.layers.LSTM(64)(x)

x = tf.keras.layers.Dense(64, activation = "relu")(x)

outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)

model = tf.keras.models.Model(inputs, outputs)

model.compile(loss=tf.keras.losses.binary_crossentropy,
              optimizer=tf.keras.optimizers.Adam(),
              metrics=["accuracy"])

In [45]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 15)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 15, 128)           128000    
                                                                 
 lstm_2 (LSTM)               (None, 15, 64)            49408     
                                                                 
 lstm_3 (LSTM)               (None, 64)                33024     
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                             

In [51]:
history = model.fit(x_train, y_train, validation_data=(x_val, y_val),epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [63]:
x_val.shape

(1254,)

In [56]:
model_preds_probs = model.predict(x_val)
model_preds_probs

array([[0.00379327],
       [0.00668374],
       [0.00336072],
       ...,
       [0.00302961],
       [0.34188503],
       [0.00179797]], dtype=float32)

In [58]:
model_preds = tf.squeeze(tf.round(model_preds_probs))
model_preds

<tf.Tensor: shape=(1254,), dtype=float32, numpy=array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)>

In [66]:
import numpy as np

In [72]:
classes = ["ham","spam"]

In [73]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(pred_text):

  prediction_probs = model.predict(np.array([pred_text]))

  prediction_index = int(tf.squeeze(tf.round(prediction_probs)))

  prediction = classes[prediction_index]

  return prediction

pred_text = "how are you doing today?"

prediction = predict_message(pred_text)
print(prediction)

ham


In [79]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  test_answers = ["ham", "ham", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    if prediction != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()

You passed the challenge. Great job!
