<a href="https://colab.research.google.com/github/MannJadhav/Industry-Certification-/blob/main/freecodecamp_challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#
# This code is a solution for the freeCodeCamp "Neural Network SMS Text Classifier" project.
#
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import requests
import zipfile
import io

# Step 1: Download and Prepare the Dataset
# -------------------------------------------
# URL of the dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'

# Download the archive. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() reports an HTTP error right here
# instead of letting an error page fail later as an unreadable zip file.
r = requests.get(url, timeout=60)
r.raise_for_status()

# Extract into the current working directory; the context manager closes the
# in-memory archive once extraction is done.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    z.extractall()

# Load the dataset into a pandas DataFrame.
# The file is tab-separated with no header row: column 0 is the label and
# column 1 is the message text.
df = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['label', 'message'])

# Encode the labels ('ham' -> 0, 'spam' -> 1)
encoder = LabelEncoder()
df['label_encoded'] = encoder.fit_transform(df['label'])

# Get messages and labels as plain arrays
messages = df['message'].values
labels = df['label_encoded'].values

# Split data into training and testing sets (80% / 20%, fixed seed so the
# split is reproducible between runs)
X_train, X_test, y_train, y_test = train_test_split(messages, labels, test_size=0.2, random_state=42)

# Step 2: Text Vectorization
# ----------------------------
# Hyperparameters shared by the tokenizer, the padding step and the model.
vocab_size = 5000   # size of the vocabulary passed to the tokenizer and the embedding
max_length = 100    # every message is padded or truncated to this many tokens
embedding_dim = 16  # width of each word-embedding vector

# The vocabulary is learned from the training messages only; words outside it
# are mapped to the "<OOV>" token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Map each message to its list of integer token ids.
train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

# Both splits are padded with the same settings: zeros are appended at the end
# of short messages and long messages are cut at the end.
_pad_options = dict(maxlen=max_length, padding='post', truncating='post')
train_padded = pad_sequences(train_sequences, **_pad_options)
test_padded = pad_sequences(test_sequences, **_pad_options)

# Step 3: Build the Neural Network Model
# ----------------------------------------
# Embedding -> LSTM -> small dense head with dropout -> single sigmoid unit.
model = Sequential([
    # mask_zero=True makes the downstream LSTM skip the 0-valued padding steps.
    # The inputs are post-padded to max_length, so without masking the LSTM's
    # final state is computed after a long run of padding tokens and the
    # network tends to collapse to always predicting the majority class.
    Embedding(vocab_size, embedding_dim, input_length=max_length, mask_zero=True),
    LSTM(64),
    Dense(24, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid') # Sigmoid for binary classification
])

# Compile the model: binary cross-entropy matches the single sigmoid output
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

# Step 4: Train the Model
# -------------------------
# Fit on the padded training messages. The held-out split is passed as
# validation data, so its loss and accuracy are reported after every epoch.
num_epochs = 10
history = model.fit(
    x=train_padded,
    y=y_train,
    validation_data=(test_padded, y_test),
    epochs=num_epochs,
    verbose=2,  # one summary line per epoch
)

# Step 5: Create the Prediction Function
# ----------------------------------------
# This is the function that the freeCodeCamp test will run
def predict_message(pred_text):
    """Classify a single SMS text as "ham" or "spam" with the trained model.

    Args:
        pred_text: The message text to classify.

    Returns:
        A two-element list ``[score, label]``. ``score`` is the model's
        sigmoid output for the message; ``label`` is "spam" when the score
        is at least 0.5 and "ham" otherwise.
    """
    # Encode the text the same way the training data was encoded:
    # token ids first, then post-padding/truncation to max_length.
    encoded = pad_sequences(
        tokenizer.texts_to_sequences([pred_text]),
        maxlen=max_length,
        padding='post',
        truncating='post',
    )

    # A batch of one message: take the single output value of its single row.
    score = model.predict(encoded)[0][0]

    return [score, "spam" if score >= 0.5 else "ham"]

# Step 6: Test the function
# ---------------------------
# Two hand-written sample messages to run through the classifier
test_message_1 = "Congratulations! You've won a free ticket to the Bahamas. Click here to claim your prize."
test_message_2 = "Hey, are you available for a meeting tomorrow at 2 PM?"

# Score both samples first, before any result is printed
prediction_1 = predict_message(test_message_1)
prediction_2 = predict_message(test_message_2)

# Report each result; the first report is followed by a blank line
demo_results = (
    (test_message_1, prediction_1, "\n"),
    (test_message_2, prediction_2, ""),
)
for sample_text, (score, verdict), trailer in demo_results:
    print(f"Message: '{sample_text}'")
    print(f"Prediction: {verdict} (Probability: {score:.4f}){trailer}")

# Run this cell in Colab to test your function
# The test function from freeCodeCamp will call the `predict_message` function
def test_predict_message():
    """Sanity-check the shape and value ranges of ``predict_message`` output.

    Runs ``predict_message`` on a fixed sample sentence and verifies that the
    result is a two-element list whose first item is a float in [0, 1] and
    whose second item is either "ham" or "spam".

    Returns:
        The string "All tests passed!" when every check succeeds.

    Raises:
        AssertionError: If any of the checks fails.
    """
    pred_text = "how are you doing today?"
    prediction = predict_message(pred_text)
    assert isinstance(prediction, list), "prediction should be a list"
    assert len(prediction) == 2, "prediction list should have 2 elements"
    # Accept any float type (Python float or NumPy floating scalar), not only
    # np.float32, so the check matches what its message promises.
    assert isinstance(prediction[0], (float, np.floating)), "prediction should be a float"
    assert 0 <= prediction[0] <= 1, "prediction should be a value between 0 and 1"
    assert prediction[1] in ("ham", "spam"), "prediction result should be 'ham' or 'spam'"
    return "All tests passed!"

# Announce the self-check, run it, and show the message it returns
print("\nRunning unit tests...")
test_outcome = test_predict_message()
print(test_outcome)



Epoch 1/10
140/140 - 9s - 67ms/step - accuracy: 0.8652 - loss: 0.4388 - val_accuracy: 0.8664 - val_loss: 0.4009
Epoch 2/10
140/140 - 10s - 70ms/step - accuracy: 0.8658 - loss: 0.4150 - val_accuracy: 0.8664 - val_loss: 0.4043
Epoch 3/10
140/140 - 11s - 77ms/step - accuracy: 0.8658 - loss: 0.4113 - val_accuracy: 0.8664 - val_loss: 0.3938
Epoch 4/10
140/140 - 11s - 77ms/step - accuracy: 0.8658 - loss: 0.4082 - val_accuracy: 0.8664 - val_loss: 0.3934
Epoch 5/10
140/140 - 6s - 42ms/step - accuracy: 0.8658 - loss: 0.4124 - val_accuracy: 0.8664 - val_loss: 0.3944
Epoch 6/10
140/140 - 11s - 79ms/step - accuracy: 0.8658 - loss: 0.4126 - val_accuracy: 0.8664 - val_loss: 0.3945
Epoch 7/10
140/140 - 9s - 68ms/step - accuracy: 0.8658 - loss: 0.4093 - val_accuracy: 0.8664 - val_loss: 0.3951
Epoch 8/10
140/140 - 11s - 75ms/step - accuracy: 0.8658 - loss: 0.4077 - val_accuracy: 0.8664 - val_loss: 0.3933
Epoch 9/10
140/140 - 7s - 48ms/step - accuracy: 0.8658 - loss: 0.4089 - val_accuracy: 0.8664 - val_