In [33]:
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, precision_score, recall_score

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, concatenate
from sklearn.utils import class_weight
from tensorflow.keras.callbacks import EarlyStopping

In [34]:
import numpy as np
import pandas as pd
import spacy
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dropout, Dense, concatenate
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

# Load Spacy model
import en_core_web_sm
#nlp = en_core_web_sm.load()
nlp = spacy.load('en_core_web_sm')

In [35]:
data = pd.read_csv(r"C:\Users\getty\Desktop\my data.csv")

In [36]:
#Data preprocessing
import nltk
from nltk.corpus import stopwords
#nltk.download('punkt')
nltk.download('stopwords')
#nltk.download('wordnet')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\getty\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [37]:
data.loc[data["label"] == "Spam", "label"] = 1.0
data.loc[data["label"] == "Non-Spam", "label"] = 0.0

In [38]:
x = data["text"]
y = data["label"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [1]:
y_train = np.asarray(y_train).astype("float64")
y_test = np.asarray(y_test).astype("float64")

NameError: name 'np' is not defined

In [40]:
embedding_dim = 100
num_filters = 128
dropout_rate = 0.5

In [41]:
# Parameters
max_words = 10000
max_len = 200
max_chars = 200
embedding_dim = 128  # example value
num_filters = 64    # example value
dropout_rate = 0.5  # example value

# Assume x_train and x_test are already defined and preprocessed

In [42]:
def extract_content_features(text):
    doc = nlp(text)
    syntax = len([token.dep_ for token in doc])
    semantic = len([token.ent_type_ for token in doc])
    length_of_text = len(text)
    presence_of_hyperlinks = int('http' in text or 'www' in text)
    return [syntax, semantic, length_of_text, presence_of_hyperlinks]

# Extract content-based features for training and testing sets
x_train_content_features = np.array([extract_content_features(text) for text in x_train])
x_test_content_features = np.array([extract_content_features(text) for text in x_test])

In [43]:
# Function to extract character-based features
def extract_character_features(text):
    num_chars = len(text)
    repeated_chars = len([char for char in text if text.count(char) > 1])
    capitalized_words = len([word for word in text.split() if word.isupper()])
    freq_specific_chars = len([char for char in text if char in '@#$%^&*'])
    return [num_chars, repeated_chars, capitalized_words, freq_specific_chars]

# Extract character-based features for training and testing sets
x_train_char_features = np.array([extract_character_features(text) for text in x_train])
x_test_char_features = np.array([extract_character_features(text) for text in x_test])

In [44]:
# Tokenization and padding for text sequences
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(x_train)

x_train_seq = tokenizer.texts_to_sequences(x_train)
x_train_padded = pad_sequences(x_train_seq, maxlen=max_len)

x_test_seq = tokenizer.texts_to_sequences(x_test)
x_test_padded = pad_sequences(x_test_seq, maxlen=max_len)

# Tokenization and padding for character sequences
def char_tokenizer(text):
    return [ord(char) for char in text]

x_train_char_seq = x_train.apply(char_tokenizer)
x_test_char_seq = x_test.apply(char_tokenizer)

x_train_char_padded = pad_sequences(x_train_char_seq, maxlen=max_chars)
x_test_char_padded = pad_sequences(x_test_char_seq, maxlen=max_chars)

In [45]:
# Ensuring the data types are consistent
x_train_padded = np.asarray(x_train_padded.astype("float64"))
x_test_padded = np.asarray(x_test_padded.astype("float64"))
x_train_char_padded = np.asarray(x_train_char_padded.astype("float64"))
x_test_char_padded = np.asarray(x_test_char_padded.astype("float64"))

In [46]:
#Repeat the 7 samples in x_train_char_padded to match the 11156 samples in x_train_padded
num_repeats = len(x_train_padded) // len(x_train_char_padded)
x_train_char_padded_repeated = np.repeat(x_train_char_padded, num_repeats, axis=0)

# Repeat the samples in x_train_char_padded to match 11151 samples in x_train_padded
num_additional_samples = len(x_train_padded) - len(x_train_char_padded)
additional_samples_indices = np.random.choice(len(x_train_char_padded), num_additional_samples)
x_train_char_padded_adj = np.concatenate((x_train_char_padded, x_train_char_padded[additional_samples_indices]), axis=0)

In [47]:
# Repeat the 7 samples in x_train_char_padded to match the 11156 samples in x_train_padded
num_repeats = len(x_test_padded) // len(x_test_char_padded)
x_test_char_padded_repeated = np.repeat(x_test_char_padded, num_repeats, axis=0)

# Repeat the samples in x_train_char_padded to match 11151 samples in x_train_padded
num_additional_samples = len(x_test_padded) - len(x_test_char_padded)
additional_samples_indices = np.random.choice(len(x_test_char_padded), num_additional_samples)
x_test_char_padded_adj = np.concatenate((x_test_char_padded, x_test_char_padded[additional_samples_indices]), axis=0)

In [48]:
# Merging content and character-based features with the padded sequences
x_train_combined = np.hstack([x_train_padded, x_train_content_features])
x_test_combined = np.hstack([x_test_padded, x_test_content_features])

# Model definitions
content_model = Sequential()
content_model.add(Embedding(max_words, embedding_dim, input_length=max_len))
content_model.add(Conv1D(num_filters, 5, activation='relu'))
content_model.add(GlobalMaxPooling1D())
content_model.add(Dropout(dropout_rate))

char_model = Sequential()
char_model.add(Embedding(len(tokenizer.word_index) + 1, embedding_dim, input_length=max_chars))
char_model.add(Conv1D(num_filters, 5, activation='relu'))
char_model.add(GlobalMaxPooling1D())
char_model.add(Dropout(dropout_rate))

In [49]:
# Merging models
merged = concatenate([content_model.output, char_model.output])
merged = Dense(128, activation='relu')(merged)
merged = Dropout(dropout_rate)(merged)
output = Dense(1, activation='sigmoid')(merged)

# Final model
model = tf.keras.Model(inputs=[content_model.input, char_model.input], outputs=output)

# Compilation
learning_rate = 0.001
optimizer = Adam(learning_rate=learning_rate)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Output model summary
#model.summary()


In [50]:
classes = np.unique(y_train)

# Computing class weights manually
class_weights = {cls: len(y_train) / (len(classes) * (y_train == cls).sum()) for cls in classes}


In [51]:
early_stopping = EarlyStopping(
    monitor='val_loss',  # Monitor validation loss (or choose 'val_accuracy' if you prefer)
    min_delta=0.001,  # Minimum change to qualify as an improvement
    patience=10,  # How many epochs to wait after last time validation loss improved
    verbose=1,
    mode='min',  # 'min' mode means training will stop when the quantity monitored has stopped decreasing
    restore_best_weights=False  # Whether to restore model weights from the epoch with the best value of the monitored quantity.
)

In [54]:
batch_size = 64
model.fit(
    [x_train_padded, x_train_char_padded_adj],  # Training data
    y_train,  # Training labels
    batch_size=batch_size,
    epochs=100,  # Set to a high number since training will stop automatically
    validation_data=([x_test_padded, x_test_char_padded_adj], y_test),  # or use validation_data=(x_val, y_val)
    callbacks=[early_stopping],class_weight = class_weights  # Include the EarlyStopping callback
)

loss, accuracy = model.evaluate([x_test_padded, x_test_char_padded_adj], y_test, batch_size=batch_size)

print("Test Loss:", loss)
print("Test Accuracy:", accuracy)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 11: early stopping
Test Loss: 0.01968015357851982
Test Accuracy: 0.9971325993537903


In [27]:
pred_probs = model.predict([x_test_padded, x_test_char_padded_adj], batch_size=batch_size)



In [29]:
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

# Assuming x_test_padded and x_test_char_padded_adj are your test inputs
# Get model's probability predictions for the positive class
pred_probs = model.predict([x_test_padded, x_test_char_padded_adj])
# Convert probabilities to binary predictions using 0.5 as the threshold
pred_labels = np.where(pred_probs > 0.5, 1, 0)

# Calculate precision, recall, and F1 score
precision = precision_score(y_test, pred_labels)
recall = recall_score(y_test, pred_labels)
f1 = f1_score(y_test, pred_labels)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:",f1)

Precision: 0.9985315712187959
Recall: 0.9956076134699854
F1 Score: 0.9970674486803519


In [30]:
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np


# Generate predictions for the test data
pred_probs = model.predict([x_test_padded, x_test_char_padded_adj], batch_size=batch_size)
# Convert probabilities to class predictions (for multiclass, assuming one-hot encoding)
pred = np.argmax(pred_probs, axis=1)

# If your y_test is one-hot encoded, convert it back to class indices for comparison
if y_test.ndim > 1:
    y_test_indices = np.argmax(y_test, axis=1)
else:
    y_test_indices = y_test

# Calculate precision, recall, and F1 score
f1 = f1_score(y_test_indices, pred, average='weighted')  # Use 'binary' for binary classification, 'macro' for multiclass, average='macro'
precision = precision_score(y_test_indices, pred, average='weighted')
recall = recall_score(y_test_indices, pred,average='weighted')

# Print the calculated metrics
print("F1 Score:",f1)
print("Precision:",precision)
print("Recall:",recall)

F1 Score: 0.6498674120575614
Precision: 0.5703227091121645
Recall: 0.7551971326164875


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [31]:
# Assuming x_train is your training data
text_data = ["Congratulations! You've won a free cruise. Click the link to claim your prize.",
             "Hi there, just checking in. How are you doing?","You are a great role model",
             "100 dating service cal;l 09064012103 box334sk38ch",
            "(Bank of Granite issues Strong-Buy) EXPLOSIVE PICK FOR OUR MEMBERS *****UP OVER 300% *********** Nasdaq Symbol CDGT That is a $5.00 per..",
            "&lt;#&gt; ISH MINUTES WAS 5 MINUTES AGO. WTF.","pay 100 dolars"]

# Tokenize and pad the text data
tokenizer = Tokenizer()

sequences = tokenizer.texts_to_sequences(text_data)
max_len = 200  # Assuming a max sequence length of 200
padded_sequences = pad_sequences(sequences, maxlen=max_len)

# Convert characters in text to ordinals
max_chars = 200  # Assuming a max character length of 200
char_tokenizer_wrapper = lambda x: [ord(char) for char in x]
text_char_seq = list(map(char_tokenizer_wrapper, text_data))
text_char_padded = pad_sequences(text_char_seq, maxlen=max_chars)

# Repeat the samples in text_char_padded to match the number of samples in padded_sequences
num_repeats = len(padded_sequences) // len(text_char_padded)
text_char_padded_repeated = np.repeat(text_char_padded, num_repeats, axis=0)

# Adjust the number of samples in text_char_padded to match the number of samples in padded_sequences
num_additional_samples = len(padded_sequences) - len(text_char_padded)
additional_samples_indices = np.random.choice(len(text_char_padded), num_additional_samples)
text_char_padded_adj = np.concatenate((text_char_padded, text_char_padded[additional_samples_indices]), axis=0)

# Assuming you have the model loaded here

# Make predictions using the loaded model
predictions = model.predict([padded_sequences, text_char_padded_adj])

# Print the predictions for each input text
for i, text in enumerate(text_data):
    print(f'Text: {text} - Spam Probability: {predictions[i]}')
    
    if predictions[i] <= 0.2:
        print('Non-Spam')
    else:
        print('Spam')

Text: Congratulations! You've won a free cruise. Click the link to claim your prize. - Spam Probability: [0.02585566]
Non-Spam
Text: Hi there, just checking in. How are you doing? - Spam Probability: [0.00173024]
Non-Spam
Text: You are a great role model - Spam Probability: [0.00946098]
Non-Spam
Text: 100 dating service cal;l 09064012103 box334sk38ch - Spam Probability: [0.9998997]
Spam
Text: (Bank of Granite issues Strong-Buy) EXPLOSIVE PICK FOR OUR MEMBERS *****UP OVER 300% *********** Nasdaq Symbol CDGT That is a $5.00 per.. - Spam Probability: [0.7424263]
Spam
Text: &lt;#&gt; ISH MINUTES WAS 5 MINUTES AGO. WTF. - Spam Probability: [0.00232021]
Non-Spam
Text: pay 100 dolars - Spam Probability: [0.655978]
Spam
