In [10]:
pip install --upgrade pip

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [11]:
import csv
import os
crtDir = os.getcwd()
fileName = os.path.join(crtDir, 'data', 'spam.csv')

data = []
with open(fileName, encoding='latin-1') as csv_file:  # Specify the correct encoding
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count == 0:
            dataNames = row
        else:
            data.append(row)
        line_count += 1


inputs = [data[i][0] for i in range(len(data))][:100]
outputs = [data[i][1] for i in range(len(data))][:100]
labelNames = list(set(outputs))

print(inputs[:2])
print(labelNames[:2])


        

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', 'Ok lar... Joking wif u oni...']
['spam', 'ham']


In [12]:
import numpy as np

np.random.seed(5)
noSamples = len(inputs)
indexes = [i for i in range(noSamples)]
trainSample = np.random.choice(indexes, int(0.8 * noSamples), replace= False)
testSample = [i for i in indexes if not i in trainSample]

trainInputs = [inputs[i] for i in trainSample]
trainOutputs = [outputs[i] for i in trainSample]
testInputs = [inputs[i] for i in testSample]
testOutputs = [outputs[i] for i in testSample]




In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
print("\n TF-IDF Feature Extract")
vectorizer = TfidfVectorizer(max_features=50)
trainFeatures = vectorizer.fit_transform(trainInputs)
testFeatures = vectorizer.transform(testInputs)
print(f"Vocabular size with TF-IDF: {len(vectorizer.vocabulary_)} words")
print(f"Feature matrix shape: {trainFeatures.shape}")
print(f"Some vocabulary words: {vectorizer.get_feature_names_out()[:10]}")



 TF-IDF Feature Extract
Vocabular size with TF-IDF: 50 words
Feature matrix shape: (80, 50)
Some vocabulary words: ['all' 'already' 'and' 'anything' 'at' 'be' 'but' 'call' 'can' 'did']


In [14]:
print("Word To Vec Feature Extraction")

import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

nltk.download('punkt', quiet=True)
def preprocess_text(texts):
    tokenized_texts = []
    for text in texts:
        tokens = word_tokenize(text.lower())
        tokenized_texts.append(tokens)
    return tokenized_texts

tokenized_train = preprocess_text(trainInputs)
w2v_model = Word2Vec(sentences=tokenized_train, vector_size=100, window=5, min_count=1, workers=4)

def document_vector(doc, model):
    words = [word for word in doc if word in model.wv.key_to_index]
    if len(words) == 0:
        return np.zeros(model.vector_size)
    return np.mean([model.wv[word] for word in words], axis=0)


train_vectors = [document_vector(doc, w2v_model) for doc in tokenized_train]
tokenized_test = preprocess_text(testInputs)
test_vectors = [document_vector(doc, w2v_model) for doc in tokenized_test]

train_vectors_w2v = np.array(train_vectors)
test_vectors_w2v = np.array(test_vectors)

print(f"Word2Vec features shape: {train_vectors_w2v.shape}")
print(f"Sample vector: {train_vectors_w2v[0][:5]}...")



Word To Vec Feature Extraction
Word2Vec features shape: (80, 100)
Sample vector: [ 0.00120308  0.00019636 -0.00106943 -0.00109585  0.00257067]...


In [15]:
print("Extra Text Features")

nltk.download('vader_lexicon', quiet=True)
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()
use_nltk_sentiment = True


def extract_text_features(texts):
    features = []
    for text in texts:
        length = len(text)
        word_count = len(text.split())
        avg_word_length = length / max(1, word_count)

        uppercase_ratio = sum(1 for c in text if c.isupper()) / max(1, len(text))
        digit_ratio = sum(1 for c in text if c.isdigit()) / max(1, len(text))
        special_chars = sum(1 for c in text if not c.isalnum() and not c.isspace()) / max(1, len(text))

        exclamation_count = text.count('!')
        question_count = text.count('?')

        has_url = 1 if 'http' in text.lower() or 'www' in text.lower() else 0
        has_urgent = 1 if 'urgent' in text.lower() or 'now' in text.lower() else 0

        feature_list = [length, word_count, avg_word_length, uppercase_ratio, 
                        digit_ratio, special_chars, exclamation_count, question_count,
                        has_url, has_urgent]
        
        if use_nltk_sentiment:
            sentiment = sia.polarity_scores(text)
            feature_list.extend([sentiment['pos'], sentiment['neu'], sentiment['neg']])
        features.append(feature_list)
    
    return np.array(features)


train_additional_features = extract_text_features(trainInputs)
test_additional_features = extract_text_features(testInputs)

print(f"Additional features shape: {train_additional_features.shape}")
print(f"Sample additional features: {train_additional_features[0]}")

Extra Text Features
Additional features shape: (80, 13)
Sample additional features: [1.24000000e+02 2.30000000e+01 5.39130435e+00 2.41935484e-02
 0.00000000e+00 8.06451613e-02 0.00000000e+00 1.00000000e+00
 0.00000000e+00 0.00000000e+00 1.68000000e-01 8.32000000e-01
 0.00000000e+00]


In [16]:
print("SCIKIT CLASSIFICATION")

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(trainOutputs)
test_labels = label_encoder.fit_transform(testOutputs)

train_features_combined = np.hstack((trainFeatures.toarray(), train_vectors_w2v, train_additional_features))
test_features_combined = np.hstack((testFeatures.toarray(), test_vectors_w2v, test_additional_features))

scaler = StandardScaler()
train_features_scaled = scaler.fit_transform(train_features_combined)
test_features_scaled = scaler.fit_transform(test_features_combined)

print(f"Training MLP classifier: {train_features_scaled.shape}")
mlp = MLPClassifier(hidden_layer_sizes=(100, 50), activation='relu', solver='adam', alpha=0.0001, batch_size='auto', learning_rate='adaptive', max_iter=500, random_state=42)
mlp.fit(train_features_scaled, train_labels)

predictions = mlp.predict(test_features_scaled)
accuracy = accuracy_score(test_labels, predictions)

print(f"Accuracy with scikit-learn MLP: {accuracy:.4f}")
print("\n Report:")
print(classification_report(test_labels, predictions, target_names=labelNames))


SCIKIT CLASSIFICATION
Training MLP classifier: (80, 163)
Accuracy with scikit-learn MLP: 0.9500

 Report:
              precision    recall  f1-score   support

        spam       0.94      1.00      0.97        17
         ham       1.00      0.67      0.80         3

    accuracy                           0.95        20
   macro avg       0.97      0.83      0.89        20
weighted avg       0.95      0.95      0.95        20



In [18]:
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.metrics import accuracy_score,classification_report
from MyANN import MyMLPClassifier
train_features_combined = np.hstack((trainFeatures.toarray(), train_vectors_w2v, train_additional_features))
test_features_combined = np.hstack((testFeatures.toarray(), test_vectors_w2v, test_additional_features))

scaler = StandardScaler()
train_features_scaled = scaler.fit_transform(train_features_combined)
test_features_scaled = scaler.transform(test_features_combined)

label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(trainOutputs)
test_labels = label_encoder.transform(testOutputs)

print(f"\nTraining custom MLP classifier with input shape: {train_features_scaled.shape}")
my_mlp = MyMLPClassifier(
    hidden_layer_sizes=(100, 50),  
    activation='relu',             
    learning_rate_init=0.001,     
    max_iter=100,                  
    random_state=42,              
    verbose=True                   
)

my_mlp.fit(train_features_scaled, train_labels)

predictions = my_mlp.predict(test_features_scaled).flatten()

accuracy = accuracy_score(test_labels, predictions)
print(f"\nAccuracy with custom ANN: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(test_labels, predictions, target_names=labelNames))

print("\nSample Predictions:")
for i in range(min(5, len(testInputs))):
    text = testInputs[i]
    true_label = testOutputs[i]
    predicted_label = labelNames[predictions[i]]
    print(f"Text: {text[:50]}...")
    print(f"True label: {true_label}, Predicted label: {predicted_label}")
    print("-" * 50)


Training custom MLP classifier with input shape: (80, 163)
Iteration 0, Loss: 8.111504

Accuracy with custom ANN: 0.6500

Classification Report:
              precision    recall  f1-score   support

        spam       0.86      0.71      0.77        17
         ham       0.17      0.33      0.22         3

    accuracy                           0.65        20
   macro avg       0.51      0.52      0.50        20
weighted avg       0.75      0.65      0.69        20


Sample Predictions:
Text: As per your request 'Melle Melle (Oru Minnaminungi...
True label: ham, Predicted label: spam
--------------------------------------------------
Text: WINNER!! As a valued network customer you have bee...
True label: spam, Predicted label: spam
--------------------------------------------------
Text: XXXMobileMovieClub: To use your credit, click the ...
True label: spam, Predicted label: spam
--------------------------------------------------
Text: Oh k...i'm watching here:)...
True label: ham, P