<a href="https://colab.research.google.com/github/NDsasuke/Classification-Regression-Clustering/blob/main/Classification/Spam_recognition_NN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

 import the necessary libraries:

In [64]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

download the spam data:

In [65]:
url = 'https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv'
df = pd.read_csv(url, usecols=[0,1], encoding='latin-1')
df.columns = ['label', 'message']

Map 'spam' to 1 and 'ham' to 0

In [66]:
df['label'] = df.label.map({'spam':1, 'ham':0})

Split the data into train and test sets

In [67]:
X_train, X_test, y_train, y_test = train_test_split(df['message'], df['label'], test_size=0.2)


Convert the messages into frequency counts (bag of words model)

In [68]:
vect = CountVectorizer(stop_words='english', token_pattern=r'\b[^\d\W]+\b')
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

Input shape for the model

In [69]:
input_shape = [len(vect.get_feature_names_out())]

Convert to dense arrays

In [70]:
X_train = X_train.toarray()
X_test = X_test.toarray()

 Define the model

In [71]:
model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=input_shape),
    layers.Dense(1, activation='sigmoid'),
])


Compile the model

In [72]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


Train the model

model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

Evaluate the model

In [73]:
loss, accuracy = model.evaluate(X_test, y_test)
print("Loss: ", loss)
print("Accuracy: ", accuracy)

Loss:  0.6860483288764954
Accuracy:  0.6690583229064941


Print outputs

In [74]:
# Get the feature names
feature_names = vect.get_feature_names_out()

# Print the first 100 words
print(feature_names[:100])


['_' '____' 'aa' 'aah' 'aaniye' 'aaooooright' 'aathi' 'abbey' 'abdomen'
 'abel' 'aberdeen' 'abi' 'ability' 'abiola' 'abj' 'able' 'abroad'
 'absolutly' 'abstract' 'abt' 'abta' 'aburo' 'abuse' 'ac' 'academic' 'acc'
 'accept' 'access' 'accessible' 'accident' 'accidentally' 'accommodation'
 'accommodationvouchers' 'accomodate' 'accomodations' 'accordingly'
 'account' 'accounting' 'accounts' 'achan' 'ache' 'achieve' 'acid'
 'acknowledgement' 'acnt' 'aco' 'act' 'acted' 'actin' 'acting' 'action'
 'activate' 'active' 'activities' 'actor' 'actual' 'actually' 'ad' 'adam'
 'add' 'addamsfa' 'added' 'addicted' 'addie' 'address' 'adewale' 'adi'
 'adjustable' 'admin' 'admirer' 'admission' 'admit' 'adore' 'adoring'
 'adress' 'adrian' 'ads' 'adsense' 'adult' 'advance' 'adventure' 'advice'
 'advise' 'advising' 'advisors' 'aeronautics' 'aeroplane' 'afew' 'affair'
 'affairs' 'affection' 'affectionate' 'affections' 'affidavit' 'afford'
 'afghanistan' 'afraid' 'africa' 'african' 'aft']


In [75]:
# Get the weights of the first layer
weights = model.layers[0].get_weights()[0]

# Get the indices of the weights corresponding to the highest and lowest values
# These would correspond to the features (words) that the model pays most and least attention to
top_indices = weights.argsort(axis=0)[-10:]
bottom_indices = weights.argsort(axis=0)[:10]

# Get the feature names
feature_names = vect.get_feature_names_out()

# Print the words corresponding to the highest and lowest weights
print("Words with highest weights:")
for i in top_indices:
    print(feature_names[i])

print("\nWords with lowest weights:")
for i in bottom_indices:
    print(feature_names[i])


Words with highest weights:
['warm' 'computational' 'fetching' 'dollars' 'mah' 'thinl' 'matthew'
 'rajini' 'smidgin' 'gua' 'boston' 'woods' 'series' 'munsters' 'wiv'
 'football' 'intentions' 'huge' 'created' 'barring' 'upset' 'confirmed'
 'atten' 'said' 'evn' 'august' 'soundåõs' 'hear' 'india' 'paying' 'cried'
 'gimme' 'dignity' 'ip' 'pg' 'index' 'straight' 'arestaurant' 'male' 'wld'
 's' 'neck' 'cya' 'gving' 'text' 'unfortunately' 'smsrewards' 'defer'
 'crowd' 'veggie' 'yuou' 'smth' 'nose' 'speed' 'gei' 'eppolum' 'atm'
 'dino' 'normally' 'qlynnbv' 'ntimate' 'fever' 'lar' 'dieting']
['varaya' 'vpod' 'scenario' 'internet' 'serving' 'honeymoon' 'mentioned'
 'stranger' 'chechi' 'past' 'clark' 'lovly' 'renewed' 'cyclists' 'bw'
 'bros' 'anjola' 'sec' 'demand' 'fffff' 'solved' 'sue' 'lavender' 'hussey'
 'spys' 'immediately' 'scream' 'trackmarque' 'hamster' 'told' 'waitin'
 'borderline' 'kochi' 'tomeandsaid' 'pros' 'support' 'en' 'center' 'blood'
 'hannaford' 'spontaneously' 'yalru' 'monday' 