<a href="https://colab.research.google.com/github/Kenzie218/CS420_Project/blob/main/HN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import requests
import zipfile
import io
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# 1. Download and extract the SMS Spam Collection
uci_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'
response = requests.get(uci_url)
with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    with z.open('SMSSpamCollection') as f:
        df = pd.read_csv(f, sep='\t', names=['label', 'message'])
df['spam'] = df['label'].map({'ham': 0, 'spam': 1})

# 2. Output overall counts
print("Total messages:", len(df))
print(df['label'].value_counts())

# 3. (Optional) Subsample for speed
df = df.sample(n=2000, random_state=42).reset_index(drop=True)

# 4. Train/Test split
X_train, X_test, y_train, y_test = train_test_split(
    df['message'], df['spam'],
    test_size=0.3, random_state=42, stratify=df['spam']
)

print("\nTraining set label counts:\n", y_train.value_counts())
print("\nTest set label counts:\n", y_test.value_counts())

# 5. Feature pipeline: TF-IDF → SVD → binarize
tfidf = TfidfVectorizer(max_features=2000, stop_words='english')
X_train_tfidf = tfidf.fit_transform(X_train).toarray()
X_test_tfidf  = tfidf.transform(X_test).toarray()

svd = TruncatedSVD(n_components=100, random_state=42)
X_train_red = svd.fit_transform(X_train_tfidf)
X_test_red  = svd.transform(X_test_tfidf)

X_train_bip = np.where(X_train_red >= 0, 1, -1)
X_test_bip  = np.where(X_test_red  >= 0, 1, -1)

# 6. Hopfield Network definition
class HopfieldNetwork:
    def __init__(self):
        self.W = None

    def train(self, patterns):
        n = patterns.shape[1]
        W = np.zeros((n, n))
        for p in patterns:
            W += np.outer(p, p)
        np.fill_diagonal(W, 0)
        self.W = W / patterns.shape[0]

    def recall(self, pattern, steps=200):
        s = pattern.copy()
        for _ in range(steps):
            i = np.random.randint(s.size)
            s[i] = 1 if self.W[i].dot(s) >= 0 else -1
        return s

# 7. Train on spam patterns only
spam_patterns = X_train_bip[y_train.values == 1]
net = HopfieldNetwork()
net.train(spam_patterns)

# 8. Predict and evaluate
y_pred = []
for x in X_test_bip:
    r = net.recall(x)
    y_pred.append(int(any(np.array_equal(r, p) for p in spam_patterns)))
y_pred = np.array(y_pred)

print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Total messages: 5572
label
ham     4825
spam     747
Name: count, dtype: int64

Training set label counts:
 spam
0    1218
1     182
Name: count, dtype: int64

Test set label counts:
 spam
0    522
1     78
Name: count, dtype: int64

Accuracy: 0.87

Confusion Matrix:
 [[522   0]
 [ 78   0]]

Classification Report:
               precision    recall  f1-score   support

           0       0.87      1.00      0.93       522
           1       0.00      0.00      0.00        78

    accuracy                           0.87       600
   macro avg       0.43      0.50      0.47       600
weighted avg       0.76      0.87      0.81       600



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
