In [5]:
import pandas as pd
import numpy as np
import gensim.downloader as api
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Keep the NLTK resources in a project-local folder rather than a
# user-wide location, fetching the tokenizer models and stopword list.
NLTK_DATA_DIR = './nltk_data'
for resource in ('punkt_tab', 'stopwords'):
    nltk.download(resource, download_dir=NLTK_DATA_DIR)

# Add the same folder to NLTK's resource search path.
nltk.data.path.append(NLTK_DATA_DIR)


[nltk_data] Downloading package punkt_tab to ./nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to ./nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
from nltk.corpus import stopwords

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an']


In [7]:
# Read the raw CSV, keep only the label/text columns under descriptive
# names, and encode the label as an integer (ham -> 0, spam -> 1).
df = (
    pd.read_csv("spam.csv", encoding="latin1")[["v1", "v2"]]
    .rename(columns={"v1": "label", "v2": "message"})
    .assign(label=lambda frame: frame["label"].map({"ham": 0, "spam": 1}))
)

# A set gives constant-time membership checks inside preprocess().
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Tokenize a message and keep only its informative words.

    The text is lowercased and split with ``word_tokenize``; tokens are
    kept, in their original order, only when they are purely alphabetic
    and are not members of the module-level ``stop_words`` set.

    Args:
        text: The message to tokenize (must support ``.lower()``).

    Returns:
        list: The surviving lowercase tokens.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        # Skip numbers/punctuation/mixed tokens and stopwords.
        if not token.isalpha() or token in stop_words:
            continue
        kept.append(token)
    return kept

# Tokenize every message once and store the result next to the raw text.
tokens_list = [preprocess(msg) for msg in df["message"]]
df["tokens"] = tokens_list

# Load the embedding model through the gensim downloader.
w2v_model = api.load("word2vec-google-news-300")

In [8]:
def vectorize(tokens, model):
    """Average the embeddings of the tokens known to ``model``.

    Args:
        tokens: Iterable of word strings.
        model: Object supporting ``word in model``, ``model[word]`` and a
            ``vector_size`` attribute.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model``, or a zero vector of length ``model.vector_size`` when
        none of the tokens is found (including when ``tokens`` is empty).
    """
    known = [model[token] for token in tokens if token in model]
    if not known:
        # Nothing to average: fall back to an all-zero vector.
        return np.zeros(model.vector_size)
    return np.mean(known, axis=0)

# One averaged embedding per message; w2v_model is forwarded to
# vectorize() as its second positional argument.
df["vector"] = df["tokens"].apply(vectorize, args=(w2v_model,))

In [9]:
# Keep only rows whose vector is not None.
# NOTE(review): vectorize() returns a zero vector rather than None when no
# token is known, so this filter does not appear to drop anything -- confirm
# whether zero-vector rows were meant to be excluded.
has_vector = df["vector"].map(lambda vec: vec is not None)
df = df.loc[has_vector]

# Stack the per-message vectors into one 2-D feature matrix.
X = np.vstack(df["vector"].values)
y = df["label"].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so construction and training chain.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))

def predict_message_class(model, w2v_model, message):
    """Classify a single raw message as ``"spam"`` or ``"ham"``.

    The message goes through the same ``preprocess`` / ``vectorize`` steps
    used for the training data, and the resulting vector is reshaped to a
    single-row 2-D input before being passed to ``model.predict``.

    Args:
        model: Fitted classifier exposing ``predict``.
        w2v_model: Embedding model handed to ``vectorize``.
        message: Raw message text.

    Returns:
        str: ``"spam"`` when the predicted label equals 1, else ``"ham"``.
    """
    features = vectorize(preprocess(message), w2v_model).reshape(1, -1)
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "spam"
    return "ham"


# Classify two hand-written sample messages with the trained classifier.
for sample in (
    "Congratulations! You've won a free ticket.",
    "Hey, let's catch up later.",
):
    print(predict_message_class(clf, w2v_model, sample))
      

Test Accuracy: 0.9417040358744395
spam
ham
