In [7]:
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import pandas as pd
import joblib

# End-to-end training cell: embed every message with a pre-trained sentence
# encoder, fit a logistic-regression spam classifier on the embeddings,
# persist the classifier, and report hold-out metrics.

# Sentence encoder used to turn raw text into fixed-size vectors.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Read the labelled messages and derive a binary target column:
# 1 when the label is exactly 'spam', 0 for every other label value.
data2 = pd.read_parquet('spam2.parquet', engine='fastparquet')
data2['verdict'] = data2['label'].apply(lambda label: int(label == 'spam'))

# Encode all message texts in a single call.
texts = data2['text'].tolist()
embeddings = model.encode(texts)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    embeddings,
    data2['verdict'],
    test_size=0.2,
    random_state=42,
)

# Fit the classifier on the training embeddings (fit() returns the estimator).
classifier = LogisticRegression().fit(X_train, y_train)

# Persist only the fitted classifier. The sentence encoder is NOT part of this
# file, so anything loading it must embed text with the same encoder first.
joblib.dump(classifier, 'spam_classifier_model.pkl')

# Score the held-out rows.
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Per-class precision / recall / F1; the display names are listed in the
# order of the sorted targets (0, then 1).
class_names = ['Non-Spam', 'Spam']
print(classification_report(y_test, y_pred, target_names=class_names))

Accuracy: 0.9963302752293578
              precision    recall  f1-score   support

    Non-Spam       1.00      0.99      1.00       273
        Spam       0.99      1.00      1.00       272

    accuracy                           1.00       545
   macro avg       1.00      1.00      1.00       545
weighted avg       1.00      1.00      1.00       545

