In [None]:
from datascience import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer as vector
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Load the labelled GroupMe messages: the "Message" column is the model input
# and the "Scam" column is the target label.
scam_data = Table.read_table("../data/GroupMe Scam Message Dataset - Sheet1.csv")
X = scam_data["Message"]
y = scam_data["Scam"]

# Hold out 20% of the messages for evaluation, with a fixed seed so the split is
# reproducible. Stratifying on the label keeps the class ratio the same in the
# train and test portions, so the held-out set cannot end up missing a class.
# NOTE: stratification requires every label value to occur at least twice.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Convert text to vectors: the TF-IDF vocabulary is learned from the training
# messages only, and that same fitted vectorizer is then reused to encode the
# held-out messages so no test data influences the features.
vectorizer = vector(stop_words="english")
X_train_vector = vectorizer.fit_transform(X_train)
X_test_vector = vectorizer.transform(X_test)

# Train the random forest on the vectorized training messages. The seed is fixed
# so repeated runs build the same forest.
rf = RandomForestClassifier(
    n_estimators=200,
    class_weight="balanced",
    random_state=42,
)
rf.fit(X_train_vector, y_train)

In [None]:
# Evaluation report: predict labels for the held-out messages and compare them
# with the true labels.
predictor = rf.predict(X_test_vector)
accuracy = accuracy_score(y_test, predictor)

# Use ":.2f" with no space: a space inside the format spec is the sign flag and
# inserts an extra blank before non-negative numbers ("Accuracy:  0.95").
print(f"Overall Accuracy: {accuracy:.2f}")
print("-" * 40)
# NOTE(review): target_names are matched to the label values in sorted order, so
# this assumes the lower-sorted label means "not scam" - confirm against the data.
print(classification_report(y_test, predictor, target_names=["Not Scam", "Scam"]))

# Export Model and Vectorizer
### Save the trained model and vectorizer for use in main.py

In [None]:
import os
import joblib

# Destination of each exported artifact, named once so the dump calls and the
# confirmation messages below cannot drift apart.
model_path = '../models/random_forest_model.pkl'
vectorizer_path = '../models/random_forest_vectorizer.pkl'

# Make sure the output directory is present; exist_ok keeps re-runs from failing.
os.makedirs('../models', exist_ok=True)

# Persist the trained forest together with the vectorizer it was fitted with.
joblib.dump(rf, model_path)
joblib.dump(vectorizer, vectorizer_path)

# Report where each artifact was written.
print(f'Model saved to: {model_path}')
print(f'Vectorizer saved to: {vectorizer_path}')