In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix


In [3]:
# Load the CSV file (adjust path as needed)
df = pd.read_csv('spam (Text).csv')

# Display the first few rows to understand the structure
print(df.head())

# Map labels to numbers: spam=1, ham=0
label_map = {'ham': 0, 'spam': 1}
df['Label'] = df['Category'].map(label_map)

# Series.map yields NaN for any value that is missing from the mapping.
# Fail here with a clear message instead of letting NaN labels flow into
# the train/test split and the classifiers.
unmapped = df['Label'].isna()
if unmapped.any():
    unexpected = df.loc[unmapped, 'Category'].unique().tolist()
    raise ValueError(f"Unexpected values in 'Category' column: {unexpected}")

# Features (raw message text) and numeric target
X = df['Message']
y = df['Label']

  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


In [4]:

# 2. Split dataset: hold out 20% for testing, stratified on the label so
#    both partitions keep the same spam/ham ratio.
messages, labels = df['Message'], df['Label']
X_train, X_test, y_train, y_test = train_test_split(
    messages,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# 3. Vectorize text (TF-IDF). The vocabulary is learned from the training
#    messages only; the test messages are merely projected onto it.
vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# 4. Train Logistic Regression; class_weight='balanced' compensates for the
#    class imbalance.
model = LogisticRegression(class_weight='balanced', max_iter=1000).fit(
    X_train_tfidf, y_train
)

# 5. Predict on the held-out set and report per-class metrics plus the
#    confusion matrix.
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.99      0.98      0.99       966
           1       0.90      0.92      0.91       149

    accuracy                           0.98      1115
   macro avg       0.94      0.95      0.95      1115
weighted avg       0.98      0.98      0.98      1115

[[951  15]
 [ 12 137]]


In [5]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the same TF-IDF features.
# Note: this rebinds `model`, so from here on `model` is the forest.
rf_params = dict(class_weight='balanced', n_estimators=100, random_state=42)
model = RandomForestClassifier(**rf_params).fit(X_train_tfidf, y_train)

# Evaluate on the held-out set: classification report first, then the
# confusion matrix.
y_pred = model.predict(X_test_tfidf)
for summary in (
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
):
    print(summary)

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.85      0.92       149

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115

[[966   0]
 [ 22 127]]


In [7]:
import joblib

# Persist the fitted TF-IDF vectorizer and the current `model` so they can be
# reloaded later without retraining. The final dump is left as the cell's last
# expression, so the notebook echoes the list of written file names.
joblib.dump(value=vectorizer, filename='tfidf_vectorizer.pkl')
joblib.dump(value=model, filename='rf_model.pkl')

['rf_model.pkl']

In [12]:
# Example messages to try. Only the one selected below is classified; change
# the index to test a different one.
example_messages = [
    "Congratulations! You've won a free ticket. Reply WIN to claim now!",
    "Good morning",
    "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005",
]

# The vectorizer takes a list of strings, so the single message is wrapped in
# a list. If you have a list of messages, pass them directly; `prediction`
# then holds one label per message.
new_text = [example_messages[2]]
new_text_tfidf = vectorizer.transform(new_text)
prediction = model.predict(new_text_tfidf)

# Label 1 means spam, 0 means ham (see the label mapping at load time).
print("Spam" if prediction[0] == 1 else "Ham")


Spam


In [13]:
# Class probabilities for the message vectorized above: one row per message,
# one column per class.
probability = model.predict_proba(new_text_tfidf)

# First (only) message, column 1 -- taken to be the spam class (label 1),
# assuming the classifier orders its classes as [0, 1].
spam_probability = probability[0][1]
print(f"Probability of spam: {spam_probability:.2f}")

Probability of spam: 0.60


In [14]:
# === prepare classifier ===
# Restore the persisted artifacts from disk. joblib.load unpickles its input,
# so only point it at files you trust.
encoder = joblib.load(filename="tfidf_vectorizer.pkl")  # text encoder
model = joblib.load(filename="rf_model.pkl")  # pre-trained model

In [23]:
# Message to classify. Other inputs that were tried with this cell:
#   "Congratulations! You've won a free ticket. Reply WIN to claim now!"
#   "Good morning"
#   "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005"
message_text = "Congratulations! You've won."

# run classification: encode the single message (wrapped in a list because the
# encoder works on collections of documents), then predict its label.
X_emb = encoder.transform([message_text])
prediction = model.predict(X_emb)

# Label 1 is reported as spam; anything else as not spam.
result = "Spam" if prediction[0] == 1 else "Not Spam"
print(result)

# Probability taken from column 1 of the first row of predict_proba's output.
probability = model.predict_proba(X_emb)
print(f"Probability of spam: {probability[0][1]:.2f}")


Not Spam
Probability of spam: 0.28
