In [None]:
# Email/SMS spam detection module using ensemble learning over three classifiers: 1) MultinomialNB  2) LinearSVC  3) RandomForestClassifier
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import VotingClassifier
from google.colab import drive
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Download the tab-separated SMS dataset into the Colab working directory
!curl "https://raw.githubusercontent.com/erfan-hamidi/smart-text-analysis-dj/master/sms_dataset.tsv" --output sms_dataset.tsv

# Parse the downloaded tab-separated file into a DataFrame.
df = pd.read_csv("sms_dataset.tsv", sep="\t")

# Features are the raw message texts; targets are the matching labels.
X = df["message"]
y = df["label"]

# Hold out 25% of the rows for evaluation; the fixed seed keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Two-stage text featurisation: raw token counts, then TF-IDF weighting.
count_vectorizer = CountVectorizer()
tfidf_transformer = TfidfTransformer()

# Learn the vocabulary and the IDF weights from the training messages only,
# producing the training matrices in the same step.
X_train_cv = count_vectorizer.fit_transform(X_train)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_cv)

# The test messages are only transformed with the already-fitted objects,
# so nothing is learned from the held-out data.
X_test_cv = count_vectorizer.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_cv)

# Seed shared by the randomised estimators below. The train/test split is
# already seeded, but the forest (bootstrap sampling / feature sub-sampling)
# and LinearSVC were not, so the grid-search winner and the reported metrics
# could change between otherwise identical runs.
RANDOM_STATE = 42

# Initialize the classifiers
naive_bayes = MultinomialNB()
linear_svc = LinearSVC(C=0.8, random_state=RANDOM_STATE)
random_forest = RandomForestClassifier(random_state=RANDOM_STATE)

# Hyper-parameter grid searched for the random forest (3 * 3 * 3 * 3 = 81
# combinations, each evaluated with 3-fold cross-validation).
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [10, 20, 25],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=random_forest, param_grid=param_grid, cv=3)
grid_search.fit(X_train_tfidf, y_train)
best_params = grid_search.best_params_

# Fresh, unfitted forest configured with the winning parameters; the
# VotingClassifier fits its own clones of the estimators it is given.
best_random_forest = RandomForestClassifier(**best_params, random_state=RANDOM_STATE)

# Create the voting classifier
# NOTE: the estimator key 'dt' is kept unchanged even though the estimator is
# a random forest, because the key is part of the fitted model's public
# attributes (e.g. ``named_estimators_``).
voting_classifier = VotingClassifier(
    estimators=[('nb', naive_bayes), ('svc', linear_svc), ('dt', best_random_forest)],
    voting='hard'  # Use 'hard' voting for majority voting
)


voting_classifier.fit(X_train_tfidf, y_train)


# Predict labels for the held-out messages with the fitted ensemble.
prediction = voting_classifier.predict(X_test_tfidf)

# Plot the confusion matrix of the ensemble on the held-out set.
ConfusionMatrixDisplay.from_estimator(
    voting_classifier,
    X_test_tfidf,
    y_test,
)
plt.show()

# Per-class precision/recall/F1 followed by overall accuracy.
report = classification_report(y_test, prediction)
print(report)
accuracy = accuracy_score(y_test, prediction)
print("Accuracy: ", accuracy)

# Persist the fitted ensemble so it can be reloaded later.
# NOTE(review): the directory and file name are concatenated without a path
# separator, so the file is written as "/content/driveensemble_model.joblib".
# predict_spam() loads from the same concatenated path, so the two must be
# changed together if this is ever corrected.
model_filename = 'ensemble_model.joblib'
model_path = "/content/drive" + model_filename
joblib.dump(voting_classifier, model_path)


def predict_spam(input_string):
    """Classify a single message as ``"Spam"`` or ``"Not Spam"``.

    The message is vectorised with the module-level ``count_vectorizer`` and
    ``tfidf_transformer``, then classified by the ensemble that is reloaded
    from disk on every call.

    Args:
        input_string: The raw message text to classify.

    Returns:
        ``"Spam"`` when the predicted label denotes spam, otherwise
        ``"Not Spam"``.
    """
    # Load the saved model (same concatenated path that was used to save it).
    loaded_model = joblib.load("/content/drive" + model_filename)

    # The vectorisers expect an iterable of documents, hence the one-item list.
    counts = count_vectorizer.transform([input_string])
    input_tfidf = tfidf_transformer.transform(counts)

    predicted_label = loaded_model.predict(input_tfidf)[0]

    # The label encoding is not visible here. Comparing only against the
    # integer 1 would silently report every message as "Not Spam" if the label
    # column holds strings, so both the numeric encoding and the textual
    # "spam" label are accepted.
    # NOTE(review): confirm the actual label values in the dataset.
    is_spam = predicted_label == 1 or str(predicted_label).strip().lower() == "spam"

    return "Spam" if is_spam else "Not Spam"
# Read one message from standard input, classify it and echo the verdict.
input_string = input()
result = predict_spam(input_string)
print(
    f"Input: '{input_string}' - Prediction: {result}"
)




In [None]:
# Read one message from standard input, classify it and echo the verdict.
input_string = input()
result = predict_spam(input_string)
print(
    f"Input: '{input_string}' - Prediction: {result}"
)