# In [None]:
import pandas as pd  # For data manipulation
import pickle  # For loading the saved model and vectorizer
import nltk  # For natural language processing
from nltk.corpus import stopwords  # For stopword removal
from nltk.tokenize import word_tokenize  # For tokenizing the text
from nltk.stem.porter import PorterStemmer  # For stemming the words
import string  # For handling punctuation

# Initialize PorterStemmer for stemming (only used if the optional stemming
# step in preprocess_text is enabled)
ps = PorterStemmer()

# Download required NLTK resources
nltk.download('stopwords')  # Stopwords dataset
nltk.download('punkt')  # Punkt tokenizer models used by older NLTK releases
# Newer NLTK releases (3.9+) load the tokenizer data from 'punkt_tab' instead of
# 'punkt'; without it word_tokenize raises LookupError on a fresh install.
nltk.download('punkt_tab')

# Cache for the English stopword set so the corpus file is read once rather
# than on every call to preprocess_text
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stopwords as a frozenset, loading them on first use."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


# Function to preprocess the text data
def preprocess_text(text):
    """
    Normalize raw text into a space-separated string of lowercase tokens.

    The text is lowercased and tokenized with NLTK. Only purely alphanumeric
    tokens are kept (which also discards punctuation tokens), and English
    stopwords are removed. Stemming is not applied.

    Args:
        text: The raw text. ``None`` and NaN (for example an empty CSV cell
            read by pandas) are treated as empty text; any other non-string
            value is converted with ``str()`` first.

    Returns:
        The remaining tokens joined by single spaces, or ``""`` when no
        tokens remain.
    """
    # Missing value: None, or a float NaN (NaN is the only value that differs from itself)
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    # Blank input yields no tokens, so skip the tokenizer entirely
    if not text.strip():
        return ""

    stop_words = _english_stop_words()

    # isalnum() already rejects punctuation tokens, so no separate punctuation
    # filter is needed
    tokens = [
        word
        for word in nltk.word_tokenize(text.lower())
        if word.isalnum() and word not in stop_words
    ]

    # Optional: apply stemming to each token (disabled).
    # NOTE(review): presumably this must match the preprocessing used when the
    # vectorizer was fitted -- confirm before enabling.
    # tokens = [ps.stem(word) for word in tokens]

    # Return the processed text as a string
    return " ".join(tokens)

# Load the saved machine learning model and vectorizer.
# Update both paths to where your files are stored.
# SECURITY: pickle.load can execute arbitrary code while unpickling, so only
# load model/vectorizer files that come from a trusted source.
with open('path_to_model/spam_svm_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)  # Load the trained SVM model

with open('path_to_model/tfidf_vectorizer.pkl', 'rb') as vec_file:
    loaded_vectorizer = pickle.load(vec_file)  # Load the trained TF-IDF vectorizer

# Load the emails to classify into a DataFrame
csv_file_path = 'your_file.csv'  # Update this path with your actual CSV file
data = pd.read_csv(csv_file_path)

# The email body is read from the 'text' column; fail early with a clear
# message instead of an opaque KeyError further down
if 'text' not in data.columns:
    raise KeyError(
        f"Expected a 'text' column in {csv_file_path!r}; found columns: {list(data.columns)}"
    )

# Preprocess every email (tokenize, remove stopwords, etc.). Empty cells are
# read by pandas as NaN, so replace them with empty strings first.
cleaned_texts = data['text'].fillna('').astype(str).map(preprocess_text).tolist()

# Vectorize and predict in batches rather than one row at a time. The matrix is
# still converted to a dense array as before, so batching keeps the dense
# matrix bounded to BATCH_SIZE rows at a time.
BATCH_SIZE = 1000
predictions = []
for start in range(0, len(cleaned_texts), BATCH_SIZE):
    batch = cleaned_texts[start:start + BATCH_SIZE]
    batch_vectorized = loaded_vectorizer.transform(batch).toarray()
    predictions.extend(loaded_model.predict(batch_vectorized))

# Add the predictions as a new column 'predicted_label' to the DataFrame
data['predicted_label'] = predictions

# Show the results: the original text, the actual label when the CSV provides
# a 'label' column, and the predicted label
display_columns = [
    column
    for column in ('text', 'label', 'predicted_label')
    if column in data.columns
]
print(data[display_columns])
