In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# End-to-end script: load the labelled e-mail dataset, build a TF-IDF +
# Multinomial Naive Bayes category classifier, and print its accuracy and
# per-class report on a stratified 20% hold-out split.

# Load dataset
dataset_path = "/content/Cleaned Smart Email Dataset.csv"  # Adjust path accordingly
data = pd.read_csv(dataset_path)

# Remove unclassified entries (rows whose Category is the placeholder "-").
# .copy() gives an independent frame, so adding a column below does not
# trigger pandas' SettingWithCopyWarning (assignment on a slice of `data`).
filtered_data = data[data["Category"] != "-"].copy()

# Merge subject and email body into a single text field; missing values are
# replaced with a blank so the string concatenation never yields NaN.
filtered_data["combined_text"] = (
    filtered_data["Mail Subject"].fillna(" ")
    + " "
    + filtered_data["Email Content"].fillna(" ")
)

# Remove infrequent categories: the stratified split below needs at least
# two samples per class, so categories that occur only once are dropped.
category_freq = filtered_data["Category"].value_counts()
valid_labels = category_freq[category_freq > 1].index
balanced_data = filtered_data[filtered_data["Category"].isin(valid_labels)]

# Define inputs (X) and target labels (y)
X_features = balanced_data["combined_text"]
y_labels = balanced_data["Category"]

# Split dataset (80/20), keeping class proportions the same in both parts;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    y_labels,
    test_size=0.2,
    random_state=42,
    stratify=y_labels,
)

# Convert text data using TF-IDF. The vocabulary is fitted on the training
# split only and then re-used for the test split, so no test-set information
# leaks into the features.
tfidf_converter = TfidfVectorizer(stop_words="english", max_features=4000)  # Reduced feature count
X_train_transformed = tfidf_converter.fit_transform(X_train)
X_test_transformed = tfidf_converter.transform(X_test)

# Train Naive Bayes model
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_transformed, y_train)

# Make predictions and evaluate model
predictions = nb_classifier.predict(X_test_transformed)
model_accuracy = accuracy_score(y_test, predictions)
# zero_division=0 reports 0.0 for classes that never get predicted (same
# numbers as the default) without emitting UndefinedMetricWarning each time.
classification_summary = classification_report(y_test, predictions, zero_division=0)

print(f"Model Accuracy: {model_accuracy:.4f}")
print("Detailed Classification Report:\n", classification_summary)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data["combined_text"] = filtered_data["Mail Subject"].fillna(" ") + " " + filtered_data["Email Content"].fillna(" ")


Model Accuracy: 0.7647
Detailed Classification Report:
                             precision    recall  f1-score   support

                  Business       0.57      1.00      0.73         4
                   Finance       0.00      0.00      0.00         1
        Order Confirmation       0.00      0.00      0.00         1
Order Confirmation/Updates       1.00      1.00      1.00         2
                  Personal       1.00      1.00      1.00         3
               Promotional       0.75      0.75      0.75         4
    Transaction & Security       1.00      0.50      0.67         2

                  accuracy                           0.76        17
                 macro avg       0.62      0.61      0.59        17
              weighted avg       0.72      0.76      0.72        17



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
