In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

df = pd.read_csv('/content/SCD.csv')

print("First few rows of the dataset:")
print(df.head())
print("\nColumn names of the dataset:")
print(df.columns)

df.rename(columns={'v1': 'label', 'v2': 'message'}, inplace=True)

df = df[['label', 'message']]

print("\nChecking for missing values:")
print(df.isnull().sum())
df.dropna(inplace=True)

df['label'] = df['label'].map({'ham': 0, 'spam': 1})

X = df['message']
y = df['label']

tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

X_tfidf = tfidf_vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=42)

logistic_regression = LogisticRegression()
naive_bayes_classifier = MultinomialNB()
support_vector_machine = SVC()

logistic_regression.fit(X_train, y_train)
y_pred_lr = logistic_regression.predict(X_test)
print("\nLogistic Regression")
print(f'Accuracy: {accuracy_score(y_test, y_pred_lr):.2f}')
print(classification_report(y_test, y_pred_lr))
print(confusion_matrix(y_test, y_pred_lr))

naive_bayes_classifier.fit(X_train, y_train)
y_pred_nb = naive_bayes_classifier.predict(X_test)
print("\nNaive Bayes")
print(f'Accuracy: {accuracy_score(y_test, y_pred_nb):.2f}')
print(classification_report(y_test, y_pred_nb))
print(confusion_matrix(y_test, y_pred_nb))

support_vector_machine.fit(X_train, y_train)
y_pred_svm = support_vector_machine.predict(X_test)
print("\nSupport Vector Machine")
print(f'Accuracy: {accuracy_score(y_test, y_pred_svm):.2f}')
print(classification_report(y_test, y_pred_svm))
print(confusion_matrix(y_test, y_pred_svm))

First few rows of the dataset:
     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  

Column names of the dataset:
Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

Checking for missing values:
label      0
message    0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df['label'].map({'ham': 0, 'spam': 1})



Logistic Regression
Accuracy: 0.94
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      1453
           1       0.98      0.59      0.74       219

    accuracy                           0.94      1672
   macro avg       0.96      0.79      0.85      1672
weighted avg       0.95      0.94      0.94      1672

[[1450    3]
 [  90  129]]

Naive Bayes
Accuracy: 0.97
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1453
           1       1.00      0.75      0.86       219

    accuracy                           0.97      1672
   macro avg       0.98      0.88      0.92      1672
weighted avg       0.97      0.97      0.97      1672

[[1453    0]
 [  54  165]]

Support Vector Machine
Accuracy: 0.97
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1453
           1       0.99      0.79      0.88       219

    accuracy             