In [None]:
import pandas as pd

# Location of the raw ham/spam CSV; defined once and reused by the load call.
path = "hamvsspam.csv"

# The file is decoded as latin1.
df = pd.read_csv(path, encoding='latin1')

# Keep only the first two columns (label and message)
df = df.iloc[:, :2]

# Rename the columns
df.columns = ['label', 'message']

print(df.head())



  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [None]:

# Model input is the message text; the prediction target is its label.
X, y = df['message'], df['label']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

### Text Preprocessing

In [None]:
!pip install autocorrect emoji contractions textacy nltk

from autocorrect import Speller
from emoji import demojize
from contractions import fix
from textacy.preprocessing.remove import accents
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
import nltk

# Fetch the NLTK resources used below: stop-word corpus, tokenizer data, WordNet.
for resource in ('stopwords', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

# Shared helper objects reused by every call to the preprocessing function
lem = WordNetLemmatizer()
stem = SnowballStemmer("english")
stopword = stopwords.words("english")
speller = Speller()

def text_pre_processing(text):
    """Normalize one raw message into a space-separated string of tokens.

    Steps, in order: lowercase, spell-correct, convert emoji to text, expand
    contractions, strip accents, replace every character outside
    ``[a-z0-9]`` with a space, tokenize, drop English stop words, then stem
    and lemmatize each remaining token.

    Parameters
    ----------
    text : str
        Raw message. Missing values (``None`` / NaN) are treated as an empty
        message; any other non-string value is converted with ``str``.

    Returns
    -------
    str
        The processed tokens joined by single spaces ("" if nothing is left).
    """
    # Missing cells reach Series.apply as NaN/None, which have no .lower();
    # map them to an empty message instead of raising AttributeError.
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        text = str(text)

    # Lower case the text
    text = text.lower()
    # Auto Correct
    text = speller.autocorrect_sentence(text)
    # Emoji to text
    text = demojize(text)
    # Fix contractions
    text = fix(text)
    # Remove accents
    text = accents(text)
    # Replace everything that is not a lowercase letter or digit with a space
    text = re.sub(r"[^a-z0-9]", " ", text)

    # Tokenize, skip stop words, then stem and lemmatize (in that order)
    kept_tokens = [
        lem.lemmatize(stem.stem(word))
        for word in word_tokenize(text)
        if word not in stopword
    ]
    return " ".join(kept_tokens)

# Start from a copy of the frame loaded at the top of the notebook (already
# trimmed to the 'label' and 'message' columns) instead of reading and
# reshaping the same CSV a second time. Copying keeps `df` unchanged when the
# 'message' column is overwritten below.
data = df.copy()

# Apply the text preprocessing to the 'message' column
data["message"] = data["message"].apply(text_pre_processing)

# Save the processed dataset to a new CSV file
data.to_csv("processed_dataset.csv", index=False)


Collecting autocorrect
  Downloading autocorrect-2.6.1.tar.gz (622 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/622.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.3/622.8 kB[0m [31m6.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m622.8/622.8 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textacy
  Downloading textacy-0.13.0-py3-none-any.whl.metadata (5.3 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting cytoolz>=0.10.1 (from textacy)
  Downloading cytoolz-1.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux20

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


### Text Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer


# Inputs are the preprocessed messages; targets are their labels
features, label = data['message'], data['label']

# 75/25 train/test split with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.25,
    random_state=42,
)

# Bag-of-Words vectorizer
bow = CountVectorizer(stop_words="english", strip_accents="unicode")

# Learn the vocabulary from the training messages only, then encode both splits
X_train_bow = bow.fit_transform(X_train)
X_test_bow = bow.transform(X_test)

# Dense DataFrame views of the training split for inspection or further processing
vocab = bow.get_feature_names_out()
X_train_vec = pd.DataFrame(X_train_bow.toarray(), columns=vocab)
y_train_vec = y_train.reset_index(drop=True).to_frame(name="label")


In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def _print_split_metrics(split_name, y_true, y_pred):
    """Print accuracy, classification report and confusion matrix for one split.

    Parameters
    ----------
    split_name : str
        Label used in the accuracy line (e.g. "Train" or "Test").
    y_true : array-like
        Ground-truth labels of the split.
    y_pred : array-like
        Labels predicted by the model for the same rows.
    """
    print(f"{split_name} Accuracy:", accuracy_score(y_true, y_pred))
    print("\nClassification Report:\n", classification_report(y_true, y_pred))
    print("\nConfusion Matrix:\n", confusion_matrix(y_true, y_pred))


# Train SVM classifier
svm_clf = SVC(kernel='linear')  # You can try other kernels like 'rbf' or 'poly'
svm_clf.fit(X_train_bow, y_train)

# Predict on both splits
y_pred_train = svm_clf.predict(X_train_bow)
y_pred_test = svm_clf.predict(X_test_bow)

# Evaluate the model: train metrics first, then held-out test metrics
_print_split_metrics("Train", y_train, y_pred_train)
_print_split_metrics("Test", y_test, y_pred_test)



Train Accuracy: 0.9998370804822417

Classification Report:
               precision    recall  f1-score   support

         ham       1.00      1.00      1.00      5321
        spam       1.00      1.00      1.00       817

    accuracy                           1.00      6138
   macro avg       1.00      1.00      1.00      6138
weighted avg       1.00      1.00      1.00      6138


Confusion Matrix:
 [[5321    0]
 [   1  816]]
Test Accuracy: 0.9895765472312703

Classification Report:
               precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1311
        spam       0.99      0.94      0.96       224

    accuracy                           0.99      1535
   macro avg       0.99      0.97      0.98      1535
weighted avg       0.99      0.99      0.99      1535


Confusion Matrix:
 [[1309    2]
 [  14  210]]
