**Libraries**

In [46]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**Loading CSV**

In [33]:
text = pd.read_csv('/content/spam.csv', sep=',', encoding='latin-1')
text.drop(columns=text.columns[text.columns.str.contains('unnamed', case=False)], inplace=True)
print(text.head(10))

     v1                                                 v2
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
5  spam  FreeMsg Hey there darling it's been 3 week's n...
6   ham  Even my brother is not like to speak with me. ...
7   ham  As per your request 'Melle Melle (Oru Minnamin...
8  spam  WINNER!! As a valued network customer you have...
9  spam  Had your mobile 11 months or more? U R entitle...


**Preprocessing**

In [34]:
def preprocessing(text):
  # Removing HTML tags
  text_cleaned = re.sub(r'<[^>]+>', '', text)

  # Removing URLs
  text_cleaned = re.sub(r'http\S+|www\S+|https\S+', '', text_cleaned)

  # Converting to lowercase and removing non-alphanumeric characters
  text_cleaned = re.sub(r'[^a-zA-Z\s]', '', text_cleaned.lower())

  # Tokenization
  tokens = word_tokenize(text_cleaned)

  # Removing stopwords
  stop_words = set(stopwords.words('english'))
  tokens_without_sw = [word for word in tokens if word not in stop_words]

  # Regex: Remove punctuation marks
  tokens_without_punctuation = [word for word in tokens_without_sw if re.match(r'^\w+$', word)]

  # Lemmatization
  lemmatizer = WordNetLemmatizer()
  lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens_without_punctuation]

  return " ".join(lemmatized_tokens)

text['processed_text'] = text['v2'].apply(preprocessing)
print(text)

        v1                                                 v2  \
0      ham  Go until jurong point, crazy.. Available only ...   
1      ham                      Ok lar... Joking wif u oni...   
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...   
3      ham  U dun say so early hor... U c already then say...   
4      ham  Nah I don't think he goes to usf, he lives aro...   
...    ...                                                ...   
5567  spam  This is the 2nd time we have tried 2 contact u...   
5568   ham              Will Ì_ b going to esplanade fr home?   
5569   ham  Pity, * was in mood for that. So...any other s...   
5570   ham  The guy did some bitching but I acted like i'd...   
5571   ham                         Rofl. Its true to its name   

                                         processed_text  
0     go jurong point crazy available bugis n great ...  
1                               ok lar joking wif u oni  
2     free entry wkly comp win fa cup final t

**Bag of Words**

In [36]:
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(text['processed_text'])
print(bow_matrix)



  (0, 2640)	1
  (0, 3456)	1
  (0, 5057)	1
  (0, 1445)	1
  (0, 458)	1
  (0, 879)	1
  (0, 2728)	1
  (0, 7590)	1
  (0, 3600)	1
  (0, 877)	1
  (0, 1182)	1
  (0, 2692)	1
  (0, 231)	1
  (0, 7369)	1
  (1, 4643)	1
  (1, 3633)	1
  (1, 3426)	1
  (1, 7492)	1
  (1, 4674)	1
  (2, 2445)	1
  (2, 2059)	2
  (2, 7546)	1
  (2, 1290)	1
  (2, 7505)	1
  (2, 2188)	2
  :	:
  (5567, 866)	1
  (5568, 3006)	1
  (5568, 2655)	1
  (5568, 2432)	1
  (5568, 2086)	1
  (5569, 4245)	1
  (5569, 6485)	1
  (5569, 4982)	1
  (5569, 6140)	1
  (5570, 2445)	1
  (5570, 7415)	1
  (5570, 3135)	1
  (5570, 3737)	1
  (5570, 4445)	1
  (5570, 6163)	1
  (5570, 2005)	1
  (5570, 2555)	1
  (5570, 910)	1
  (5570, 2788)	1
  (5570, 3265)	1
  (5570, 60)	1
  (5570, 686)	1
  (5571, 4363)	1
  (5571, 6994)	1
  (5571, 5635)	1


In [38]:
# Convert the BoW matrix to a DataFrame
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=vectorizer.get_feature_names_out())

print(bow_df)

      aa  aah  aaniye  aaooooright  aathilove  aathiwhere  ab  abbey  abdomen  \
0      0    0       0            0          0           0   0      0        0   
1      0    0       0            0          0           0   0      0        0   
2      0    0       0            0          0           0   0      0        0   
3      0    0       0            0          0           0   0      0        0   
4      0    0       0            0          0           0   0      0        0   
...   ..  ...     ...          ...        ...         ...  ..    ...      ...   
5567   0    0       0            0          0           0   0      0        0   
5568   0    0       0            0          0           0   0      0        0   
5569   0    0       0            0          0           0   0      0        0   
5570   0    0       0            0          0           0   0      0        0   
5571   0    0       0            0          0           0   0      0        0   

      abeg  ...  zed  zero 

**TF IDF**

In [40]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(text['processed_text'])

# Convert the TF-IDF matrix to a DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

print(tfidf_df)

       aa  aah  aaniye  aaooooright  aathilove  aathiwhere   ab  abbey  \
0     0.0  0.0     0.0          0.0        0.0         0.0  0.0    0.0   
1     0.0  0.0     0.0          0.0        0.0         0.0  0.0    0.0   
2     0.0  0.0     0.0          0.0        0.0         0.0  0.0    0.0   
3     0.0  0.0     0.0          0.0        0.0         0.0  0.0    0.0   
4     0.0  0.0     0.0          0.0        0.0         0.0  0.0    0.0   
...   ...  ...     ...          ...        ...         ...  ...    ...   
5567  0.0  0.0     0.0          0.0        0.0         0.0  0.0    0.0   
5568  0.0  0.0     0.0          0.0        0.0         0.0  0.0    0.0   
5569  0.0  0.0     0.0          0.0        0.0         0.0  0.0    0.0   
5570  0.0  0.0     0.0          0.0        0.0         0.0  0.0    0.0   
5571  0.0  0.0     0.0          0.0        0.0         0.0  0.0    0.0   

      abdomen  abeg  ...  zed  zero   zf  zhong  zindgi  zoe  zogtorius  zoom  \
0         0.0   0.0  ...  0.0 

**Logistic regression model on TF IDF and BoW**

In [42]:
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(tfidf_matrix, text['v1'], test_size=0.2, random_state=42)

Logistic Regression Model with TF IDF

In [43]:
# Train Logistic Regression with TF-IDF
logreg_tfidf = LogisticRegression()
logreg_tfidf.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred_tfidf = logreg_tfidf.predict(X_test_tfidf)

# Evaluate the model
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
print("Logistic Regression Accuracy with TF-IDF:", accuracy_tfidf)

Logistic Regression Accuracy with TF-IDF: 0.9488789237668162


Logistic Regression Model with BOW

In [44]:
X_train_bow, X_test_bow, y_train, y_test = train_test_split(bow_matrix, text['v1'], test_size=0.2, random_state=42)

In [45]:
# Train Logistic Regression with Bag of Words
logreg_bow = LogisticRegression()
logreg_bow.fit(X_train_bow, y_train)

# Make predictions on the test set
y_pred_bow = logreg_bow.predict(X_test_bow)

# Evaluate the model
accuracy_bow = accuracy_score(y_test, y_pred_bow)
print("Logistic Regression Accuracy with Bag of Words:", accuracy_bow)

Logistic Regression Accuracy with Bag of Words: 0.9775784753363229


**SVM Model on TF-IDF and Bag of Words**

In [47]:
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(tfidf_matrix, text['v1'], test_size=0.2, random_state=42)

SVM Model on TF-IDF

In [48]:
# Train SVM with TF-IDF
svm_tfidf = SVC(kernel='linear')
svm_tfidf.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred_tfidf = svm_tfidf.predict(X_test_tfidf)

# Evaluate the model
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
print("SVM Accuracy with TF-IDF:", accuracy_tfidf)

SVM Accuracy with TF-IDF: 0.9757847533632287


SVM Model on BOW

In [49]:
# Split the data into train and test sets
X_train_bow, X_test_bow, y_train, y_test = train_test_split(bow_matrix, text['v1'], test_size=0.2, random_state=42)

In [50]:
# Train SVM with Bag of Words
svm_bow = SVC(kernel='linear')
svm_bow.fit(X_train_bow, y_train)

# Make predictions on the test set
y_pred_bow = svm_bow.predict(X_test_bow)

# Evaluate the model
accuracy_bow = accuracy_score(y_test, y_pred_bow)
print("SVM Accuracy with Bag of Words:", accuracy_bow)

SVM Accuracy with Bag of Words: 0.9748878923766816
