In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'spam-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F5338870%2F8870933%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240705%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240705T185255Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D3e21c141f6715ac1101f99551ddfefd805c13f0f34dd935543b653dede19e3d0a325eb919f33019ef3ccd393c4c85b88affae1a4fea25fd49d2b47f50593c0112f7c2093d5f0354887ebb97ba323a4de65eb26df32d1b23c5427f5fb2a78d7ad315a68b044f0f3e25d87c2f3e195fa1ca93df1e2d2fcebd0f504de840ab0e59d916fb209ed68c67471af3e5373a6daa8dbc84c1efad5e705df96b96e1c3dd6ab934305f6087dab707a1156053d7accd29165e8ba7bba2e3f2b9e197f970658e6465f26998ffe45741d08016475d2520121cc70de6eea362a860b60e830ad7f3ed9676dbe16efbc814543b65c17fe7a3e9e00daaaf27a26529252594573313624'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Expose the Kaggle directories as ../input and ../working. A link that is
# already present (e.g. from an earlier run of this cell) is left untouched.
for link_target, link_name in ((KAGGLE_INPUT_PATH, 'input'), (KAGGLE_WORKING_PATH, 'working')):
    try:
        os.symlink(link_target, os.path.join("..", link_name), target_is_directory=True)
    except FileExistsError:
        pass

# Download every "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING
# into a temporary file and unpack it under KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first colon only, so a stray ':' in the URL part cannot
    # break the two-value unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # The header may be absent (e.g. chunked transfer); in that case the
            # progress bar simply stays empty instead of crashing on int(None).
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_length)) if total_length else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by
            # name and would otherwise see a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; the original rebound the
                # imported `tarfile` module to the opened archive object.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Checked before OSError because HTTPError is a subclass of it.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading spam-dataset, 215934 bytes compressed
Downloaded and uncompressed: spam-dataset
Data source import complete.


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Modules

In [None]:
import nltk


# Taking input

In [None]:
df=pd.read_csv("/kaggle/input/spam-dataset/spam.csv",encoding='latin-1')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [None]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
# Drop the three mostly-empty overflow columns. errors='ignore' keeps the cell
# re-runnable: a second execution no longer raises KeyError for columns that
# are already gone.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [None]:
# Give the two remaining columns descriptive names.
df = df.rename(columns={'v1': 'labels', 'v2': 'text'})

In [None]:
df.head()

Unnamed: 0,labels,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
df.info()
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   labels  5572 non-null   object
 1   text    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


(5572, 2)

In [None]:
# First row, first column. A single positional lookup avoids the chained
# `iloc[0][0]`, whose second step indexes a string-labelled Series by integer
# position.
df.iloc[0, 0]

'ham'

# Preprocessing

In [None]:
import re
from nltk.corpus import stopwords,wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
!unzip -o /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words('english')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...


unzip:  cannot find or open /usr/share/nltk_data/corpora/wordnet.zip, /usr/share/nltk_data/corpora/wordnet.zip.zip or /usr/share/nltk_data/corpora/wordnet.zip.ZIP.


[nltk_data]   Unzipping corpora/words.zip.


In [None]:
def preprocess(corpus):
    """Lower-case and tokenise a message, dropping stop words and punctuation.

    Parameters
    ----------
    corpus : str
        Raw message text.

    Returns
    -------
    list of str
        Tokens in their original order. Tokens found in the module-level
        ``stop_words`` are removed first; then every character that is neither
        a word character nor whitespace is stripped from each remaining token,
        and tokens left empty are discarded. No lemmatisation is applied.
    """
    cleaned = []
    for token in word_tokenize(corpus.lower()):
        if token in stop_words:
            continue
        stripped = re.sub(r'[^\w\s]', '', token)
        if stripped:
            cleaned.append(stripped)
    return cleaned



In [None]:
import gensim.downloader as api

# Load GloVe model
glove_model = api.load("glove-wiki-gigaword-100")  # 100-dimensional GloVe vectors




In [None]:
model=glove_model
def text_to_embeddings(text, word_vectors=None):
    """Return the mean word vector of the in-vocabulary tokens of ``text``.

    Parameters
    ----------
    text : iterable of str
        Tokens of one message.
    word_vectors : optional
        Object exposing ``vector_size``, ``in`` membership and ``[]`` lookup.
        Defaults to the module-level ``glove_model``.

    Returns
    -------
    numpy.ndarray
        Vector of length ``word_vectors.vector_size``; all zeros when no token
        is in the vocabulary (including empty input).
    """
    if word_vectors is None:
        # Use the GloVe vectors by their own name rather than the generic
        # global `model`, which a later cell rebinds to a Keras network.
        word_vectors = glove_model

    embedding = np.zeros(word_vectors.vector_size)
    valid_word = 0
    for word in text:
        if word in word_vectors:
            embedding += word_vectors[word]
            valid_word += 1
    # Guard the division so messages without known words stay at zero.
    if valid_word > 0:
        embedding /= valid_word

    return embedding

In [None]:
# Tokenise every message once.
tokenized_messages = [preprocess(message) for message in df['text']]

# Progress sample: every 100th message with its label, in the same format as
# before.
for i in range(0, len(tokenized_messages), 100):
    print(f'{i}.{df["labels"].iloc[i]}:{tokenized_messages[i]}')

# Replace the text column in one assignment. The previous
# `df.iloc[i][1] = ...` was a chained assignment that writes into a temporary
# row object and is not guaranteed to reach the frame.
df['text'] = pd.Series(
    [text_to_embeddings(tokens) for tokens in tokenized_messages],
    index=df.index,
    dtype=object,
)




0.ham:['go', 'jurong', 'point', 'crazy', 'available', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amore', 'wat']
100.ham:['okay', 'name', 'ur', 'price', 'long', 'legal', 'wen', 'pick', 'u', 'ave', 'x', 'ams', 'xx']
200.ham:['sent', 'lt', 'gt', 'bucks']
300.ham:['awesome', 'remember', 'last', 'time', 'got', 'somebody', 'high', 'first', 'time', 'diesel', 'v']
400.spam:['free', 'ringtone', 'text', 'first', '87131', 'poly', 'text', 'get', '87131', 'true', 'tone', 'help', '0845', '2814032', '16', '1st', 'free', 'tones', '3xå150pw', 'eånd', 'txt', 'stop']
500.ham:['fighting', 'world', 'easy', 'u', 'either', 'win', 'lose', 'bt', 'fightng', 'some1', 'close', 'u', 'dificult', 'u', 'lose', 'u', 'lose', 'u', 'win', 'u', 'still', 'lose']
600.ham:['mind', 'blastin', 'tsunamis', 'occur', 'rajnikant', 'stopped', 'swimming', 'indian', 'ocean', 'd']
700.ham:['much', 'r', 'ì_', 'willing', 'pay']
800.spam:['last', 'chance', 'claim', 'ur', 'å150', 'worth', 'discount', 'vouchers', 

# Splitting the dataset

In [None]:
# Target: the label column. Features: the per-message vectors of the text
# column stacked into one array.
Y = df['labels']
X = np.array(list(df['text']))

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the string labels as integers; the encoder is kept so the mapping can
# be inspected or inverted later. The classification reports below show the
# resulting classes as 0 and 1.
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(Y)

In [None]:
X[0]

array([-0.05918937,  0.07337588,  0.25856538, -0.02353659, -0.15043531,
        0.11440406,  0.04923962,  0.24415666,  0.02678226, -0.12291641,
        0.24561794, -0.0471365 , -0.07250625,  0.02821593,  0.03220781,
       -0.16815124,  0.17508032, -0.01834675, -0.25474105,  0.32546962,
        0.47869151,  0.26115429,  0.00677612, -0.11002887,  0.25488084,
        0.1717145 ,  0.10902219, -0.06052206,  0.13065594, -0.23724187,
       -0.12778981,  0.27703525,  0.07325162,  0.16444562,  0.18835919,
        0.24226403,  0.04159175,  0.21951931,  0.13562988, -0.26649018,
        0.04803331, -0.03659412, -0.24343425, -0.30099068,  0.11508219,
        0.25914812, -0.10881699, -0.18706569,  0.09885563, -0.10801812,
       -0.27193513,  0.16061344, -0.007965  ,  0.17332376, -0.50743412,
       -1.35893125, -0.12336569,  0.21353975,  0.84468373,  0.07358543,
       -0.36472579,  0.34791343, -0.37412387, -0.15056607,  0.30511539,
        0.07264219,  0.18922813, -0.00659894,  0.20287397,  0.01

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# the same across runs.
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=42)

In [None]:

# Dictionary to store model names and their accuracies
model_accuracies = {}


# Applying models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Logistic regression with default settings.
lr = LogisticRegression()
lr.fit(X_train, y_train)
prediction_lr = lr.predict(X_test)
# accuracy_score takes (y_true, y_pred); compute it once and reuse the value.
accuracy_lr = accuracy_score(y_test, prediction_lr)
model_accuracies['Logistic Regression'] = accuracy_lr
print(f'Accuracy:{accuracy_lr}')


Accuracy:0.9291479820627803


In [None]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel.
svc = SVC(kernel='linear')
svc.fit(X_train, y_train)
predictions_svc = svc.predict(X_test)
# accuracy_score takes (y_true, y_pred); compute it once and reuse the value.
accuracy_svc = accuracy_score(y_test, predictions_svc)
model_accuracies['SVM'] = accuracy_svc
print(f'Accuracy: {accuracy_svc}')
print(classification_report(y_test, predictions_svc))


Accuracy: 0.9291479820627803
              precision    recall  f1-score   support

           0       0.95      0.96      0.96       965
           1       0.75      0.71      0.73       150

    accuracy                           0.93      1115
   macro avg       0.85      0.84      0.84      1115
weighted avg       0.93      0.93      0.93      1115



In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees; seeded for repeatable results.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
predictions_rf = rf.predict(X_test)
# accuracy_score takes (y_true, y_pred); compute it once and reuse the value.
accuracy_rf = accuracy_score(y_test, predictions_rf)
model_accuracies['RandomForest'] = accuracy_rf
print(f'Accuracy: {accuracy_rf}')
print(classification_report(y_test, predictions_rf))


Accuracy: 0.9632286995515695
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       0.99      0.73      0.84       150

    accuracy                           0.96      1115
   macro avg       0.98      0.87      0.91      1115
weighted avg       0.96      0.96      0.96      1115



In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with 100 stages; seeded for repeatable results.
gbc = GradientBoostingClassifier(n_estimators=100, random_state=42)
gbc.fit(X_train, y_train)
predictions_gbc = gbc.predict(X_test)
# accuracy_score takes (y_true, y_pred); compute it once and reuse the value.
accuracy_gbc = accuracy_score(y_test, predictions_gbc)
model_accuracies['GradientBoosting'] = accuracy_gbc
print(f'Accuracy: {accuracy_gbc}')
print(classification_report(y_test, predictions_gbc))


Accuracy: 0.9650224215246637
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       965
           1       0.96      0.77      0.86       150

    accuracy                           0.97      1115
   macro avg       0.96      0.88      0.92      1115
weighted avg       0.96      0.97      0.96      1115



In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours with k = 5.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
predictions_knn = knn.predict(X_test)
# accuracy_score takes (y_true, y_pred); compute it once and reuse the value.
accuracy_knn = accuracy_score(y_test, predictions_knn)
model_accuracies['KNN'] = accuracy_knn
print(f'Accuracy: {accuracy_knn}')
print(classification_report(y_test, predictions_knn))


Accuracy: 0.9408071748878923
              precision    recall  f1-score   support

           0       0.98      0.95      0.97       965
           1       0.74      0.86      0.80       150

    accuracy                           0.94      1115
   macro avg       0.86      0.91      0.88      1115
weighted avg       0.95      0.94      0.94      1115



In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings.
nb = GaussianNB()
nb.fit(X_train, y_train)
predictions_nb = nb.predict(X_test)
# accuracy_score takes (y_true, y_pred); compute it once and reuse the value.
accuracy_nb = accuracy_score(y_test, predictions_nb)
model_accuracies['GaussianNB'] = accuracy_nb
print(f'Accuracy: {accuracy_nb}')
print(classification_report(y_test, predictions_nb))


Accuracy: 0.8663677130044843
              precision    recall  f1-score   support

           0       0.98      0.87      0.92       965
           1       0.50      0.87      0.64       150

    accuracy                           0.87      1115
   macro avg       0.74      0.87      0.78      1115
weighted avg       0.91      0.87      0.88      1115



In [None]:
import xgboost as xgb
from xgboost import XGBClassifier

# The target has two classes (0/1), so use the binary log-loss metric rather
# than the multi-class 'mlogloss'.
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_clf.fit(X_train, y_train)
predictions_xgb = xgb_clf.predict(X_test)
# accuracy_score takes (y_true, y_pred); compute it once and reuse the value.
accuracy_xgb = accuracy_score(y_test, predictions_xgb)
model_accuracies['XGBoost'] = accuracy_xgb
print(f'Accuracy: {accuracy_xgb}')
print(classification_report(y_test, predictions_xgb))


Accuracy: 0.9695067264573991
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       965
           1       0.96      0.81      0.88       150

    accuracy                           0.97      1115
   macro avg       0.97      0.90      0.93      1115
weighted avg       0.97      0.97      0.97      1115



# Using neural networks

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense

In [None]:
X_train[0].shape

(100,)

In [None]:
# Fully connected binary classifier: four ReLU hidden layers narrowing from 512
# to 32 units, then a single sigmoid output unit.
# The input width is taken from the training matrix instead of being
# hard-coded, so a different embedding size needs no edit here.
# NOTE: this rebinds the global name `model`, which earlier in the notebook was
# assigned the GloVe vectors.
model = Sequential([
    Dense(512, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(256, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])


In [None]:
# Binary cross-entropy loss with the Adam optimizer; accuracy is reported as
# the training metric.
model.compile(loss='binary_crossentropy',optimizer='Adam',metrics=['accuracy'])

In [None]:
# Train for 25 epochs on the training split. No validation data is passed, so
# `history` only holds training-set metrics.
history=model.fit(X_train,y_train,epochs=25,verbose=1)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [None]:
# The network ends in a single sigmoid unit, so each prediction is a
# probability; threshold at 0.5 to obtain class labels.
predictions_tf = model.predict(X_test)
# Flatten to 1-D so the predictions have the same shape as y_test.
predictions_tf_binary = (predictions_tf > 0.5).astype(int).ravel()
# accuracy_score takes (y_true, y_pred); compute it once and reuse the value.
accuracy_tf = accuracy_score(y_test, predictions_tf_binary)
model_accuracies['Tensorflow'] = accuracy_tf
print(f'Accuracy: {accuracy_tf}')



Accuracy: 0.9704035874439462


In [None]:
# Tabulate the collected accuracies, one row per model, best model first.
accuracy_df = pd.DataFrame({
    'Model': list(model_accuracies.keys()),
    'Accuracy': list(model_accuracies.values()),
})
accuracy_df_sorted = accuracy_df.sort_values('Accuracy', ascending=False)

print(accuracy_df_sorted)

                 Model  Accuracy
7           Tensorflow  0.970404
6              XGBoost  0.969507
3     GradientBoosting  0.965022
2         RandomForest  0.963229
4                  KNN  0.940807
1                  SVM  0.929148
0  Logistic Regression  0.929148
5           GaussianNB  0.866368
