In [317]:
# main_df = None

main_train_df = None
main_test_df = None

In [9]:
import os
import kagglehub
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

def prepare_dataset(dataset, label, text):
  dataset = dataset.copy()

  encoder = LabelEncoder()
  dataset[label] = encoder.fit_transform(dataset[label])

  dataset = dataset.rename(columns={label: 'label', text: 'text'})
  dataset = dataset.drop_duplicates(keep = 'first')
  dataset = dataset.dropna()

  dataset = dataset[['label', 'text']]

  print(dataset.head())
  print(dataset.isnull().sum())
  print(dataset['label'].unique())
  print(dataset.shape)

  return dataset

# def concat_with(dataset):
#   global main_df

#   if main_df is None:
#     main_df = dataset
#   else:
#     dataset = dataset[['label', 'text']]
#     main_df = pd.concat([main_df, dataset], ignore_index=True)

#   main_df = main_df.drop_duplicates(subset=['label', 'text'], keep='first')
#   main_df = main_df.dropna()

#   print(main_df.head())
#   print(main_df.isnull().sum())
#   print(main_df['label'].unique())
#   print(main_df.shape)

#   return main_df

main_train_df = None
main_test_df = None

def concat_with(dataset):
    # TODO: delete empty rows
    global main_train_df, main_test_df

    dataset = dataset[['label', 'text']].drop_duplicates(subset=['label', 'text'], keep='first').dropna()

    if main_train_df is not None:
        dataset = dataset[~dataset.apply(lambda row: ((row['label'], row['text']) in zip(main_train_df['label'], main_train_df['text'])), axis=1)]

    if main_test_df is not None:
        dataset = dataset[~dataset.apply(lambda row: ((row['label'], row['text']) in zip(main_test_df['label'], main_test_df['text'])), axis=1)]

    if dataset.empty:
        return main_train_df, main_test_df

    train_df, test_df = train_test_split(dataset, test_size=0.2, random_state=42)

    if main_train_df is None:
        main_train_df = train_df
    else:
        main_train_df = pd.concat([main_train_df, train_df], ignore_index=True)

    if main_test_df is None:
        main_test_df = test_df
    else:
        main_test_df = pd.concat([main_test_df, test_df], ignore_index=True)

    print("Train shape:", main_train_df.shape)
    print("Test shape:", main_test_df.shape)

    return main_train_df, main_test_df

In [319]:
# Download latest version
path = kagglehub.dataset_download("abdallahwagih/spam-emails")

files = os.listdir(path)
print("Files in dataset directory:", files)

csv_file_path = os.path.join(path, "spam.csv")
df = pd.read_csv(csv_file_path)

print(df.head())

Files in dataset directory: ['spam.csv']
  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


In [320]:
df = prepare_dataset(df, "Category", "Message")

   label                                               text
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...
label    0
text     0
dtype: int64
[0 1]
(5157, 2)


In [321]:
concat_with(df)

Train shape: (4125, 2)
Test shape: (1032, 2)


(      label                                               text
 2598      0  Got fujitsu, ibm, hp, toshiba... Got a lot of ...
 5418      0  So how are you really. What are you up to. How...
 99        0                    I see a cup of coffee animation
 2321      0      This pain couldn't have come at a worse time.
 2388      0                             Also where's the piece
 ...     ...                                                ...
 4750      0  Thanx u darlin!im cool thanx. A few bday drink...
 474       1  Want 2 get laid tonight? Want real Dogging loc...
 3273      0  MOON has come to color your dreams, STARS to m...
 4022      0              We have to pick rayan macleran there.
 882       0  see, i knew giving you a break a few times wou...
 
 [4125 rows x 2 columns],
       label                                               text
 3031      0  Also sir, i sent you an email about how to log...
 495       0                   Are you free now?can i call now?
 2942      0

In [322]:
# Download latest version
path = kagglehub.dataset_download("uciml/sms-spam-collection-dataset")

files = os.listdir(path)
print("Files in dataset directory:", files)

csv_file_path = os.path.join(path, "spam.csv")
df = pd.read_csv(csv_file_path, encoding='latin1')

df.drop(columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace = True)

print(df.head())

Files in dataset directory: ['spam.csv']
     v1                                                 v2
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [323]:
df = prepare_dataset(df, "v1", "v2")

   label                                               text
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...
label    0
text     0
dtype: int64
[0 1]
(5169, 2)


In [324]:
concat_with(df)

Train shape: (4694, 2)
Test shape: (1175, 2)


(      label                                               text
 0         0  Got fujitsu, ibm, hp, toshiba... Got a lot of ...
 1         0  So how are you really. What are you up to. How...
 2         0                    I see a cup of coffee animation
 3         0      This pain couldn't have come at a worse time.
 4         0                             Also where's the piece
 ...     ...                                                ...
 4689      1  our mobile number has won å£5000, to claim cal...
 4690      1  5 Free Top Polyphonic Tones call 087018728737,...
 4691      0  Beautiful Truth against Gravity.. Read careful...
 4692      0  'An Amazing Quote'' - \Sometimes in life its d...
 4693      0  Wishing you and your family Merry \X\" mas and...
 
 [4694 rows x 2 columns],
       label                                               text
 0         0  Also sir, i sent you an email about how to log...
 1         0                   Are you free now?can i call now?
 2         0

In [325]:
# Download latest version
path = kagglehub.dataset_download("ashfakyeafi/spam-email-classification")

files = os.listdir(path)
print("Files in dataset directory:", files)

csv_file_path = os.path.join(path, "email.csv")
df = pd.read_csv(csv_file_path)

print(df.head())

Files in dataset directory: ['email.csv']
  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


In [326]:
df["Category"].unique()

array(['ham', 'spam', '{"mode":"full"'], dtype=object)

In [327]:
df = df[df['Category'] != '{"mode":"full"']
df["Category"].unique()

array(['ham', 'spam'], dtype=object)

In [328]:
df = prepare_dataset(df, "Category", "Message")

   label                                               text
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...
label    0
text     0
dtype: int64
[0 1]
(5157, 2)


In [329]:
concat_with(df)

(      label                                               text
 0         0  Got fujitsu, ibm, hp, toshiba... Got a lot of ...
 1         0  So how are you really. What are you up to. How...
 2         0                    I see a cup of coffee animation
 3         0      This pain couldn't have come at a worse time.
 4         0                             Also where's the piece
 ...     ...                                                ...
 4689      1  our mobile number has won å£5000, to claim cal...
 4690      1  5 Free Top Polyphonic Tones call 087018728737,...
 4691      0  Beautiful Truth against Gravity.. Read careful...
 4692      0  'An Amazing Quote'' - \Sometimes in life its d...
 4693      0  Wishing you and your family Merry \X\" mas and...
 
 [4694 rows x 2 columns],
       label                                               text
 0         0  Also sir, i sent you an email about how to log...
 1         0                   Are you free now?can i call now?
 2         0

In [330]:
# Download latest version
path = kagglehub.dataset_download("ozlerhakan/spam-or-not-spam-dataset")

files = os.listdir(path)
print("Files in dataset directory:", files)

csv_file_path = os.path.join(path, "spam_or_not_spam.csv")
df = pd.read_csv(csv_file_path)

print(df.head())

Files in dataset directory: ['spam_or_not_spam.csv']
                                               email  label
0   date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...      0
1  martin a posted tassos papadopoulos the greek ...      0
2  man threatens explosion in moscow thursday aug...      0
3  klez the virus that won t die already the most...      0
4   in adding cream to spaghetti carbonara which ...      0


In [331]:
df = prepare_dataset(df, "label", "email")

   label                                               text
0      0   date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...
1      0  martin a posted tassos papadopoulos the greek ...
2      0  man threatens explosion in moscow thursday aug...
3      0  klez the virus that won t die already the most...
4      0   in adding cream to spaghetti carbonara which ...
label    0
text     0
dtype: int64
[0 1]
(2872, 2)


In [332]:
df[df["label"] == 1]["text"].iloc[0]

' save up to NUMBER on life insurance why spend more than you have to life quote savings ensuring your family s financial security is very important life quote savings makes buying life insurance simple and affordable we provide free access to the very best companies and the lowest rates life quote savings is fast easy and saves you money let us help you get started with the best values in the country on new coverage you can save hundreds or even thousands of dollars by requesting a free quote from lifequote savings our service will take you less than NUMBER minutes to complete shop and compare save up to NUMBER on all types of life insurance hyperlink click here for your free quote protecting your family is the best investment you ll ever make if you are in receipt of this email in error and or wish to be removed from our list hyperlink please click here and type remove if you reside in any state which prohibits e mail solicitations for insurance please disregard this email '

In [333]:
concat_with(df)

Train shape: (6991, 2)
Test shape: (1750, 2)


(      label                                               text
 0         0  Got fujitsu, ibm, hp, toshiba... Got a lot of ...
 1         0  So how are you really. What are you up to. How...
 2         0                    I see a cup of coffee animation
 3         0      This pain couldn't have come at a worse time.
 4         0                             Also where's the piece
 ...     ...                                                ...
 6986      0  on NUMBER sep NUMBER at NUMBER NUMBER guido va...
 6987      0  on wed feb NUMBER NUMBER at NUMBER NUMBER NUMB...
 6988      0   it seems that something changed during the la...
 6989      0  on mon NUMBER oct NUMBER jesse keating wrote o...
 6990      0   over on arstechnica www arstechnica com i saw...
 
 [6991 rows x 2 columns],
       label                                               text
 0         0  Also sir, i sent you an email about how to log...
 1         0                   Are you free now?can i call now?
 2         0

In [334]:
# Download latest version
path = kagglehub.dataset_download("venky73/spam-mails-dataset")

files = os.listdir(path)
print("Files in dataset directory:", files)

csv_file_path = os.path.join(path, "spam_ham_dataset.csv")
df = pd.read_csv(csv_file_path)

print(df.head())

Files in dataset directory: ['spam_ham_dataset.csv']
   Unnamed: 0 label                                               text  \
0         605   ham  Subject: enron methanol ; meter # : 988291\r\n...   
1        2349   ham  Subject: hpl nom for january 9 , 2001\r\n( see...   
2        3624   ham  Subject: neon retreat\r\nho ho ho , we ' re ar...   
3        4685  spam  Subject: photoshop , windows , office . cheap ...   
4        2030   ham  Subject: re : indian springs\r\nthis deal is t...   

   label_num  
0          0  
1          0  
2          0  
3          1  
4          0  


In [335]:
df = prepare_dataset(df, "label", "text")

   label                                               text
0      0  Subject: enron methanol ; meter # : 988291\r\n...
1      0  Subject: hpl nom for january 9 , 2001\r\n( see...
2      0  Subject: neon retreat\r\nho ho ho , we ' re ar...
3      1  Subject: photoshop , windows , office . cheap ...
4      0  Subject: re : indian springs\r\nthis deal is t...
label    0
text     0
dtype: int64
[0 1]
(5171, 2)


In [336]:
df['text'] = df['text'].str.replace(r'Subject:\s*', '', regex=True)
print(df.head())

   label                                               text
0      0  enron methanol ; meter # : 988291\r\nthis is a...
1      0  hpl nom for january 9 , 2001\r\n( see attached...
2      0  neon retreat\r\nho ho ho , we ' re around to t...
3      1  photoshop , windows , office . cheap . main tr...
4      0  re : indian springs\r\nthis deal is to book th...


In [337]:
df[df["label"] == 1]["text"].iloc[0]

'photoshop , windows , office . cheap . main trending\r\nabasements darer prudently fortuitous undergone\r\nlighthearted charm orinoco taster\r\nrailroad affluent pornographic cuvier\r\nirvin parkhouse blameworthy chlorophyll\r\nrobed diagrammatic fogarty clears bayda\r\ninconveniencing managing represented smartness hashish\r\nacademies shareholders unload badness\r\ndanielson pure caffein\r\nspaniard chargeable levin\r\n'

In [338]:
concat_with(df)

Train shape: (10985, 2)
Test shape: (2749, 2)


(       label                                               text
 0          0  Got fujitsu, ibm, hp, toshiba... Got a lot of ...
 1          0  So how are you really. What are you up to. How...
 2          0                    I see a cup of coffee animation
 3          0      This pain couldn't have come at a worse time.
 4          0                             Also where's the piece
 ...      ...                                                ...
 10980      1  get your hand clock repliacs todday carson\r\n...
 10981      1  a huge load inside her pussy .\r\nembattle sla...
 10982      1                         best software prices .\r\n
 10983      0  natural gas nomination for december 2000 - - r...
 10984      0  defs may 2001\r\ndaren :\r\nplease enter a dem...
 
 [10985 rows x 2 columns],
       label                                               text
 0         0  Also sir, i sent you an email about how to log...
 1         0                   Are you free now?can i call now?

In [339]:
# Download latest version
path = kagglehub.dataset_download("mfaisalqureshi/spam-email")

files = os.listdir(path)
print("Files in dataset directory:", files)

csv_file_path = os.path.join(path, "spam.csv")
df = pd.read_csv(csv_file_path)

print(df.head())

Files in dataset directory: ['spam.csv']
  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


In [340]:
df = prepare_dataset(df, "Category", "Message")

   label                                               text
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...
label    0
text     0
dtype: int64
[0 1]
(5157, 2)


In [341]:
concat_with(df)

(       label                                               text
 0          0  Got fujitsu, ibm, hp, toshiba... Got a lot of ...
 1          0  So how are you really. What are you up to. How...
 2          0                    I see a cup of coffee animation
 3          0      This pain couldn't have come at a worse time.
 4          0                             Also where's the piece
 ...      ...                                                ...
 10980      1  get your hand clock repliacs todday carson\r\n...
 10981      1  a huge load inside her pussy .\r\nembattle sla...
 10982      1                         best software prices .\r\n
 10983      0  natural gas nomination for december 2000 - - r...
 10984      0  defs may 2001\r\ndaren :\r\nplease enter a dem...
 
 [10985 rows x 2 columns],
       label                                               text
 0         0  Also sir, i sent you an email about how to log...
 1         0                   Are you free now?can i call now?

In [342]:
# Download latest version
path = kagglehub.dataset_download("purusinghvi/email-spam-classification-dataset")

files = os.listdir(path)
print("Files in dataset directory:", files)

csv_file_path = os.path.join(path, "combined_data.csv")
df = pd.read_csv(csv_file_path)

print(df.head())

Files in dataset directory: ['combined_data.csv']
   label                                               text
0      1  ounce feather bowl hummingbird opec moment ala...
1      1  wulvob get your medircations online qnb ikud v...
2      0   computer connection from cnn com wednesday es...
3      1  university degree obtain a prosperous future m...
4      0  thanks for all your answers guys i know i shou...


In [343]:
df = prepare_dataset(df, "label", "text")

   label                                               text
0      1  ounce feather bowl hummingbird opec moment ala...
1      1  wulvob get your medircations online qnb ikud v...
2      0   computer connection from cnn com wednesday es...
3      1  university degree obtain a prosperous future m...
4      0  thanks for all your answers guys i know i shou...
label    0
text     0
dtype: int64
[1 0]
(83448, 2)


In [344]:
concat_with(df)

Train shape: (77743, 2)
Test shape: (19439, 2)


(       label                                               text
 0          0  Got fujitsu, ibm, hp, toshiba... Got a lot of ...
 1          0  So how are you really. What are you up to. How...
 2          0                    I see a cup of coffee animation
 3          0      This pain couldn't have come at a worse time.
 4          0                             Also where's the piece
 ...      ...                                                ...
 77738      0  anita . from our conversation today with daren...
 77739      0  business highlights\nenron freight markets\nen...
 77740      0  i am rebuilding r in a mandriva linux environm...
 77741      0  alternative medicine database over escapenumbe...
 77742      0   escapenumberfxml version escapenumberd escape...
 
 [77743 rows x 2 columns],
        label                                               text
 0          0  Also sir, i sent you an email about how to log...
 1          0                   Are you free now?can i call n

In [345]:
# Download latest version
path = kagglehub.dataset_download("team-ai/spam-text-message-classification")

files = os.listdir(path)
print("Files in dataset directory:", files)

csv_file_path = os.path.join(path, "SPAM text message 20170820 - Data.csv")
df = pd.read_csv(csv_file_path)

print(df.head())

Files in dataset directory: ['SPAM text message 20170820 - Data.csv']
  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


In [346]:
df = prepare_dataset(df, "Category", "Message")

   label                                               text
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...
label    0
text     0
dtype: int64
[0 1]
(5157, 2)


In [347]:
concat_with(df)

(       label                                               text
 0          0  Got fujitsu, ibm, hp, toshiba... Got a lot of ...
 1          0  So how are you really. What are you up to. How...
 2          0                    I see a cup of coffee animation
 3          0      This pain couldn't have come at a worse time.
 4          0                             Also where's the piece
 ...      ...                                                ...
 77738      0  anita . from our conversation today with daren...
 77739      0  business highlights\nenron freight markets\nen...
 77740      0  i am rebuilding r in a mandriva linux environm...
 77741      0  alternative medicine database over escapenumbe...
 77742      0   escapenumberfxml version escapenumberd escape...
 
 [77743 rows x 2 columns],
        label                                               text
 0          0  Also sir, i sent you an email about how to log...
 1          0                   Are you free now?can i call n

In [348]:
# Download latest version
path = kagglehub.dataset_download("jackksoncsie/spam-email-dataset")

files = os.listdir(path)
print("Files in dataset directory:", files)

csv_file_path = os.path.join(path, "emails.csv")
df = pd.read_csv(csv_file_path)

print(df.head())

Files in dataset directory: ['emails.csv']
                                                text  spam
0  Subject: naturally irresistible your corporate...     1
1  Subject: the stock trading gunslinger  fanny i...     1
2  Subject: unbelievable new homes made easy  im ...     1
3  Subject: 4 color printing special  request add...     1
4  Subject: do not have money , get software cds ...     1


In [349]:
df = prepare_dataset(df, "spam", "text")

   label                                               text
0      1  Subject: naturally irresistible your corporate...
1      1  Subject: the stock trading gunslinger  fanny i...
2      1  Subject: unbelievable new homes made easy  im ...
3      1  Subject: 4 color printing special  request add...
4      1  Subject: do not have money , get software cds ...
label    0
text     0
dtype: int64
[1 0]
(5695, 2)


In [350]:
df['text'] = df['text'].str.replace(r'Subject:\s*', '', regex=True)

In [351]:
concat_with(df)

Train shape: (82299, 2)
Test shape: (20578, 2)


(       label                                               text
 0          0  Got fujitsu, ibm, hp, toshiba... Got a lot of ...
 1          0  So how are you really. What are you up to. How...
 2          0                    I see a cup of coffee animation
 3          0      This pain couldn't have come at a worse time.
 4          0                             Also where's the piece
 ...      ...                                                ...
 82294      0  thomas knudsen  hi vince  i met with thomas th...
 82295      0  sevil yamin  vince ,  do you want me to do thi...
 82296      0  re : a request  zimin ,  i also enjoyed our ta...
 82297      0  6 / 30 aga forecast at 66  mike ,  my number f...
 82298      1  perfect visual solution for your business now ...
 
 [82299 rows x 2 columns],
        label                                               text
 0          0  Also sir, i sent you an email about how to log...
 1          0                   Are you free now?can i call n

In [352]:
# Download latest version
path = kagglehub.dataset_download("nitishabharathi/email-spam-dataset")

files = os.listdir(path)
print("Files in dataset directory:", files)

csv_file_path = os.path.join(path, "completeSpamAssassin.csv")
df = pd.read_csv(csv_file_path)

print(df.head())

Files in dataset directory: ['enronSpamSubset.csv', 'lingSpam.csv', 'completeSpamAssassin.csv']
   Unnamed: 0                                               Body  Label
0           0  \nSave up to 70% on Life Insurance.\nWhy Spend...      1
1           1  1) Fight The Risk of Cancer!\nhttp://www.adcli...      1
2           2  1) Fight The Risk of Cancer!\nhttp://www.adcli...      1
3           3  ##############################################...      1
4           4  I thought you might like these:\n1) Slim Down ...      1


In [353]:
df = prepare_dataset(df, "Label", "Body")

   label                                               text
0      1  \nSave up to 70% on Life Insurance.\nWhy Spend...
1      1  1) Fight The Risk of Cancer!\nhttp://www.adcli...
2      1  1) Fight The Risk of Cancer!\nhttp://www.adcli...
3      1  ##############################################...
4      1  I thought you might like these:\n1) Slim Down ...
label    0
text     0
dtype: int64
[1 0]
(6045, 2)


In [354]:
concat_with(df)

Train shape: (86533, 2)
Test shape: (21637, 2)


(       label                                               text
 0          0  Got fujitsu, ibm, hp, toshiba... Got a lot of ...
 1          0  So how are you really. What are you up to. How...
 2          0                    I see a cup of coffee animation
 3          0      This pain couldn't have come at a worse time.
 4          0                             Also where's the piece
 ...      ...                                                ...
 86528      0  \nForwarded-by: Chris Wedgwood \nFrom: Bert01....
 86529      0  On Wed, Aug 21, 2002 at 02:52:11PM +0800, al@m...
 86530      0  \nZDNet AnchorDesk NewsletterTHURSDAY, JULY 18...
 86531      0  SEARCHSECURITY | Security and Industry News\nJ...
 86532      1  \nBODY {font-family="Arial"}\nTT {font-family=...
 
 [86533 rows x 2 columns],
        label                                               text
 0          0  Also sir, i sent you an email about how to log...
 1          0                   Are you free now?can i call n

In [355]:
# Download latest version
path = kagglehub.dataset_download("nitishabharathi/email-spam-dataset")

files = os.listdir(path)
print("Files in dataset directory:", files)

csv_file_path = os.path.join(path, "lingSpam.csv")
df = pd.read_csv(csv_file_path)

print(df.head())

Files in dataset directory: ['enronSpamSubset.csv', 'lingSpam.csv', 'completeSpamAssassin.csv']
   Unnamed: 0                                               Body  Label
0           0  Subject: great part-time or summer job !\n \n ...      1
1           1  Subject: auto insurance rates too high ?\n \n ...      1
2           2  Subject: do want the best and economical hunti...      1
3           3  Subject: email 57 million people for $ 99\n \n...      1
4           4  Subject: do n't miss these !\n \n attention ! ...      1


In [356]:
df = prepare_dataset(df, "Label", "Body")

   label                                               text
0      1  Subject: great part-time or summer job !\n \n ...
1      1  Subject: auto insurance rates too high ?\n \n ...
2      1  Subject: do want the best and economical hunti...
3      1  Subject: email 57 million people for $ 99\n \n...
4      1  Subject: do n't miss these !\n \n attention ! ...
label    0
text     0
dtype: int64
[1 0]
(2605, 2)


In [357]:
df['text'] = df['text'].str.replace(r'Subject:\s*', '', regex=True)

In [358]:
concat_with(df)

Train shape: (88605, 2)
Test shape: (22156, 2)


(       label                                               text
 0          0  Got fujitsu, ibm, hp, toshiba... Got a lot of ...
 1          0  So how are you really. What are you up to. How...
 2          0                    I see a cup of coffee animation
 3          0      This pain couldn't have come at a worse time.
 4          0                             Also where's the piece
 ...      ...                                                ...
 88600      0  jireem @ utxvms . cc . utexas . edu\n \n does ...
 88601      0  references on non-human language\n \n content ...
 88602      0  call for papers : linguistics session of the m...
 88603      0  inquiry re : slang and rock music\n \n i am wo...
 88604      0  are most people bilingual ? - - summary\n \n a...
 
 [88605 rows x 2 columns],
        label                                               text
 0          0  Also sir, i sent you an email about how to log...
 1          0                   Are you free now?can i call n

In [359]:
# Download latest version
path = kagglehub.dataset_download("nitishabharathi/email-spam-dataset")

files = os.listdir(path)
print("Files in dataset directory:", files)

csv_file_path = os.path.join(path, "enronSpamSubset.csv")
df = pd.read_csv(csv_file_path)

print(df.head())

Files in dataset directory: ['enronSpamSubset.csv', 'lingSpam.csv', 'completeSpamAssassin.csv']
   Unnamed: 0.1  Unnamed: 0  \
0          2469        2469   
1          5063        5063   
2         12564       12564   
3          2796        2796   
4          1468        1468   

                                                Body  Label  
0  Subject: stock promo mover : cwtd\n * * * urge...      1  
1  Subject: are you listed in major search engine...      1  
2  Subject: important information thu , 30 jun 20...      1  
3  Subject: = ? utf - 8 ? q ? bask your life with...      1  
4  Subject: " bidstogo " is places to go , things...      1  


In [360]:
df = prepare_dataset(df, "Label", "Body")

   label                                               text
0      1  Subject: stock promo mover : cwtd\n * * * urge...
1      1  Subject: are you listed in major search engine...
2      1  Subject: important information thu , 30 jun 20...
3      1  Subject: = ? utf - 8 ? q ? bask your life with...
4      1  Subject: " bidstogo " is places to go , things...
label    0
text     0
dtype: int64
[1 0]
(10000, 2)


In [361]:
df['text'] = df['text'].str.replace(r'Subject:\s*', '', regex=True)
print(df.head())

   label                                               text
0      1  stock promo mover : cwtd\n * * * urgent invest...
1      1  are you listed in major search engines ?\n sub...
2      1  important information thu , 30 jun 2005 .\n su...
3      1  = ? utf - 8 ? q ? bask your life with ? =\n = ...
4      1  " bidstogo " is places to go , things to do\n ...


In [362]:
concat_with(df)

Train shape: (96349, 2)
Test shape: (24093, 2)


(       label                                               text
 0          0  Got fujitsu, ibm, hp, toshiba... Got a lot of ...
 1          0  So how are you really. What are you up to. How...
 2          0                    I see a cup of coffee animation
 3          0      This pain couldn't have come at a worse time.
 4          0                             Also where's the piece
 ...      ...                                                ...
 96344      0  re : enron / stanford program\n nick ,\n i spo...
 96345      0  re : tenaska iv 10 / 00\n i don ' t see anythi...
 96346      0  fw : attorney client privledge - important ! n...
 96347      1  fw : neevr seen prono flash animation\n buenos...
 96348      0  january production estimate\n daren / carlos :...
 
 [96349 rows x 2 columns],
        label                                               text
 0          0  Also sir, i sent you an email about how to log...
 1          0                   Are you free now?can i call n

In [363]:
# Download latest version
path = kagglehub.dataset_download("abdmental01/email-spam-dedection")

files = os.listdir(path)
print("Files in dataset directory:", files)

csv_file_path = os.path.join(path, "mail_data.csv")
df = pd.read_csv(csv_file_path)

print(df.head())

Files in dataset directory: ['mail_data.csv']
  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


In [364]:
df = prepare_dataset(df, "Category", "Message")

   label                                               text
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...
label    0
text     0
dtype: int64
[0 1]
(5157, 2)


In [365]:
concat_with(df)

(       label                                               text
 0          0  Got fujitsu, ibm, hp, toshiba... Got a lot of ...
 1          0  So how are you really. What are you up to. How...
 2          0                    I see a cup of coffee animation
 3          0      This pain couldn't have come at a worse time.
 4          0                             Also where's the piece
 ...      ...                                                ...
 96344      0  re : enron / stanford program\n nick ,\n i spo...
 96345      0  re : tenaska iv 10 / 00\n i don ' t see anythi...
 96346      0  fw : attorney client privledge - important ! n...
 96347      1  fw : neevr seen prono flash animation\n buenos...
 96348      0  january production estimate\n daren / carlos :...
 
 [96349 rows x 2 columns],
        label                                               text
 0          0  Also sir, i sent you an email about how to log...
 1          0                   Are you free now?can i call n

In [366]:
# Download latest version
path = kagglehub.dataset_download("noeyislearning/spam-emails")

files = os.listdir(path)
print("Files in dataset directory:", files)

csv_file_path = os.path.join(path, "emails.csv")
df = pd.read_csv(csv_file_path)

print(df.head())

Files in dataset directory: ['emails.csv']
                                                text  spam
0  Subject: naturally irresistible your corporate...     1
1  Subject: the stock trading gunslinger  fanny i...     1
2  Subject: unbelievable new homes made easy  im ...     1
3  Subject: 4 color printing special  request add...     1
4  Subject: do not have money , get software cds ...     1


In [367]:
df = prepare_dataset(df, "spam", "text")

   label                                               text
0      1  Subject: naturally irresistible your corporate...
1      1  Subject: the stock trading gunslinger  fanny i...
2      1  Subject: unbelievable new homes made easy  im ...
3      1  Subject: 4 color printing special  request add...
4      1  Subject: do not have money , get software cds ...
label    0
text     0
dtype: int64
[1 0]
(5695, 2)


In [368]:
df['text'] = df['text'].str.replace(r'Subject:\s*', '', regex=True)

In [369]:
concat_with(df)

(       label                                               text
 0          0  Got fujitsu, ibm, hp, toshiba... Got a lot of ...
 1          0  So how are you really. What are you up to. How...
 2          0                    I see a cup of coffee animation
 3          0      This pain couldn't have come at a worse time.
 4          0                             Also where's the piece
 ...      ...                                                ...
 96344      0  re : enron / stanford program\n nick ,\n i spo...
 96345      0  re : tenaska iv 10 / 00\n i don ' t see anythi...
 96346      0  fw : attorney client privledge - important ! n...
 96347      1  fw : neevr seen prono flash animation\n buenos...
 96348      0  january production estimate\n daren / carlos :...
 
 [96349 rows x 2 columns],
        label                                               text
 0          0  Also sir, i sent you an email about how to log...
 1          0                   Are you free now?can i call n

In [370]:
# Download latest version
path = kagglehub.dataset_download("ahsenwaheed/youtube-comments-spam-dataset")

files = os.listdir(path)
print("Files in dataset directory:", files)

csv_file_path = os.path.join(path, "Youtube-Spam-Dataset.csv")
df = pd.read_csv(csv_file_path)

print(df.head())
print(df.isnull().sum())
print(df['CLASS'].unique())
print(df.shape)

Files in dataset directory: ['Youtube-Spam-Dataset.csv']
                                    COMMENT_ID            AUTHOR  \
0  LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU         Julius NM   
1  LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A       adam riyati   
2  LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8  Evgeny Murashkin   
3          z13jhp0bxqncu512g22wvzkasxmvvzjaz04   ElNino Melendez   
4          z13fwbwp1oujthgqj04chlngpvzmtt3r3dw            GsMega   

                  DATE                                            CONTENT  \
0  2013-11-07T06:20:48  Huh, anyway check out this you[tube] channel: ...   
1  2013-11-07T12:37:15  Hey guys check out my new channel and our firs...   
2  2013-11-08T17:34:21             just for test I have to say murdev.com   
3  2013-11-09T08:28:43   me shaking my sexy ass on my channel enjoy ^_^ ﻿   
4  2013-11-10T16:05:38            watch?v=vtaRGgvGtWQ   Check this out .﻿   

                       VIDEO_NAME  CLASS  
0  PSY - GANGNAM STYLE(?????

In [371]:
df = prepare_dataset(df, "CLASS", "CONTENT")

   label                                               text
0      1  Huh, anyway check out this you[tube] channel: ...
1      1  Hey guys check out my new channel and our firs...
2      1             just for test I have to say murdev.com
3      1   me shaking my sexy ass on my channel enjoy ^_^ ﻿
4      1            watch?v=vtaRGgvGtWQ   Check this out .﻿
label    0
text     0
dtype: int64
[1 0]
(1710, 2)


In [372]:
concat_with(df)

Train shape: (97595, 2)
Test shape: (24405, 2)


(       label                                               text
 0          0  Got fujitsu, ibm, hp, toshiba... Got a lot of ...
 1          0  So how are you really. What are you up to. How...
 2          0                    I see a cup of coffee animation
 3          0      This pain couldn't have come at a worse time.
 4          0                             Also where's the piece
 ...      ...                                                ...
 97590      0  Rihanna is so beautiful and amazing ♥♥♥♥♥love ...
 97591      0                                         waka waka﻿
 97592      0             I hate it when Laura Bennett comes in﻿
 97593      1  Hey Music Fans I really appreciate any of you ...
 97594      0                    I could hear this for years ;3﻿
 
 [97595 rows x 2 columns],
        label                                               text
 0          0  Also sir, i sent you an email about how to log...
 1          0                   Are you free now?can i call n

In [373]:
# Download latest version
path = kagglehub.dataset_download("shantanudhakadd/email-spam-detection-dataset-classification")

files = os.listdir(path)
print("Files in dataset directory:", files)

csv_file_path = os.path.join(path, "spam.csv")
df = pd.read_csv(csv_file_path, encoding='latin1')

df.drop(columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace = True)

print(df.head())

Files in dataset directory: ['spam.csv']
     v1                                                 v2
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [374]:
df = prepare_dataset(df, "v1", "v2")

   label                                               text
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...
label    0
text     0
dtype: int64
[0 1]
(5169, 2)


In [375]:
concat_with(df)

(       label                                               text
 0          0  Got fujitsu, ibm, hp, toshiba... Got a lot of ...
 1          0  So how are you really. What are you up to. How...
 2          0                    I see a cup of coffee animation
 3          0      This pain couldn't have come at a worse time.
 4          0                             Also where's the piece
 ...      ...                                                ...
 97590      0  Rihanna is so beautiful and amazing ♥♥♥♥♥love ...
 97591      0                                         waka waka﻿
 97592      0             I hate it when Laura Bennett comes in﻿
 97593      1  Hey Music Fans I really appreciate any of you ...
 97594      0                    I could hear this for years ;3﻿
 
 [97595 rows x 2 columns],
        label                                               text
 0          0  Also sir, i sent you an email about how to log...
 1          0                   Are you free now?can i call n

In [376]:
# Download latest version
path = kagglehub.dataset_download("karthickveerakumar/spam-filter")

files = os.listdir(path)
print("Files in dataset directory:", files)

csv_file_path = os.path.join(path, "emails.csv")
df = pd.read_csv(csv_file_path)

print(df.head())

Files in dataset directory: ['emails.csv']
                                                text  spam
0  Subject: naturally irresistible your corporate...     1
1  Subject: the stock trading gunslinger  fanny i...     1
2  Subject: unbelievable new homes made easy  im ...     1
3  Subject: 4 color printing special  request add...     1
4  Subject: do not have money , get software cds ...     1


In [377]:
df = prepare_dataset(df, "spam", "text")

   label                                               text
0      1  Subject: naturally irresistible your corporate...
1      1  Subject: the stock trading gunslinger  fanny i...
2      1  Subject: unbelievable new homes made easy  im ...
3      1  Subject: 4 color printing special  request add...
4      1  Subject: do not have money , get software cds ...
label    0
text     0
dtype: int64
[1 0]
(5695, 2)


In [378]:
df['text'] = df['text'].str.replace(r'Subject:\s*', '', regex=True)

In [379]:
df = prepare_dataset(df, "label", "text")

   label                                               text
0      1  naturally irresistible your corporate identity...
1      1  the stock trading gunslinger  fanny is merrill...
2      1  unbelievable new homes made easy  im wanting t...
3      1  4 color printing special  request additional i...
4      1  do not have money , get software cds from here...
label    0
text     0
dtype: int64
[1 0]
(5695, 2)


In [380]:
concat_with(df)

(       label                                               text
 0          0  Got fujitsu, ibm, hp, toshiba... Got a lot of ...
 1          0  So how are you really. What are you up to. How...
 2          0                    I see a cup of coffee animation
 3          0      This pain couldn't have come at a worse time.
 4          0                             Also where's the piece
 ...      ...                                                ...
 97590      0  Rihanna is so beautiful and amazing ♥♥♥♥♥love ...
 97591      0                                         waka waka﻿
 97592      0             I hate it when Laura Bennett comes in﻿
 97593      1  Hey Music Fans I really appreciate any of you ...
 97594      0                    I could hear this for years ;3﻿
 
 [97595 rows x 2 columns],
        label                                               text
 0          0  Also sir, i sent you an email about how to log...
 1          0                   Are you free now?can i call n

In [381]:
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')
nltk.download('wordnet')

class TextPreprocessor:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text):
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        return text

    def to_lowercase(self, text):
        return text.lower()

    def remove_stopwords(self, text):
        words = text.split()
        return ' '.join([word for word in words if word not in self.stop_words])

    def lemmatize_text(self, text):
        words = text.split()
        return ' '.join([self.lemmatizer.lemmatize(word) for word in words])

    def preprocess_text(self, text):
        text = self.clean_text(text)
        text = self.to_lowercase(text)
        text = self.remove_stopwords(text)
        text = self.lemmatize_text(text)
        return text

textPreprocessor = TextPreprocessor()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [382]:
main_train_df['text'] = main_train_df['text'].apply(textPreprocessor.preprocess_text)
print(main_train_df)

       label                                               text
0          0       got fujitsu ibm hp toshiba got lot model say
1          0                                 really hows master
2          0                           see cup coffee animation
3          0                       pain couldnt come worse time
4          0                                  also wheres piece
...      ...                                                ...
97590      0  rihanna beautiful amazing love much forever ri...
97591      0                                          waka waka
97592      0                            hate laura bennett come
97593      1  hey music fan really appreciate take time read...
97594      0                                  could hear year 3

[97595 rows x 2 columns]


In [383]:
main_test_df['text'] = main_test_df['text'].apply(textPreprocessor.preprocess_text)
print(main_test_df)

       label                                               text
0          0  also sir sent email log usc payment portal ill...
1          0                                   free nowcan call
2          0  supervisor find 4 one lor thk student havent a...
3          0  shes good wondering wont say hi shes smiling c...
4          1     sorry missed call let talk time im 07090201529
...      ...                                                ...
24400      1         really ask nicely view vids subscribe back
24401      1  httptankionlinecomfriendcd92db3f4 great game c...
24402      1                           subscribe subscribe back
24403      0                                             nice 3
24404      0                                        eminem rock

[24405 rows x 2 columns]


In [384]:
from google.colab import files

main_train_df.to_csv('spam_text_train_dataset.csv', index=False)
files.download('spam_text_train_dataset.csv')

main_test_df.to_csv('spam_text_test_dataset.csv', index=False)
files.download('spam_text_test_dataset.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>