In [62]:
# import packages
import pandas as pd # manipulating the data purpose
import csv
import os
import numpy as np
from sklearn.model_selection import train_test_split # Split test/train data randomly
from sklearn.feature_extraction.text import CountVectorizer # Convert a collection of text documents to a matrix of token counts
from sklearn.naive_bayes import MultinomialNB # add classifier for classification of words

In [63]:
def convert_src_to_csv(input_directory, output_csv, spam_classification_number):
    # List to hold all rows of data
    data_rows = []

    # Iterate over all files in the input directory
    for filename in os.listdir(input_directory):
        file_path = os.path.join(input_directory, filename)
        
        # Only process if it's a file
        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                file_content = file.read().strip()
                # Replace newlines with spaces to have one constant text
                file_content_single_line = file_content.replace('\n', ' ')
                file_content_no_sign = file_content_single_line.replace(',', ' ')
                # Append file content as a row in data_rows
                data_rows.append([file_content_no_sign, spam_classification_number])

    # Write all data to the CSV file
    with open(output_csv, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['text', 'target'])  # Write header
        writer.writerows(data_rows)

In [64]:
# Define input directory containing the files and output CSV file path
input_directory_non_spam = './srcdata/easy_ham'
input_directory_spam = './srcdata/spam'
output_csv__spam = 'output_emails_non_spam.csv'
output_csv_spam = 'output_emails_spam.csv'

# Call the function to process files and create CSV for NON SPAM data (HAM)
convert_src_to_csv(input_directory_non_spam, output_csv_non_spam, 0)

# Call the function to process files and create CSV for SPAM data (HAM)
convert_src_to_csv(input_directory_spam , output_csv_spam, 1)

In [65]:
# import data
file_spam = pd.read_csv("output_emails_spam.csv")
file_ham = pd.read_csv("output_emails_non_spam.csv")
spam_df = pd.concat([file_spam, file_ham], ignore_index=True) # merging spam and ham

In [66]:
# inspect data
spam_df.head()

Unnamed: 0,text,target
0,mv 1 00001.bfc8d64d12b325ff385cca8d07b84288 mv...,1
1,From 12a1mailbot1@web.de Thu Aug 22 13:17:22 ...,1
2,From ilug-admin@linux.ie Thu Aug 22 13:27:39 ...,1
3,From sabrina@mx3.1premio.com Thu Aug 22 14:44...,1
4,From wsup@playful.com Thu Aug 22 16:17:00 200...,1


In [67]:
# inspect data
spam_df.shape

(3052, 2)

In [68]:
# goup by category. 0 - no spam, 1 - spam
spam_df.groupby('target').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,2551,2550,From razor-users-admin@lists.sourceforge.net ...,2
1,501,497,Received: from b.smtp-out.sonic.net (b.smtp-ou...,2


In [69]:
# create train/test split w proporcji 25%
x_train, x_test, y_train, y_test = train_test_split(spam_df.text, spam_df.target, test_size = 0.25)

In [70]:
# check training data
x_train.head()

2669    From rssfeeds@jmason.org  Mon Sep 30 13:43:45 ...
1074    From fork-admin@xent.com  Mon Sep  9 19:27:30 ...
1063    From fork-admin@xent.com  Mon Sep  9 10:46:24 ...
1852    From rpm-list-admin@freshrpms.net  Mon Oct  7 ...
2200    From razor-users-admin@lists.sourceforge.net  ...
Name: text, dtype: object

In [71]:
# check training data
x_train.describe()

count                                                  2289
unique                                                 2286
top       Received: from b.smtp-out.sonic.net (b.smtp-ou...
freq                                                      2
Name: text, dtype: object

In [72]:
# find key words and store data as matrix of words
cv = CountVectorizer()
x_train_count = cv.fit_transform(x_train.values) # all text transforming to matrix of words

In [73]:
# see matrix shape
x_train_count

<2289x65036 sparse matrix of type '<class 'numpy.int64'>'
	with 576786 stored elements in Compressed Sparse Row format>

In [74]:
x_train_count.toarray()

array([[1, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 2, ..., 0, 0, 0],
       [3, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [75]:
# train model basis on word matrix and given information spam - 1 , not spam - 0
model = MultinomialNB()
model.fit(x_train_count, y_train)

In [76]:
# test trained model on my custom NON SPAM email message
email_ham = ["hey lets have a beer together today. Are you in?"]
email_ham_count = cv.transform(email_ham)
model.predict(email_ham_count)

array([0], dtype=int64)

In [77]:
# test trained model on my custom SPAM email message
email_spam = ["reward money click now you will be reach"]
email_spam_count = cv.transform(email_spam)
model.predict(email_spam_count)

array([1], dtype=int64)

In [78]:
# test model accuracy
x_test_count = cv.transform(x_test)
model.score(x_test_count, y_test)

0.9803407601572739