In [None]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_regression
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LogisticRegression


In [None]:
# Load every corpus and reduce it to the two columns used downstream:
# `message` (raw text) and `label`.

# TREC 2007: only the metadata columns are dropped here.
# NOTE(review): assumes the CSV already has `message`/`label` columns with
# numeric labels — confirm against the file.
trec = pd.read_csv('TREC2007.csv').drop(columns=['subject', 'email_to', 'email_from']).dropna()

# spam_ham_dataset: the textual `label` column is discarded and the numeric
# `label_num` column takes its place.
spam_ham = pd.read_csv('spam_ham_dataset.csv').drop(columns=['label', 'Unnamed: 0']).dropna()
spam_ham = spam_ham.rename(columns={"text": "message", "label_num": "label"})

# spam.csv: the three unnamed columns are dropped, v1/v2 are renamed, and the
# textual labels are encoded as spam -> 1, ham -> 0.
spam = pd.read_csv('spam.csv', encoding='latin-1').drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']).dropna()
spam = spam.rename(columns={"v1": "label", "v2": "message"})
spam['label'] = spam['label'].map({'spam': 1, 'ham': 0})

# SMSSpamCollection: tab-separated, no header row.
sms_spam = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['label', 'message']).dropna()
sms_spam['label'] = sms_spam['label'].map({'spam': 1, 'ham': 0})

# enron.csv is the file written by the Enron-folder cell of this notebook.
enron = pd.read_csv('enron.csv').dropna()
enron['label'] = enron['label'].map({'spam': 1, 'ham': 0})

# NOTE(review): this reads TREC2007.csv a second time, so `gpt` is an exact
# copy of `trec` and `gptdata` contains every TREC row twice. Presumably a
# different (ChatGPT-generated?) file was intended — confirm the file name.
gpt = pd.read_csv('TREC2007.csv').drop(columns=['subject', 'email_to', 'email_from']).dropna()


# Baseline corpus, and the same corpus extended with the extra `gpt` rows.
data = pd.concat([trec, spam_ham, spam, sms_spam, enron], ignore_index=True)
# ignore_index=True keeps the row index unique, consistent with `data` above;
# without it the appended rows would reuse index labels that `data` already has.
gptdata = pd.concat([data, gpt], ignore_index=True)
label = data['label']
# gptdata.head()


Unnamed: 0,label,message
0,1,Content-Type: text/html;\nContent-Transfer-Enc...
1,0,"Hi, i've just updated from the gulus and I che..."
2,1,"Content-Type: text/plain;\n\tcharset=""iso-8859..."
3,1,"Hey Billy, \n\nit was really fun going out the..."
4,1,Content-Type: multipart/alternative;\n ...


In [None]:
# Baseline experiment on `data`: naive Bayes and logistic regression, each on
# raw term counts (TF) and on TF-IDF weights.
#
# Naming note: `TF_vectorizer` / `TFIDF_vectorizer` hold the document-term
# matrices returned by fit_transform, not the vectorizer objects themselves.
#
# NOTE(review): both vectorizers are fitted on the full corpus before
# cross-validation, so the vocabulary (and the IDF weights) are learned from
# rows that later act as held-out folds. Wrapping vectorizer + classifier in a
# Pipeline and cross-validating on data['message'] would remove that leakage;
# it is left as is here so the numbers stay comparable with the augmented run.
count_vectorizer = CountVectorizer()
TF_vectorizer = count_vectorizer.fit_transform(data['message'])

tfidf_vectorizer = TfidfVectorizer()
TFIDF_vectorizer = tfidf_vectorizer.fit_transform(data['message'])

# Models fitted on the whole corpus. cross_val_score below works on clones of
# these estimators, so these fits do not influence the CV scores.
NB_TF_class = MultinomialNB()
NB_TF_class.fit(TF_vectorizer, label)

NB_TFIDF_class = MultinomialNB()
NB_TFIDF_class.fit(TFIDF_vectorizer, label)

# The 'sag' solver shuffles the data, so without a fixed random_state the
# scores change from run to run even though the CV splitter is seeded.
reg_TF_classifier = LogisticRegression(solver='sag', random_state=40)
reg_TF_classifier.fit(TF_vectorizer, label)

reg_TFIDF_classifier = LogisticRegression(solver='sag', random_state=40)
reg_TFIDF_classifier.fit(TFIDF_vectorizer, label)

# 5 folds x 3 repeats = 15 balanced-accuracy scores per model. `rskf` is also
# reused by the augmented-data cell.
rskf = RepeatedStratifiedKFold(random_state=40, n_repeats=3, n_splits=5)
NB_TF_scores = cross_val_score(NB_TF_class, TF_vectorizer, label, cv=rskf, scoring='balanced_accuracy')
NB_TFIDF_scores = cross_val_score(NB_TFIDF_class, TFIDF_vectorizer, label, cv=rskf, scoring='balanced_accuracy')
reg_TF_scores = cross_val_score(reg_TF_classifier, TF_vectorizer, label, cv=rskf, scoring='balanced_accuracy')
reg_TFIDF_scores = cross_val_score(reg_TFIDF_classifier, TFIDF_vectorizer, label, cv=rskf, scoring='balanced_accuracy')



In [None]:
# Augmented experiment: the same four models as the baseline cell, trained and
# evaluated on `gptdata`.
#
# The labels get their own name instead of overwriting the global `label`:
# the baseline cell pairs `label` with features built from `data`, so
# clobbering it here made that cell fail (row-count mismatch) when re-run
# after this one.
gpt_label = gptdata['label']

# As in the baseline cell, the *_vectorizer names below hold the fitted
# document-term matrices.
gptcount_vectorizer = CountVectorizer()
gptTF_vectorizer = gptcount_vectorizer.fit_transform(gptdata['message'])

gpttfidf_vectorizer = TfidfVectorizer()
gptTFIDF_vectorizer = gpttfidf_vectorizer.fit_transform(gptdata['message'])

gptNB_TF_class = MultinomialNB()
gptNB_TF_class.fit(gptTF_vectorizer, gpt_label)

gptNB_TFIDF_class = MultinomialNB()
gptNB_TFIDF_class.fit(gptTFIDF_vectorizer, gpt_label)

# The 'sag' solver shuffles the data; a fixed random_state makes the scores
# reproducible between runs.
gptreg_TF_classifier = LogisticRegression(solver='sag', random_state=40)
gptreg_TF_classifier.fit(gptTF_vectorizer, gpt_label)

gptreg_TFIDF_classifier = LogisticRegression(solver='sag', random_state=40)
gptreg_TFIDF_classifier.fit(gptTFIDF_vectorizer, gpt_label)

# Reuses the `rskf` splitter created in the baseline cell (that cell must run
# first), so both experiments share the same CV configuration.
gptNB_TF_scores = cross_val_score(gptNB_TF_class, gptTF_vectorizer, gpt_label, cv=rskf, scoring='balanced_accuracy')
gptNB_TFIDF_scores = cross_val_score(gptNB_TFIDF_class, gptTFIDF_vectorizer, gpt_label, cv=rskf, scoring='balanced_accuracy')
gptreg_TF_scores = cross_val_score(gptreg_TF_classifier, gptTF_vectorizer, gpt_label, cv=rskf, scoring='balanced_accuracy')
gptreg_TFIDF_scores = cross_val_score(gptreg_TFIDF_classifier, gptTFIDF_vectorizer, gpt_label, cv=rskf, scoring='balanced_accuracy')



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One column per model, one row per CV score. Baseline and augmented ("gpt")
# variants of each model sit next to each other for easy comparison.
results = pd.DataFrame({
    'TF naive Bayes': NB_TF_scores,
    'gptTF naive Bayes': gptNB_TF_scores,
    'TF-IDF naive Bayes': NB_TFIDF_scores,
    'gptTF-IDF naive Bayes': gptNB_TFIDF_scores,
    'TF logistic regression': reg_TF_scores,
    'gptTF logistic regression': gptreg_TF_scores,
    'TF-IDF logistic regression': reg_TFIDF_scores,
    'gptTF-IDF logistic regression': gptreg_TFIDF_scores
})

# Summary table. The scores were computed with scoring='balanced_accuracy',
# so the column is labelled as such rather than as plain accuracy.
mean_scores = results.mean()
std_scores = results.std()
summary_table = pd.DataFrame({'Mean Balanced Accuracy': mean_scores, 'Standard Deviation': std_scores})
print("Summary of Results:")
print(summary_table)

# Box plot of the per-fold scores for every model.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=results, ax=ax)
ax.set_ylabel('Balanced accuracy')
ax.set_xlabel('Model')
# Eight long model names do not fit side by side; rotate them so they stay legible.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(True)
fig.tight_layout()
plt.show()



AttributeError: module 'matplotlib' has no attribute 'get_data_path'

In [None]:
from scipy import stats

# Independent two-sample t-test of baseline vs. augmented CV scores, one test
# per model, printed in the order: TF NB, TF-IDF NB, TF LR, TF-IDF LR.
score_pairs = (
    (NB_TF_scores, gptNB_TF_scores),
    (NB_TFIDF_scores, gptNB_TFIDF_scores),
    (reg_TF_scores, gptreg_TF_scores),
    (reg_TFIDF_scores, gptreg_TFIDF_scores),
)
for baseline_scores, augmented_scores in score_pairs:
    print(stats.ttest_ind(baseline_scores, augmented_scores))


TtestResult(statistic=-25.51564823761338, pvalue=6.260521152971419e-21, df=28.0)
TtestResult(statistic=0.8145790408855447, pvalue=0.42218773803594567, df=28.0)
TtestResult(statistic=-0.8068359678765069, pvalue=0.4265616299959213, df=28.0)
TtestResult(statistic=-11.957609871085092, pvalue=1.6193671032393531e-12, df=28.0)


In [None]:
import os


def _read_enron_part(part_dir):
    """Read every message file found one level below ``part_dir``.

    Each immediate sub-directory of ``part_dir`` is treated as a class: its
    name becomes the ``label`` of every regular file inside it, and the file's
    full text becomes the ``message``. Files are read as UTF-8 and undecodable
    bytes are dropped (``errors='ignore'``).

    Parameters
    ----------
    part_dir : str
        Directory that contains the per-label sub-directories.

    Returns
    -------
    list of dict
        One ``{'message': ..., 'label': ...}`` record per file.

    Raises
    ------
    FileNotFoundError
        If ``part_dir`` does not exist (raised by ``os.listdir``).
    """
    records = []
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the resulting CSV reproducible across machines.
    for folder_name in sorted(os.listdir(part_dir)):
        folder_path = os.path.join(part_dir, folder_name)
        if not os.path.isdir(folder_path):
            continue
        for file_name in sorted(os.listdir(folder_path)):
            file_path = os.path.join(folder_path, file_name)
            if not os.path.isfile(file_path):
                continue
            # Each file is read as raw text (it is not parsed as CSV).
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                records.append({'message': file.read(), 'label': folder_name})
    return records


# Collect the six corpus parts enron/enron1 ... enron/enron6 into one list and
# build the frame once; growing a DataFrame with pd.concat inside the loop
# copies all previous rows on every iteration.
enron_records = []
for part_number in range(1, 7):
    enron_records.extend(_read_enron_part(f'enron/enron{part_number}'))

# Explicit columns keep the CSV header present even if no file was found.
combined_df = pd.DataFrame(enron_records, columns=['message', 'label'])

# This is the 'enron.csv' that the data-loading cell reads.
combined_df.to_csv('enron.csv', index=False)

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'enron/enron1'

In [None]:
# Text report for three of the baseline models: mean score with a
# two-standard-deviation band, plus how many CV scores went into the mean.
# The scores were computed with scoring='balanced_accuracy'.
report_sections = (
    ("TF", NB_TF_scores),
    ("\nTFIDF", NB_TFIDF_scores),
    ("\nRegresja", reg_TFIDF_scores),
)

for heading, cv_scores in report_sections:
    print(heading)
    spread = cv_scores.std() * 2
    print("Accuracy: %0.2f (+/- %0.2f)" % (cv_scores.mean(), spread))
    print("Number of CV Scores used in Average: ", len(cv_scores))



TF
Accuracy: 0.91 (+/- 0.00)
Number of CV Scores used in Average:  15

TFIDF
Accuracy: 0.96 (+/- 0.00)
Number of CV Scores used in Average:  15

Regresja
Accuracy: 0.98 (+/- 0.00)
Number of CV Scores used in Average:  15
