# Header Import

In [None]:
from nltk.corpus import names
from nltk.stem import WordNetLemmatizer
import glob
import os
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Parallel containers filled by the data-load cell: one raw message text per
# entry in `emails`, and the matching class label in `labels`.
emails = []
labels = []

# Data Load

In [3]:
# Read every Enron-1 message into `emails` and record its class in `labels`
# (1 = spam, 0 = ham). Both folders go through the same loop so the reading
# logic cannot drift between the two classes.
for file_path, label in (('enron1/spam/', 1), ('enron1/ham/', 0)):
    # glob returns paths in arbitrary, filesystem-dependent order; sorting
    # makes the sample order (and therefore the later fold assignment)
    # reproducible across machines and runs.
    for filename in sorted(glob.glob(os.path.join(file_path, '*.txt'))):
        # ISO-8859-1 maps every byte value to a character, so decoding
        # cannot fail on stray non-ASCII bytes in the corpus.
        with open(filename, 'r', encoding="ISO-8859-1") as infile:
            emails.append(infile.read())
        labels.append(label)

# Name Data Load

In [4]:
# Lemmatizer instance reused for every token by clean_text.
lemmatizer = WordNetLemmatizer()
# Materialise the NLTK names corpus as a set once, so the per-token
# membership test in clean_text is a hash lookup.
all_names = {*names.words()}

# Data Clean Code Define

In [5]:
def letters_only(astr):
    """Return True when every character of ``astr`` is alphabetic.

    An empty string contains no offending character, so it also yields True.
    """
    return all(ch.isalpha() for ch in astr)

def clean_text(docs):
    """Normalise each document into a space-joined string of lemmas.

    Every document is split on whitespace. A token is kept only when it
    consists solely of letters and is not found in ``all_names`` (the lookup
    uses the token's original casing). Kept tokens are lower-cased and then
    lemmatized.

    Returns a new list holding one cleaned string per input document, in the
    same order as ``docs``.
    """
    cleaned_docs = []
    for doc in docs:
        kept = []
        for token in doc.split():
            # Drop anything containing digits/punctuation, and known names.
            if not letters_only(token) or token in all_names:
                continue
            kept.append(lemmatizer.lemmatize(token.lower()))
        cleaned_docs.append(' '.join(kept))
    return cleaned_docs

# Data Cleaning

In [6]:
# Run the cleaning step over every loaded message; the result keeps the same
# order as `emails`, so it stays aligned with `labels`.
cleaned_emails = clean_text(docs=emails)

# K-Fold

In [7]:
from sklearn.model_selection import StratifiedKFold

# Number of cross-validation folds; also used later to average the AUC sums.
k = 10
# Stratified splitting preserves the spam/ham proportion in every fold.
k_fold = StratifiedKFold(n_splits=k)
# Convert to numpy arrays so each fold can be selected with an index array.
# dtype=object stores references to the existing Python strings; the default
# fixed-width unicode dtype would pad every email to the length of the
# longest one, which wastes a large amount of memory on this corpus.
cleaned_emails_np = np.array(cleaned_emails, dtype=object)
labels_np = np.array(labels)

# Smoothing Define

In [8]:
from collections import defaultdict

# Candidate values for the smoothing parameter passed as `alpha` to the
# naive Bayes classifier: 1.0 through 5.0 in steps of 1.
smoothing_factor_option = [float(alpha) for alpha in range(1, 6)]
# Running sum of per-fold AUC keyed by smoothing value; unseen keys start at 0.0.
auc_record = defaultdict(float)

# Apply K-Fold and Smoothing

## TfidfVectorizer
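- sublinear_tf : 단어 빈도(tf)를 1 + log(tf) 로 변환하여 고빈도 단어의 영향을 완화 (Smoothing)
- max_df : 문서 빈도(document frequency)의 상한선. 0.5 이면 전체 문서의 50% 를 초과하여 등장하는 단어는 어휘에서 제외
- stop_words : 불용어 사전 ('english' 는 내장 영어 불용어 목록을 사용)
- max_features : 말뭉치 전체에서 단어 빈도가 높은 순으로 해당 값 만큼의 단어만 어휘로 선택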

In [9]:
# For every stratified fold: fit TF-IDF on the training part only, then score
# one naive Bayes model per smoothing value and add its AUC to the running sum.
for train_indices, test_indices in k_fold.split(cleaned_emails, labels):
    X_train = cleaned_emails_np[train_indices]
    X_test = cleaned_emails_np[test_indices]
    Y_train = labels_np[train_indices]
    Y_test = labels_np[test_indices]

    tfidf_vectorizer = TfidfVectorizer(
        sublinear_tf=True,
        max_df=0.5,
        stop_words='english',
        max_features=8000,
    )
    # The vocabulary and idf weights come from the training fold; the test
    # fold is only transformed with them.
    term_docs_train = tfidf_vectorizer.fit_transform(X_train)
    term_docs_test = tfidf_vectorizer.transform(X_test)

    # The vectorised features do not depend on alpha, so they are reused for
    # every smoothing value within the fold.
    for smoothing_factor in smoothing_factor_option:
        clf = MultinomialNB(alpha=smoothing_factor, fit_prior=True).fit(
            term_docs_train, Y_train
        )
        prediction_prob = clf.predict_proba(term_docs_test)
        # Column 1 is the probability of the positive class (label 1 = spam).
        pos_prob = prediction_prob[:, 1]
        auc = roc_auc_score(Y_test, pos_prob)
        auc_record[smoothing_factor] += auc

# Metric

In [10]:
# Raw per-smoothing AUC sums accumulated over all folds.
print(auc_record)

# Summary table: one row per smoothing value with the AUC averaged over k folds.
print('max features  smoothing  fit prior  auc')
row_template = '       8000      {0}      true    {1:.4f}'
for smoothing, smoothing_record in auc_record.items():
    mean_auc = smoothing_record / k
    print(row_template.format(smoothing, mean_auc))

defaultdict(<class 'float'>, {1.0: 9.936995073648463, 2.0: 9.945108942463373, 3.0: 9.950318435809343, 4.0: 9.953585278205583, 5.0: 9.956125705880032})
max features  smoothing  fit prior  auc
       8000      1.0      true    0.9937
       8000      2.0      true    0.9945
       8000      3.0      true    0.9950
       8000      4.0      true    0.9954
       8000      5.0      true    0.9956


# Metric Result
- TF-IDF 를 적용하지 않았을 때의 ROC-AUC 0.9856 보다 높은 0.9956 (smoothing 5.0 기준) 이 나왔다.