In [34]:
import os
import time
import argparse
import numpy as np
from tqdm import tqdm
import sklearn
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

from utils import *
from models.CNN_model import CNN
from models.RNN_model import RNN

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def generate_dataset(root=None, enron_list=None):
    """Collect the ham and spam items of the pre-processed Enron corpus.

    For every sub-folder named in ``enron_list`` the ``ham`` and ``spam``
    directories below ``root`` are read with ``get_all_files`` (presumably it
    returns one entry per e-mail -- confirm against ``utils``).

    Args:
        root: Directory holding the Enron sub-folders. Defaults to
            ``<current working directory>/dataset/Pre-processed``.
        enron_list: Names of the sub-folders to read. Defaults to
            ``enron1`` ... ``enron6``.

    Returns:
        A ``(data, label)`` tuple of equally long lists. All ham items come
        first (label ``0``), followed by all spam items (label ``1``).
    """
    if root is None:
        root = os.path.join(os.getcwd(), 'dataset/Pre-processed')
    if enron_list is None:
        enron_list = ['enron1', 'enron2', 'enron3', 'enron4', 'enron5', 'enron6']

    ham = []
    spam = []
    for enron in enron_list:
        enron_dir = os.path.join(root, enron)
        ham += get_all_files(os.path.join(enron_dir, 'ham'))
        spam += get_all_files(os.path.join(enron_dir, 'spam'))

    # Ham first, spam second; the labels follow the same order.
    data = ham + spam
    label = [0] * len(ham) + [1] * len(spam)
    return data, label

In [3]:
# Seed every RNG in play so the split below (and any model initialisation
# that follows) is reproducible after a kernel restart. Without a fixed split,
# a checkpoint saved in an earlier session may later be scored on e-mails it
# was trained on.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# NOTE(review): the next cell rebinds the name `data` to the torchtext module,
# so re-running this cell afterwards clobbers that import.
data, label = generate_dataset()

# 60 / 20 / 20 train / valid / test: 20% is held out for testing first, then
# 25% of the remaining 80% becomes the validation set. Both splits are
# stratified so every subset keeps the same share of label 0 and label 1.
x_train_valid, x_test, y_train_valid, y_test = train_test_split(
    data, label, test_size=0.2, shuffle=True, stratify=label, random_state=SEED
)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_valid, y_train_valid, test_size=0.25, shuffle=True,
    stratify=y_train_valid, random_state=SEED
)

                                                     

In [4]:
import csv
import pandas as pd  # used below; previously only available if `from utils import *` leaked it
from torchtext.legacy import data
from torchtext.vocab import Vectors
from utils import Tokenizer, SpecialTokens
from tempfile import NamedTemporaryFile, TemporaryDirectory

tokenizer = Tokenizer()
MAX_TRAIN_SENTENCE_LENGTH = 1500
MAX_VALID_SENTENCE_LENGTH = 1500
MAX_TEST_SENTENCE_LENGTH = 1500
BATCH_SIZE = 32

# Text field: every example is lower-cased, wrapped in SOS/EOS and padded or
# truncated to a fixed length (the +2 leaves room for the SOS and EOS tokens).
TEXT = data.Field(
    sequential=True,
    tokenize=tokenizer,
    lower=True,
    fix_length=MAX_TRAIN_SENTENCE_LENGTH + 2,
    pad_token=SpecialTokens.PAD,
    unk_token=SpecialTokens.UNK,
    init_token=SpecialTokens.SOS,
    eos_token=SpecialTokens.EOS
)

# LABEL = data.Field(
#     sequential=True,
#     tokenize=lambda x: [int(x[0])],
#     use_vocab=False
# )
# Labels are already integers, so no vocabulary is built for them.
LABEL = data.LabelField(dtype=torch.long, use_vocab=False)


def _build_tabular_dataset(dataframe):
    """Turn a Content/Label dataframe into a torchtext ``TabularDataset``.

    The frame is written to a TSV file inside a throw-away directory and read
    back by torchtext. The dataframe index becomes the first TSV column and is
    ignored through the ``('ID', None)`` field. A temporary directory is used
    instead of reopening an open ``NamedTemporaryFile`` by name, which is not
    possible on every platform.
    """
    with TemporaryDirectory() as tmp_dir:
        tsv_path = os.path.join(tmp_dir, 'split.tsv')
        dataframe.to_csv(tsv_path, sep='\t')
        # The dataset is returned only after the constructor has consumed the
        # file, mirroring the original code, which also discarded the file
        # right after construction.
        return data.TabularDataset(
            path=tsv_path,
            format='tsv',
            fields=[('ID', None), ('Content', TEXT), ('Label', LABEL)],
            skip_header=True
        )


train_dataframe = pd.DataFrame({'Content': x_train, 'Label': y_train})
valid_dataframe = pd.DataFrame({'Content': x_valid, 'Label': y_valid})
test_dataframe = pd.DataFrame({'Content': x_test, 'Label': y_test})

train_data = _build_tabular_dataset(train_dataframe)
valid_data = _build_tabular_dataset(valid_dataframe)
test_data = _build_tabular_dataset(test_dataframe)

# The vocabulary is built from the training split only, so validation and
# test tokens unseen during training map to the UNK token.
vectors = Vectors(name="Glove_pretrain/glove_w2v.txt")
TEXT.build_vocab(train_data, vectors=vectors)

train_iter = data.BucketIterator(
    dataset=train_data,
    batch_size=BATCH_SIZE,
    shuffle=True,
    sort_key=lambda x: len(x.Content),
    sort_within_batch=False,
    repeat=False
)

valid_iter = data.BucketIterator(
    dataset=valid_data,
    batch_size=BATCH_SIZE,
    shuffle=True,
    sort_key=lambda x: len(x.Content),
    sort_within_batch=False,
    repeat=False
)

# The test split is iterated in its stored order, without shuffling or sorting.
test_iter = data.Iterator(
    dataset=test_data,
    batch_size=BATCH_SIZE,
    shuffle=False,
    sort=False,
    sort_within_batch=False,
    repeat=False
)

In [5]:
# Job description for the CNN classifier, split into embedding, model and
# training sections.
jobConfig = dict(
    # Alternative: randomly initialised embeddings.
    #     embedding_method='random-embedding',
    #     num_embeddings=len(TEXT.vocab.itos),
    #     embedding_dim=128,
    #     padding_idx=TEXT.vocab.stoi[SpecialTokens.PAD]
    #
    # Active: embeddings taken from the pretrained vectors attached to the
    # vocabulary; the dimension is read off the first vector.
    embedding_config=dict(
        embedding_method='pretrained-embedding',
        pretrained_vocab=TEXT.vocab,
        num_embeddings=len(TEXT.vocab.itos),
        embedding_dim=TEXT.vocab.vectors[0].shape[0],
        padding_idx=TEXT.vocab.stoi[SpecialTokens.PAD],
    ),
    # NOTE(review): generate_dataset() only emits labels 0 and 1, yet
    # num_class is 5 -- confirm before changing, saved checkpoints depend on
    # this output size.
    model_config=dict(
        kernel_size=[2, 3, 4],
        feature_dim=128,
        dropout_ratio=0.3,
        max_sentence_length=MAX_TRAIN_SENTENCE_LENGTH,
        num_class=5,
    ),
    training_config=dict(
        initialize_method='normal',
        num_epochs=10000,
        lr=0.01,
    ),
)

In [7]:
from trainer import Trainer

# Cross-entropy loss; Adam is created with its default hyper-parameters.
# NOTE(review): jobConfig['training_config']['lr'] is not handed to the
# optimizer here -- confirm whether Trainer applies it.
loss_func = torch.nn.CrossEntropyLoss()
model = CNN(jobConfig=jobConfig)
trainer = Trainer(jobConfig['training_config'])
optimizer = torch.optim.Adam(model.parameters())

# Run training on the train iterator with validation on the valid iterator.
trainer(
    model=model,
    optimizer=optimizer,
    loss_function=loss_func,
    train_iterator=train_iter,
    valid_iterator=valid_iter,
)

Loading pretraining vector...
Loading Model...
The model has 69833909 parameters.
Parameters Initializing...
Validation loss decrease (inf --> 0.040498).
Time:Jan_21_2023_22-00-29  Epoch:0
 Train_Loss:0.12519280164260535, Valid_Loss:0.04049793060017105
 Train_acc:0.9601599526066351, Valid_acc:0.9843009478672986

********************************************************************
Validation loss decrease (0.040498 --> 0.038429).
Validation loss decrease (0.038429 --> 0.035863).
Time:Jan_22_2023_13-44-13  Epoch:20
 Train_Loss:8.786106925770028e-08, Valid_Loss:0.06043856699420411
 Train_acc:1.0, Valid_acc:0.9882997630331753

********************************************************************
[31mEarly stopping[0m


In [9]:
# Job description for the RNN classifier, split into embedding, model and
# training sections.
jobConfig = dict(
    # Alternative: randomly initialised embeddings.
    #     embedding_method='random-embedding',
    #     num_embeddings=len(TEXT.vocab.itos),
    #     embedding_dim=128,
    #     padding_idx=TEXT.vocab.stoi[SpecialTokens.PAD]
    #
    # Active: embeddings taken from the pretrained vectors attached to the
    # vocabulary; the dimension is read off the first vector.
    embedding_config=dict(
        embedding_method='pretrained-embedding',
        pretrained_vocab=TEXT.vocab,
        num_embeddings=len(TEXT.vocab.itos),
        embedding_dim=TEXT.vocab.vectors[0].shape[0],
        padding_idx=TEXT.vocab.stoi[SpecialTokens.PAD],
    ),
    # NOTE(review): generate_dataset() only emits labels 0 and 1, yet
    # num_class is 5 -- confirm before changing, saved checkpoints depend on
    # this output size.
    model_config=dict(
        rnn_type='lstm',
        hidden_size=128,
        bidirectional=True,
        num_class=5,
    ),
    training_config=dict(
        initialize_method='normal',
        num_epochs=10000,
        lr=0.01,
        patience=10,
    ),
)

In [10]:
# Cross-entropy loss; Adam is created with its default hyper-parameters.
# NOTE(review): jobConfig['training_config']['lr'] is not handed to the
# optimizer here -- confirm whether Trainer applies it.
loss_func = torch.nn.CrossEntropyLoss()
model = RNN(jobConfig=jobConfig)
trainer = Trainer(jobConfig['training_config'])
optimizer = torch.optim.Adam(model.parameters())

# Run training on the train iterator with validation on the valid iterator.
trainer(
    model=model,
    optimizer=optimizer,
    loss_function=loss_func,
    train_iterator=train_iter,
    valid_iterator=valid_iter,
)

Loading pretraining vector...
Loading Model...
The model has 69927605 parameters.
Parameters Initializing...
Validation loss decrease (inf --> 0.042108).
Time:Jan_22_2023_16-33-26  Epoch:0
 Train_Loss:0.11944540325842068, Valid_Loss:0.042107587541184294
 Train_acc:0.9552231437598736, Valid_acc:0.9890402843601895

********************************************************************
[31mEarly stopping[0m


In [42]:
# `evaluate` used below comes from this star import.
from trainer import *

# The architecture must match the one the checkpoint was trained with,
# otherwise load_state_dict() rejects the saved tensors.
jobConfig = {
    'embedding_config':{
        # # random-embedding-parameters
        # 'embedding_method': 'random-embedding', 
        # 'num_embeddings': len(TEXT.vocab.itos), 
        # 'embedding_dim': 128, 
        # 'padding_idx': TEXT.vocab.stoi[SpecialTokens.PAD]

        # glove-embedding-parameters:
        'embedding_method': 'pretrained-embedding', 
        'pretrained_vocab': TEXT.vocab, 
        'num_embeddings': len(TEXT.vocab.itos), 
        'embedding_dim': TEXT.vocab.vectors[0].shape[0], 
        'padding_idx': TEXT.vocab.stoi[SpecialTokens.PAD]
    }, 

    'model_config':{
        'kernel_size': [2, 3, 4], 
        'feature_dim': 128, 
        'dropout_ratio': 0.3, 
        'max_sentence_length': MAX_TRAIN_SENTENCE_LENGTH, 
        'num_class': 5
    }, 

    'training_config':{
        'initialize_method': 'normal', 
        'num_epochs': 10000, 
        'lr': 0.01, 
    }
}

# Defined here as well so the checkpoint can be scored without first running
# the (very long) training cell that originally created `loss_func`.
loss_func = nn.CrossEntropyLoss()

predict_cnn_model = CNN(jobConfig=jobConfig)
# map_location keeps the load working on a CPU-only machine even if the
# checkpoint was written from a GPU; evaluation below runs on the CPU anyway.
predict_cnn_model.load_state_dict(
    torch.load('log/Jan_21_2023_21-23-29/model.pth', map_location='cpu')
)
# Switch off dropout so the reported metrics are deterministic.
predict_cnn_model.eval()
# Backward-compatible alias for the original, misspelled variable name.
prdeict_cnn_model = predict_cnn_model

cnn_train_res = evaluate(predict_cnn_model, train_iter, loss_func, 'cpu')
cnn_valid_res = evaluate(predict_cnn_model, valid_iter, loss_func, 'cpu')
cnn_test_res = evaluate(predict_cnn_model, test_iter, loss_func, 'cpu')
print("Train: {} \nValid: {} \nTest: {}".format(cnn_train_res, cnn_valid_res, cnn_test_res))

Loading pretraining vector...
Train: (8.017990406818827e-05, 1.0) 
Valid: (0.03875423586696644, 0.9882997630331753) 
Test: (0.03301529467220595, 0.9881516587677726)


In [44]:
# Imported explicitly so this cell does not depend on an earlier star import.
from trainer import evaluate

# The architecture must match the one the checkpoint was trained with,
# otherwise load_state_dict() rejects the saved tensors.
jobConfig = {
    'embedding_config':{
        # # random-embedding-parameters
        # 'embedding_method': 'random-embedding', 
        # 'num_embeddings': len(TEXT.vocab.itos), 
        # 'embedding_dim': 128, 
        # 'padding_idx': TEXT.vocab.stoi[SpecialTokens.PAD]

        # glove-embedding-parameters:
        'embedding_method': 'pretrained-embedding', 
        'pretrained_vocab': TEXT.vocab, 
        'num_embeddings': len(TEXT.vocab.itos), 
        'embedding_dim': TEXT.vocab.vectors[0].shape[0], 
        'padding_idx': TEXT.vocab.stoi[SpecialTokens.PAD]
    }, 

    'model_config':{
        'rnn_type': 'lstm', 
        'hidden_size': 128, 
        'bidirectional': True, 
        'num_class': 5
    }, 

    'training_config':{
        'initialize_method': 'normal', 
        'num_epochs': 10000, 
        'lr': 0.01, 
    }
}

# Defined here as well so the checkpoint can be scored without first running
# the training cell that originally created `loss_func`.
loss_func = nn.CrossEntropyLoss()

predict_rnn_model = RNN(jobConfig=jobConfig)
# map_location keeps the load working on a CPU-only machine even if the
# checkpoint was written from a GPU; evaluation below runs on the CPU anyway.
predict_rnn_model.load_state_dict(
    torch.load('log/Jan_22_2023_15-39-08/model.pth', map_location='cpu')
)
# Make sure the network is in inference mode before scoring it.
predict_rnn_model.eval()

rnn_train_res = evaluate(predict_rnn_model, train_iter, loss_func, 'cpu')
rnn_valid_res = evaluate(predict_rnn_model, valid_iter, loss_func, 'cpu')
rnn_test_res = evaluate(predict_rnn_model, test_iter, loss_func, 'cpu')
# Same layout as the CNN report ("Test: " with a space).
print("Train: {} \nValid: {} \nTest: {}".format(rnn_train_res, rnn_valid_res, rnn_test_res))

Loading pretraining vector...
Train: (0.010403522701329283, 0.9976303317535545) 	Valid: (0.04213461858966339, 0.9890402843601895) 	Test:(0.04103185606114037, 0.986818720379147)
