# Spam Classifier: Preparing data

In [1]:
import os 

In [2]:
# Resolve the project root: two directory levels above this file when it runs
# as a script. Inside a notebook kernel `__file__` is typically undefined and
# evaluating it raises NameError; only that case falls back to the parent of
# the current working directory, so unrelated errors are no longer hidden.
try:
    ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
except NameError:
    ROOT_DIR = os.path.abspath('..')

In [3]:
data_path = os.path.join(ROOT_DIR, 'data')

# Join every entry of data_path onto data_path itself, then keep only the
# entries that are directories (full paths, in os.listdir order).
data_folder_names = list(
    filter(
        os.path.isdir,
        map(lambda entry: os.path.join(data_path, entry), os.listdir(data_path)),
    )
)
data_folder_names

['d:\\repos\\spam-classifier\\data\\easy_ham',
 'd:\\repos\\spam-classifier\\data\\easy_ham_2',
 'd:\\repos\\spam-classifier\\data\\hard_ham',
 'd:\\repos\\spam-classifier\\data\\spam',
 'd:\\repos\\spam-classifier\\data\\spam_2']

In [4]:
import email
import email.policy
from typing import Iterator

In [5]:
def get_files(folder_path: str) -> Iterator[str]:
    """Yield the names of the regular files directly inside ``folder_path``.

    Args:
        folder_path: Directory to list.

    Returns:
        A generator of entry names (relative to ``folder_path``, not full
        paths) in sorted order, so repeated runs see the same sequence.
        Sub-directories are skipped because the loaders in this notebook open
        every yielded name as a file.

    Raises:
        OSError: Whatever ``os.listdir`` raises when the folder cannot be
            listed (e.g. ``FileNotFoundError``). The listing happens when the
            function is called, not when the generator is first consumed.
    """
    return (
        name
        for name in sorted(os.listdir(folder_path))
        if os.path.isfile(os.path.join(folder_path, name))
    )

In [6]:
def retrieve_email(file_path: str) -> 'email.message.EmailMessage':
    """Parse the file at ``file_path`` into an email message object.

    Args:
        file_path: Path of a file holding one raw email message.

    Returns:
        The parsed message. With ``email.policy.default`` the parser builds
        ``email.message.EmailMessage`` objects (not ``bytes``).

    Raises:
        OSError: If the file cannot be opened for reading.
    """
    with open(file_path, 'rb') as file:
        # message_from_binary_file wraps BytesParser(...).parse(file) and
        # imports the parser itself, so this does not rely on the
        # ``email.parser`` submodule having been loaded by some other import.
        return email.message_from_binary_file(file, policy=email.policy.default)

In [10]:
from concurrent import futures

In [19]:
def load_one_data(data_folder_path: str) -> list:
    """Parse every entry of one data folder and return the messages as a list.

    Entry names come from ``get_files``; each name is joined onto
    ``data_folder_path`` and parsed with ``retrieve_email``. The returned list
    follows the order in which ``get_files`` yields the names.
    """
    parsed_messages = []
    for entry_name in get_files(data_folder_path):
        full_path = os.path.join(data_folder_path, entry_name)
        parsed_messages.append(retrieve_email(full_path))
    return parsed_messages

In [7]:
def load_data(data_folder_path: list) -> dict:
    """Load every folder in ``data_folder_path`` one after another.

    Args:
        data_folder_path: Iterable of folder paths to load.

    Returns:
        A dict keyed by each folder's base name; the value is the list of
        messages parsed (via ``retrieve_email``) from the entries that
        ``get_files`` yields for that folder. If two folders share a base
        name, the later one replaces the earlier one.
    """
    return {
        os.path.basename(folder): [
            retrieve_email(os.path.join(folder, entry)) for entry in get_files(folder)
        ]
        for folder in data_folder_path
    }

In [15]:
def load_data_parallel(data_folder_path: list) -> dict:
    """Load several data folders concurrently, one worker thread per folder.

    Args:
        data_folder_path: List of folder paths to load.

    Returns:
        A dict keyed by each folder's base name; the value is the list that
        ``load_one_data`` returns for that folder. An empty input yields an
        empty dict, matching ``load_data``.

    Raises:
        Any exception raised by ``load_one_data`` in a worker thread is
        re-raised here when its result is collected.
    """
    # ThreadPoolExecutor rejects max_workers <= 0 with ValueError, so an empty
    # input is answered directly instead of building a zero-thread pool.
    if not data_folder_path:
        return {}

    with futures.ThreadPoolExecutor(max_workers=len(data_folder_path)) as executor:
        # executor.map yields results in the order of its inputs, which is what
        # lets them be zipped back onto the folder paths below.
        results = list(executor.map(load_one_data, data_folder_path))

    return {
        os.path.basename(folder_path): mails
        for folder_path, mails in zip(data_folder_path, results)
    }


In [20]:
# Load every data folder through the thread-pool loader; `emails` maps each
# folder's base name to the list of messages parsed from that folder.
emails = load_data_parallel(data_folder_names)
emails

{'easy_ham': [<email.message.EmailMessage at 0x24165426fa0>,
  <email.message.EmailMessage at 0x24165426400>,
  <email.message.EmailMessage at 0x24165422d00>,
  <email.message.EmailMessage at 0x241654228b0>,
  <email.message.EmailMessage at 0x241654263d0>,
  <email.message.EmailMessage at 0x241654269a0>,
  <email.message.EmailMessage at 0x24165426d60>,
  <email.message.EmailMessage at 0x241654225b0>,
  <email.message.EmailMessage at 0x24165426340>,
  <email.message.EmailMessage at 0x24165426b50>,
  <email.message.EmailMessage at 0x24165426640>,
  <email.message.EmailMessage at 0x24165426820>,
  <email.message.EmailMessage at 0x2416542a850>,
  <email.message.EmailMessage at 0x2416542a070>,
  <email.message.EmailMessage at 0x2416542a670>,
  <email.message.EmailMessage at 0x241654261f0>,
  <email.message.EmailMessage at 0x24165422ca0>,
  <email.message.EmailMessage at 0x2416542a9d0>,
  <email.message.EmailMessage at 0x24165422220>,
  <email.message.EmailMessage at 0x2416542a7c0>,
  <email

In [8]:
# Sequential alternative to load_data_parallel (left disabled):
# emails = load_data(data_folder_names)
# emails

{'easy_ham': [<email.message.EmailMessage at 0x24151c0e910>,
  <email.message.EmailMessage at 0x241614ff3a0>,
  <email.message.EmailMessage at 0x241614ff250>,
  <email.message.EmailMessage at 0x241614ff370>,
  <email.message.EmailMessage at 0x241614ff580>,
  <email.message.EmailMessage at 0x241614ff1c0>,
  <email.message.EmailMessage at 0x241614ffc10>,
  <email.message.EmailMessage at 0x241614ff130>,
  <email.message.EmailMessage at 0x241614ff880>,
  <email.message.EmailMessage at 0x241614ff0d0>,
  <email.message.EmailMessage at 0x241614ff820>,
  <email.message.EmailMessage at 0x241614ffcd0>,
  <email.message.EmailMessage at 0x241614ff790>,
  <email.message.EmailMessage at 0x241614ff460>,
  <email.message.EmailMessage at 0x241614ff2e0>,
  <email.message.EmailMessage at 0x241614ff910>,
  <email.message.EmailMessage at 0x241614fffa0>,
  <email.message.EmailMessage at 0x241614ff610>,
  <email.message.EmailMessage at 0x241614ff7c0>,
  <email.message.EmailMessage at 0x241614ff9d0>,
  <email

In [21]:
# Print the whitespace-stripped content of the second message loaded from 'easy_ham'.
print(emails['easy_ham'][1].get_content().strip())

Martin A posted:
Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the
 limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the
 Mount Athos monastic community, was ideal for the patriotic sculpture. 
 
 As well as Alexander's granite features, 240 ft high and 170 ft wide, a
 museum, a restored amphitheatre and car park for admiring crowds are
planned
---------------------
So is this mountain limestone or granite?
If it's limestone, it'll weather pretty fast.

------------------------ Yahoo! Groups Sponsor ---------------------~-->
4 DVDs Free +s&p Join Now
http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM
---------------------------------------------------------------------~->

To unsubscribe from this group, send an email to:
forteana-unsubscribe@egroups.com

 

Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/
