# Spam Classifier

In [27]:
import numpy
import pandas 
import matplotlib.pyplot as plt 
import seaborn as sns
import os
import requests
from urllib.request import urlopen, urlretrieve
from bs4 import BeautifulSoup

## Download the data

### Get filenames

In [28]:
# Index page of the SpamAssassin public corpus; it is scraped below for file names.
url = "https://spamassassin.apache.org/old/publiccorpus/"

In [29]:
def get_file_names(url:str) -> iter:
    """Return the file names linked from an HTML index page.

    The page at ``url`` is fetched and parsed, and the first child of every
    ``<a>`` element is inspected. Only entries containing a ``'.'`` are kept,
    which drops links that do not look like file names.

    Args:
        url: Address of the HTML page to scrape.

    Returns:
        A generator over the kept link texts, in document order.
    """
    # Read the whole page inside a context manager so the HTTP response is
    # closed deterministically instead of being left to garbage collection.
    with urlopen(url) as response:
        html = response.read()

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning about the guess.
    soup_wrap = BeautifulSoup(html, 'html.parser')

    return (
        link.contents[0]
        for link in soup_wrap.find_all('a')
        # An anchor without children has nothing to index; skip it rather
        # than raising IndexError.
        if link.contents and '.' in link.contents[0]
    )


In [30]:
# Materialize the generator so the names can be reused by later cells.
file_names = list(get_file_names(url))
file_names

['20021010_easy_ham.tar.bz2',
 '20021010_hard_ham.tar.bz2',
 '20021010_spam.tar.bz2',
 '20030228_easy_ham.tar.bz2',
 '20030228_easy_ham_2.tar.bz2',
 '20030228_hard_ham.tar.bz2',
 '20030228_spam.tar.bz2',
 '20030228_spam_2.tar.bz2',
 '20050311_spam_2.tar.bz2',
 'readme.html']

In [31]:
import collections
# Substrings looked for in each file name to assign it to a group.
types = ['easy', 'hard', 'spam']


In [34]:
def get_order_files_per_type(types:list, file_names: list) -> tuple :
    """Group file names by the first type whose text occurs in the name.

    Args:
        types: Candidate type substrings, tried in the given order.
        file_names: Names to distribute.

    Returns:
        A pair ``(unmatched, grouped)``: the list of names containing none of
        the types, and a ``defaultdict(list)`` mapping each matched type to
        its names. Input order is preserved in both.
    """
    grouped = collections.defaultdict(list)
    unmatched = []

    for name in file_names:
        # The first type found inside the name decides the group.
        first_match = next((kind for kind in types if kind in name), None)
        if first_match is None:
            unmatched.append(name)
        else:
            grouped[first_match].append(name)

    return unmatched, grouped


In [36]:
# Split the scraped names into per-type groups and the unmatched remainder.
other_files, files = get_order_files_per_type(types, file_names)
other_files, files

(['readme.html'],
 defaultdict(list,
             {'easy': ['20021010_easy_ham.tar.bz2',
               '20030228_easy_ham.tar.bz2',
               '20030228_easy_ham_2.tar.bz2'],
              'hard': ['20021010_hard_ham.tar.bz2',
               '20030228_hard_ham.tar.bz2'],
              'spam': ['20021010_spam.tar.bz2',
               '20030228_spam.tar.bz2',
               '20030228_spam_2.tar.bz2',
               '20050311_spam_2.tar.bz2']}))

### Get the most recent file names

In [37]:
def get_updated_file_names(types: list, files:dict) -> dict :
    """Select the most recent file of each type.

    File names are split on ``'_'``; the first part is read as an integer
    date (e.g. ``20030228``). Names whose last part contains ``'2.'``
    (e.g. ``20030228_spam_2.tar.bz2``) are tracked separately under the key
    ``<type>_2``; all others are tracked under ``<type>``.

    Args:
        types: Type keys to look up in ``files``.
        files: Mapping from a type key to the list of its file names. Types
            missing from the mapping are treated as having no files.

    Returns:
        A ``defaultdict(list)`` mapping ``<type>`` and ``<type>_2`` to a
        single file name each: the one with the greatest leading date,
        independent of input order. On equal dates the first name seen wins.

    Raises:
        ValueError: If two names compete for the same key and the leading
            part of either is not an integer.
    """
    files_updated = collections.defaultdict(list)

    for allowed_type in types:
        # .get() tolerates a type with no files and avoids inserting an empty
        # entry into a defaultdict owned by the caller.
        for file in files.get(allowed_type, ()):
            file_part = file.split('_')
            is_second_set = '2.' in file_part[-1]
            key = (allowed_type + '_2') if is_second_set else allowed_type

            # .get() rather than indexing: indexing the defaultdict would
            # create a placeholder list for the key.
            current = files_updated.get(key)

            # Both variants are compared by date so the outcome does not
            # depend on the order in which the names were listed.
            if current is None or int(file_part[0]) > int(current.split('_')[0]):
                files_updated[key] = file

    return files_updated


# Mapping of group key to the single file chosen for download.
files_updated = get_updated_file_names(types, files)

### Create data location

In [38]:
import os
import pathlib

In [47]:
# Absolute location of the download directory, resolved relative to the
# current working directory.
data_path = os.path.abspath('../data')

# exist_ok=True makes the creation idempotent and removes the window between
# a separate existence check and the actual makedirs call.
os.makedirs(data_path, exist_ok=True)

### Download the selected files

In [52]:
import tarfile

In [57]:
def download_file(url_file, dest_path):
    """Download a tar archive and extract it into ``dest_path``.

    The archive is saved in ``dest_path`` under the last path component of
    ``url_file`` and then unpacked into the same directory.

    Args:
        url_file: Full URL of the archive to fetch.
        dest_path: Existing directory that receives the archive and its
            extracted contents.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        tarfile.ReadError: If the downloaded file is not a readable archive.
    """
    r = requests.get(url_file)
    # Stop on an HTTP error instead of saving the error page to disk and
    # failing later with an unrelated-looking tarfile error.
    r.raise_for_status()

    file_name = os.path.basename(url_file)
    dest_pathfile = os.path.join(dest_path, file_name)
    with open(dest_pathfile, 'wb') as f:
        f.write(r.content)

    # NOTE(review): extractall() trusts the member paths stored in the
    # archive; only use this with archives from a trusted source.
    # The context manager closes the archive handle after extraction.
    with tarfile.open(dest_pathfile) as archive:
        archive.extractall(dest_path)


In [58]:
# Sequential
for file in files_updated.values():
    # Build the URL with plain string handling: os.path.join follows the
    # local filesystem's separator rules, which are not URL rules. Stripping
    # and re-adding the slash works whether or not `url` ends with one.
    url_file = url.rstrip('/') + '/' + file
    download_file(url_file, data_path)
