## Data Preprocessing

### Description

This script aggregates raw datasets into a unified data frame. The data frame contains two columns: 

Column `y`: **label**, spam indicator (0 = not spam; 1 = spam)

Column `X`: **feature**, email text

### Terms

**ham**: emails marked as non-spam

**spam**: emails marked as spam

In [1]:
import pandas as pd
import csv
import glob
import re
import os

In [2]:
# Collect the raw email files. glob returns paths in arbitrary order,
# so sort them to make the row order of the output reproducible.
name_of_raw_hams = sorted(glob.glob('../data/raw/ham/*.txt'))
name_of_raw_spams = sorted(glob.glob('../data/raw/spam/*.txt'))
# Holders for the per-file data frames of each class
list_of_csv_ham = []
list_of_csv_spam = []

In [3]:
def _read_email_file(path):
    """Read one raw email file into a data frame.

    'delimiter' is a multi-character separator, which pandas interprets as
    a regular expression; presumably it was chosen because it is not
    expected to occur in the emails, so the text is not split into several
    columns.  Regex separators are only supported by the python engine, so
    it is requested explicitly instead of relying on the automatic fallback
    (which emits a ParserWarning).
    """
    return pd.read_csv(path, sep='delimiter', header=None,
                       encoding='latin-1', engine='python')


# Load every ham / spam file. The lists are rebuilt from scratch so that
# re-running this cell does not add the same files a second time.
list_of_csv_ham = [_read_email_file(ham) for ham in name_of_raw_hams]
list_of_csv_spam = [_read_email_file(spam) for spam in name_of_raw_spams]
# Convert to data frames
df_hams = pd.concat(list_of_csv_ham).reset_index(drop=True)
df_spams = pd.concat(list_of_csv_spam).reset_index(drop=True)
# Add the label column (0 = ham, 1 = spam); the scalar is broadcast to every row
df_hams['y'] = 0
df_spams['y'] = 1

  return func(*args, **kwargs)


In [4]:
# Rename the columns of both frames: column 0 becomes 'X', 'y' maps to itself
col_name = {0: 'X', 'y': 'y'}
df_hams, df_spams = (
    frame.rename(columns=col_name) for frame in (df_hams, df_spams)
)

In [5]:
# Concatenate to a full dataset (ham rows first, then spam rows).
# ignore_index=True renumbers the rows 0..n-1 so that rows coming from the
# two input frames do not end up sharing index labels.
df_emails = pd.concat([df_hams, df_spams], axis=0, ignore_index=True)

In [6]:
# Write each data frame to its CSV file, leaving out the index column
csv_targets = (
    ('../data/ham_emails.csv', df_hams),
    ('../data/spam_emails.csv', df_spams),
    ('../data/emails.csv', df_emails),
)
for csv_path, frame in csv_targets:
    frame.to_csv(csv_path, index=False)