# Eda Notebook - Preprocessed Enron Data

This notebook is used for eda for the preprocessed enron data from the following sources:

[Paper](https://www2.aueb.gr/users/ion/docs/ceas2006_paper.pdf)

[Data](https://www2.aueb.gr/users/ion/data/enron-spam/)

In [38]:
import pandas as pd
import numpy as np
import io

In [39]:
data = pd.read_csv('data/spambase/spambase.data')

In [40]:
data.shape

(4600, 58)

In [41]:
data.head(20).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.21,0.06,0.0,0.0,0.0,0.0,0.0,0.15,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.64,0.28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.12,0.0,0.0,0.69,0.0,0.0,0.42,0.0,0.0,0.0,0.63,0.0
0.64.1,0.5,0.71,0.0,0.0,0.0,0.0,0.0,0.46,0.77,0.0,0.25,0.34,0.0,1.42,0.42,0.0,0.0,0.55,0.0,0.0
0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.32,0.14,1.23,0.63,0.63,1.85,1.92,1.88,0.61,0.19,0.0,0.38,0.34,0.9,0.71,1.27,0.94,0.0,1.11,1.59,0.0
0.2,0.28,0.19,0.0,0.0,0.0,0.0,0.0,0.0,0.32,0.0,0.25,0.0,0.0,0.35,0.0,0.0,0.0,0.0,0.31,0.0
0.3,0.21,0.19,0.31,0.31,0.0,0.0,0.0,0.3,0.38,0.96,0.25,0.0,0.9,0.0,0.42,0.0,0.0,0.18,0.0,0.0
0.4,0.07,0.12,0.63,0.63,1.85,0.0,1.88,0.0,0.0,0.0,0.0,0.0,0.0,0.35,0.0,0.0,0.0,0.0,0.0,0.0
0.5,0.0,0.64,0.31,0.31,0.0,0.0,0.0,0.92,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.31,0.0
0.6,0.94,0.25,0.63,0.63,0.0,0.64,0.0,0.76,0.0,1.92,0.0,0.0,0.9,0.71,1.27,0.0,0.0,0.0,0.0,0.0


In [42]:
def read_names_file(filename):
  """Reads the .names file and returns a list of feature names."""
  with io.open(filename, "r") as f:
    lines = f.readlines()
  feature_names = []
  for line in lines:
    line = line.strip()
    if line.startswith("#"):
      continue
    feature_names.append(line)
  return feature_names

if __name__ == "__main__":
  names = read_names_file("data/spambase/spambase.names")
  print(names)


['| SPAM E-MAIL DATABASE ATTRIBUTES (in .names format)', '|', '| 48 continuous real [0,100] attributes of type word_freq_WORD', '| = percentage of words in the e-mail that match WORD,', '| i.e. 100 * (number of times the WORD appears in the e-mail) /', '| total number of words in e-mail.  A "word" in this case is any', '| string of alphanumeric characters bounded by non-alphanumeric', '| characters or end-of-string.', '|', '| 6 continuous real [0,100] attributes of type char_freq_CHAR', '| = percentage of characters in the e-mail that match CHAR,', '| i.e. 100 * (number of CHAR occurences) / total characters in e-mail', '|', '| 1 continuous real [1,...] attribute of type capital_run_length_average', '| = average length of uninterrupted sequences of capital letters', '|', '| 1 continuous integer [1,...] attribute of type capital_run_length_longest', '| = length of longest uninterrupted sequence of capital letters', '|', '| 1 continuous integer [1,...] attribute of type capital_run_lengt

In [43]:
def read_names_file(filename):
  """Reads the .names file and returns a NumPy array of feature names."""
  with io.open(filename, "r") as f:
    lines = f.readlines()
  feature_names = []
  for line in lines:
    line = line.strip()
    if line.startswith("#"):
      continue
    feature_names.append(line)
  return np.array(feature_names)

if __name__ == "__main__":
  names = read_names_file("data/spambase/spambase.names")
  print(names)


['| SPAM E-MAIL DATABASE ATTRIBUTES (in .names format)' '|'
 '| 48 continuous real [0,100] attributes of type word_freq_WORD'
 '| = percentage of words in the e-mail that match WORD,'
 '| i.e. 100 * (number of times the WORD appears in the e-mail) /'
 '| total number of words in e-mail.  A "word" in this case is any'
 '| string of alphanumeric characters bounded by non-alphanumeric'
 '| characters or end-of-string.' '|'
 '| 6 continuous real [0,100] attributes of type char_freq_CHAR'
 '| = percentage of characters in the e-mail that match CHAR,'
 '| i.e. 100 * (number of CHAR occurences) / total characters in e-mail'
 '|'
 '| 1 continuous real [1,...] attribute of type capital_run_length_average'
 '| = average length of uninterrupted sequences of capital letters' '|'
 '| 1 continuous integer [1,...] attribute of type capital_run_length_longest'
 '| = length of longest uninterrupted sequence of capital letters' '|'
 '| 1 continuous integer [1,...] attribute of type capital_run_length_to

In [44]:
def read_names_file(filename):
  """Reads the .names file and returns a NumPy array of feature names with an index."""
  with io.open(filename, "r") as f:
    lines = f.readlines()
  feature_names = []
  for line in lines:
    line = line.strip()
    if line.startswith("#"):
      continue
    feature_names.append(line)
  index = np.arange(len(feature_names))
  names = np.column_stack((index, feature_names))
  return names

if __name__ == "__main__":
  names = read_names_file("data/spambase/spambase.names")
  print(names)


[['0' '| SPAM E-MAIL DATABASE ATTRIBUTES (in .names format)']
 ['1' '|']
 ['2' '| 48 continuous real [0,100] attributes of type word_freq_WORD']
 ['3' '| = percentage of words in the e-mail that match WORD,']
 ['4' '| i.e. 100 * (number of times the WORD appears in the e-mail) /']
 ['5' '| total number of words in e-mail.  A "word" in this case is any']
 ['6' '| string of alphanumeric characters bounded by non-alphanumeric']
 ['7' '| characters or end-of-string.']
 ['8' '|']
 ['9' '| 6 continuous real [0,100] attributes of type char_freq_CHAR']
 ['10' '| = percentage of characters in the e-mail that match CHAR,']
 ['11'
  '| i.e. 100 * (number of CHAR occurences) / total characters in e-mail']
 ['12' '|']
 ['13'
  '| 1 continuous real [1,...] attribute of type capital_run_length_average']
 ['14' '| = average length of uninterrupted sequences of capital letters']
 ['15' '|']
 ['16'
  '| 1 continuous integer [1,...] attribute of type capital_run_length_longest']
 ['17' '| = length of lon

In [45]:
def read_names_file(filename):
  """Reads the .names file and returns a list of feature names."""
  with io.open(filename, "r") as f:
    lines = f.readlines()
  feature_names = []
  for line in lines:
    line = line.strip()
    if line.startswith("#"):
      continue
    feature_names.append(line)
  return feature_names

def relate_data_to_names(data, names):
  """Relates the .data file to the .names file by setting the column names of the DataFrame."""
  data.columns = names[32:]
  return data

if __name__ == "__main__":
  # data = read_data_file("data/spambase/spambase.data")
  names = read_names_file("data/spambase/spambase.names")
  data = relate_data_to_names(data, names)
  # print(data.head())


In [46]:
data.head().T

Unnamed: 0,0,1,2,3,4
,0.21,0.06,0.0,0.0,0.0
word_freq_make: continuous.,0.28,0.0,0.0,0.0,0.0
word_freq_address: continuous.,0.5,0.71,0.0,0.0,0.0
word_freq_all: continuous.,0.0,0.0,0.0,0.0,0.0
word_freq_3d: continuous.,0.14,1.23,0.63,0.63,1.85
word_freq_our: continuous.,0.28,0.19,0.0,0.0,0.0
word_freq_over: continuous.,0.21,0.19,0.31,0.31,0.0
word_freq_remove: continuous.,0.07,0.12,0.63,0.63,1.85
word_freq_internet: continuous.,0.0,0.64,0.31,0.31,0.0
word_freq_order: continuous.,0.94,0.25,0.63,0.63,0.0


In [47]:
# shift the names of the data dataframe up by one, and rename the last column to "spam"]
data.columns = np.append(names[33:], ["spam"])

In [48]:
data.head().T

Unnamed: 0,0,1,2,3,4
word_freq_make: continuous.,0.21,0.06,0.0,0.0,0.0
word_freq_address: continuous.,0.28,0.0,0.0,0.0,0.0
word_freq_all: continuous.,0.5,0.71,0.0,0.0,0.0
word_freq_3d: continuous.,0.0,0.0,0.0,0.0,0.0
word_freq_our: continuous.,0.14,1.23,0.63,0.63,1.85
word_freq_over: continuous.,0.28,0.19,0.0,0.0,0.0
word_freq_remove: continuous.,0.21,0.19,0.31,0.31,0.0
word_freq_internet: continuous.,0.07,0.12,0.63,0.63,1.85
word_freq_order: continuous.,0.0,0.64,0.31,0.31,0.0
word_freq_mail: continuous.,0.94,0.25,0.63,0.63,0.0


In [49]:
data.shape

(4600, 58)

In [50]:
# export data to csv named data_cleaned.csv
data.to_csv("data/data_cleaned_test.csv", index=False)

In [51]:
# print out the value_counts of the last column of data
data.iloc[:, -1].value_counts()

0    2788
1    1812
Name: spam, dtype: int64

In [52]:
# remove the word "continuous" from the data dataframe and remove any leftover whitespace
data.columns = data.columns.str.replace("continuous", "").str.strip()

In [53]:
# remove ':' and '.' from the data dataframe
data.columns = data.columns.str.replace(":", "").str.replace(".", "")

In [54]:
data.head().T

Unnamed: 0,0,1,2,3,4
word_freq_make,0.21,0.06,0.0,0.0,0.0
word_freq_address,0.28,0.0,0.0,0.0,0.0
word_freq_all,0.5,0.71,0.0,0.0,0.0
word_freq_3d,0.0,0.0,0.0,0.0,0.0
word_freq_our,0.14,1.23,0.63,0.63,1.85
word_freq_over,0.28,0.19,0.0,0.0,0.0
word_freq_remove,0.21,0.19,0.31,0.31,0.0
word_freq_internet,0.07,0.12,0.63,0.63,1.85
word_freq_order,0.0,0.64,0.31,0.31,0.0
word_freq_mail,0.94,0.25,0.63,0.63,0.0


In [55]:
# export data to csv named data_cleaned.csv
data.to_csv("data/data_cleaned.csv", index=True)

In [56]:
data

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
0,0.21,0.28,0.50,0.0,0.14,0.28,0.21,0.07,0.00,0.94,...,0.000,0.132,0.0,0.372,0.180,0.048,5.114,101,1028,1
1,0.06,0.00,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.010,0.143,0.0,0.276,0.184,0.010,9.821,485,2259,1
2,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.137,0.0,0.137,0.000,0.000,3.537,40,191,1
3,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.135,0.0,0.135,0.000,0.000,3.537,40,191,1
4,0.00,0.00,0.00,0.0,1.85,0.00,0.00,1.85,0.00,0.00,...,0.000,0.223,0.0,0.000,0.000,0.000,3.000,15,54,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4595,0.31,0.00,0.62,0.0,0.00,0.31,0.00,0.00,0.00,0.00,...,0.000,0.232,0.0,0.000,0.000,0.000,1.142,3,88,0
4596,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.353,0.000,0.000,1.555,4,14,0
4597,0.30,0.00,0.30,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.102,0.718,0.0,0.000,0.000,0.000,1.404,6,118,0
4598,0.96,0.00,0.00,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.057,0.0,0.000,0.000,0.000,1.147,5,78,0
