# Preparations

In [1]:
import json
import os

## Mount Google Drive with raw data

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
! ls /content/drive/MyDrive/Colab\ Notebooks/Spam_Detector/Enron_raw_data/dataset

test  train  train.csv


## Install this project package from github

In [4]:
!rm -rf /content/spam_detector/
!git clone https://github.com/NataliaTarasovaNatoshir/spam_detector.git
%cd spam_detector/
!git pull origin master
!python setup.py install

Cloning into 'spam_detector'...
remote: Enumerating objects: 63, done.[K
remote: Counting objects: 100% (63/63), done.[K
remote: Compressing objects: 100% (48/48), done.[K
remote: Total 63 (delta 29), reused 47 (delta 13), pack-reused 0[K
Unpacking objects: 100% (63/63), done.
/content/spam_detector
From https://github.com/NataliaTarasovaNatoshir/spam_detector
 * branch            master     -> FETCH_HEAD
Already up to date.
  "details." % version
running install
running bdist_egg
running egg_info
creating spam_detector.egg-info
writing spam_detector.egg-info/PKG-INFO
writing dependency_links to spam_detector.egg-info/dependency_links.txt
writing top-level names to spam_detector.egg-info/top_level.txt
writing manifest file 'spam_detector.egg-info/SOURCES.txt'
writing manifest file 'spam_detector.egg-info/SOURCES.txt'
installing library code to build/bdist.linux-x86_64/egg
running install_lib
running build_py
creating build
creating build/lib
creating build/lib/spam_detector
copying

In [5]:
# load config from package
with open("/content/spam_detector/spam_detector/config.json") as file:
  config = json.load(file)
config

{'dataset_build': {'raw_files_folder': '/content/drive/MyDrive/Colab Notebooks/Spam_Detector/Enron_raw_data/raw_files',
  'res_dataset_folder_name': '/content/drive/MyDrive/Colab Notebooks/Spam_Detector/Enron_raw_data/dataset',
  'test_share': 0.3}}

# Overview

In [6]:
dataset_folder = config['dataset_build']['res_dataset_folder_name']

In [7]:
ham_test_cnt = len(os.listdir(os.path.join(dataset_folder, 'test', 'ham')))
spam_test_cnt = len(os.listdir(os.path.join(dataset_folder, 'test', 'spam')))
ham_train_cnt = len(os.listdir(os.path.join(dataset_folder, 'train', 'ham')))
spam_train_cnt = len(os.listdir(os.path.join(dataset_folder, 'train', 'spam')))
print('Resulting statistics')
print('Total number of files = {0}. Train: {1}, Test: {2} ({3:.0f}%)'.format(
    ham_test_cnt + spam_test_cnt + ham_train_cnt + spam_train_cnt, ham_train_cnt + spam_train_cnt,
    ham_test_cnt + spam_test_cnt,
    100 * (ham_test_cnt + spam_test_cnt) / (ham_test_cnt + spam_test_cnt + ham_train_cnt + spam_train_cnt)))
print("Train. Ham: {0}, Spam: {1} ({2:.0f}%)".format(ham_train_cnt, spam_train_cnt,
                                                      100 * spam_train_cnt / (ham_train_cnt + spam_train_cnt)))
print("Test. Ham: {0}, Spam: {1} ({2:.0f}%)".format(ham_test_cnt, spam_test_cnt,
                                                      100 * spam_test_cnt / (ham_test_cnt + spam_test_cnt)))

Resulting statistics
Total number of files = 33708. Train: 23599, Test: 10109 (30%)
Train. Ham: 11583, Spam: 12016 (51%)
Test. Ham: 4962, Spam: 5147 (51%)


Let's look at random examples

In [8]:
# ham
ham_files = os.listdir(os.path.join(dataset_folder, 'train', 'ham'))
with open(os.path.join(os.path.join(dataset_folder, 'train', 'ham'), ham_files[5000]), 'r') as f:
  text = f.read()
text

"Subject: august palo verde\nmy latest information on rate increases in california indicates that they will be implemented june lst . that means that customers don ' t see a price signal until june and won ' t actually see their bill until july . while i agree with greg that we may see dramatic demand side response once people see their bills , we have no idea how long it will take people and businesses to adjust their behavior . i don ' t really care what positions you guys want to carry . my only advice is that greg is short a fairly illiquid product . if you want to get out right now you would have to pay , at least for some of your position , north of $ 800 / mwh . in the near term i don ' t think that it will be easy to get out of this trade . in the long run , if the market comes off , it will be easy to get out .\nbottom line - -\nat the end of the day i think that you may be right , but things look very bullish right now . i want to make sure that you know that you have already

In [9]:
# spam
spam_files = os.listdir(os.path.join(dataset_folder, 'train', 'spam'))
with open(os.path.join(os.path.join(dataset_folder, 'train', 'spam'), spam_files[3000]), 'r') as f:
  text = f.read()
text

'Subject: message subject\ncentum situs syllabic bonnie decorticate petroglyph\nfind all your medications in one place !\na whole variety of pills ! have a look !\nyou name it ! we have it !\nstop receiving promotional material now\ncoralline dactyl quicken combinatorial cybernetics\n'

# Basic preprocessing

Take a folder with files and convert it to a dataframe. Create message_id, extract subject from mail text and remove \n from the message text

In [10]:
def preprocess_message_text(message_text):
  res = {'subject': None, 'text': None}
  lines = message_text.split('\n')
  if lines[0][:len('Subject:')] == 'Subject:':
    res['subject'] = lines[0][len('Subject: '):]
    res['text'] = ' '.join(lines[1:])
  else:
    res['text'] = ' '.join(lines)
  return res

In [11]:
preprocess_message_text(text)

{'subject': 'message subject',
 'text': 'centum situs syllabic bonnie decorticate petroglyph find all your medications in one place ! a whole variety of pills ! have a look ! you name it ! we have it ! stop receiving promotional material now coralline dactyl quicken combinatorial cybernetics '}

In [12]:
# test preprocessing on first 15 files in folder
import pandas as pd
def preprocess_mails_in_folder(folder_path):
  print('Processing files in folder {}'.format(folder_path))
  df = []
  all_files = os.listdir(folder_path)
  print('Number of files in folder: {}'.format(len(all_files)))
  processed_num = 0
  for file_name in all_files:
    if processed_num%100 == 0: print("{} files out of {}".format(
        processed_num, len(all_files)))
    with open(os.path.join(folder_path, file_name), 'r') as f:
      text = f.read()
    preprocessed_text = preprocess_message_text(text)
    preprocessed_text['message_id'] = file_name
    df.append(preprocessed_text)
    processed_num += 1
    if processed_num == 15: break
  print('Preparing dataframe')
  df = pd.DataFrame(df)
  print('Processing completed')
  return df

In [13]:
df = preprocess_mails_in_folder(folder_path=os.path.join(dataset_folder, 'test', 'ham'))

Processing files in folder /content/drive/MyDrive/Colab Notebooks/Spam_Detector/Enron_raw_data/dataset/test/ham
Number of files in folder: 4962
0 files out of 4962
Preparing dataframe
Processing completed


In [14]:
df

Unnamed: 0,subject,text,message_id
0,prc for todd,"sally , attached below is a list of my accompl...",1496.2000-06-18.beck.ham.txt
1,var training,"sally , angela sprock forwarded your vm to me ...",1516.2000-06-19.beck.ham.txt
2,average eol and non - eol deals per day,"fyi - we are now distributing the "" john lavor...",1518.2000-06-19.beck.ham.txt
3,ena sap project,"as you know , we have substantially wrapped up...",1546.2000-06-20.beck.ham.txt
4,re : var training,"sally , i understand your comments . i ' ll ge...",1551.2000-06-20.beck.ham.txt
5,enron japan weekly update,"hello ejot , this is just a reminder about thu...",1556.2000-06-21.beck.ham.txt
6,sap id - here it is ! ! ! ! !,the following sap id and password allows you t...,1562.2000-06-21.beck.ham.txt
7,for discussion : implementing definition of tr...,this is the memo i was referring to - - - - - ...,1573.2000-06-21.beck.ham.txt
8,conference call text for discussion,- - - - - - - - - - - - - - - - - - - - - - fo...,1576.2000-06-21.beck.ham.txt
9,2000 accomplishments,sap related - - financial settlements develope...,1604.2000-06-23.beck.ham.txt


# Preprocess train and test datasets and save to csv

In [16]:
from spam_detector import basic_preprocessing

In [15]:
# Build a dataframe with training data

df_ham = basic_preprocessing.preprocess_mails_in_folder(folder_path=os.path.join(dataset_folder, 'train', 'ham'))
df_ham['label'] = 0
print()

df_spam = basic_preprocessing.preprocess_mails_in_folder(folder_path=os.path.join(dataset_folder, 'train', 'spam'))
df_spam['label'] = 1

df = pd.concat([df_ham, df_spam], ignore_index=True)
df.to_csv(os.path.join(dataset_folder, 'train.csv'), index=False)

Processing files in folder /content/drive/MyDrive/Colab Notebooks/Spam_Detector/Enron_raw_data/dataset/train/ham
Number of files in folder: 11583
0 files out of 11583
1000 files out of 11583
2000 files out of 11583
3000 files out of 11583
4000 files out of 11583
5000 files out of 11583
6000 files out of 11583
7000 files out of 11583
8000 files out of 11583
9000 files out of 11583
10000 files out of 11583
11000 files out of 11583
Preparing dataframe
Processing completed

Processing files in folder /content/drive/MyDrive/Colab Notebooks/Spam_Detector/Enron_raw_data/dataset/train/spam
Number of files in folder: 12016
0 files out of 12016
1000 files out of 12016
2000 files out of 12016
Processing error: 4210.2005-04-03.BG.spam.txt
Processing error: 2067.2004-12-12.BG.spam.txt
Processing error: 2180.2004-12-18.BG.spam.txt
3000 files out of 12016
Processing error: 2381.2004-12-28.BG.spam.txt
4000 files out of 12016
5000 files out of 12016
6000 files out of 12016
Processing error: 4102.2005-0

In [17]:
# Build a dataframe with testing data

df_ham = basic_preprocessing.preprocess_mails_in_folder(folder_path=os.path.join(dataset_folder, 'test', 'ham'))
df_ham['label'] = 0
df_spam = basic_preprocessing.preprocess_mails_in_folder(folder_path=os.path.join(dataset_folder, 'test', 'spam'))
df_spam['label'] = 1

df = pd.concat([df_ham, df_spam], ignore_index=True)
df.to_csv(os.path.join(dataset_folder, 'test.csv'), index=False)

Processing files in folder /content/drive/MyDrive/Colab Notebooks/Spam_Detector/Enron_raw_data/dataset/test/ham
Number of files in folder: 4962
0 files out of 4962
1000 files out of 4962
2000 files out of 4962
3000 files out of 4962
4000 files out of 4962
Preparing dataframe
Processing completed
Processing files in folder /content/drive/MyDrive/Colab Notebooks/Spam_Detector/Enron_raw_data/dataset/test/spam
Number of files in folder: 5147
0 files out of 5147
1000 files out of 5147
2000 files out of 5147
Processing error: 5268.2005-05-24.GP.spam.txt
3000 files out of 5147
4000 files out of 5147
Processing error: 4566.2005-05-24.GP.spam.txt
Processing error: 1938.2004-12-19.BG.spam.txt
5000 files out of 5147
Processing error: 2248.2004-09-23.GP.spam.txt
Preparing dataframe
Processing completed


In [18]:
# ensure all changes are saved on google drive
drive.flush_and_unmount()
print('All changes made in this colab session should now be visible in Drive.')

All changes made in this colab session should now be visible in Drive.
