# Email Spam detection
This tutorial shows you how to build a very simple learning program that also uses the Gurobi solver to apply constraints on a multiclass classification problem with two classes, `spam` and `regular`.

In [1]:
import os
print(os.getcwd())
# Please change the root to an absolute or relative path to DomiKnowS root.
# In case relative path is used, consider the printed `CWD` as current working directory.
root = r'C:\Users\timlm\Documents\GitHub\DomiKnowS'

c:\Users\timlm\Documents\GitHub\DomiKnowS\tutorials


## The Graph
First we define the graph code that defines the domain knowledge for this problem.

In [2]:
import sys
sys.path.append(root)

from regr.graph import Graph, Concept # importing basic graph classes
from regr.graph.logicalConstrain import orL, andL, notL # importing basic constraint classes

Graph.clear()
Concept.clear()

# Declare the domain knowledge: one root concept (`email`) and the two
# label concepts derived from it.
with Graph('example') as graph:
    email = Concept(name='email')

    # `spam` is created by calling the `email` concept
    # (presumably making it a sub-concept of `email` -- confirm against the regr docs).
    Spam = email(name='spam')

    # `regular` is created from `email` in the same way.
    Regular = email(name='regular')

    # Logical constraint over the same email `x`:
    #   (not Spam(x) and Regular(x)) or (not Regular(x) and Spam(x))
    # i.e. an email must carry exactly one of the two labels -- never both,
    # and never neither.
    orL(andL(notL(Spam, ('x', )), Regular, ('x', )), andL(notL(Regular, ('x', )), Spam, ('x', )))



Log file for dataNode is in: c:\Users\timlm\Documents\GitHub\DomiKnowS\tutorials\datanode.log


## Data and Data Reader
As our data is spread across several text files in different folders, we have to write a reader class that reads these entries into a list of Python dictionaries. Here we extend the default reader class of the framework, `RegrReader`.


In [3]:
import os
from regr.data.reader import RegrReader

class EmailSpamReader(RegrReader):
    """Reader for a folder of e-mails split into ``spam`` and ``ham`` sub-folders.

    ``self.file`` must point to a folder that contains the two sub-folders
    ``spam`` and ``ham``; every ``.txt`` file directly inside them is read as
    one e-mail. The first line of a file holds the subject header (the text
    after the first ``:`` is kept), the following lines hold the body. If a
    line starting with the forward marker is present, everything after it is
    treated as a forwarded message with its own ``subject`` line and body.

    The ``get<Keyword>val`` methods expose the parsed fields under the
    keywords used by the ``ReaderSensor`` instances in this tutorial
    (``Subject``, ``Body``, ``ForwardSubject``, ``ForwardBody``, ``Spam``,
    ``Regular``).
    """

    # Prefix of the line that separates the main message from a forwarded one.
    _FORWARD_MARKER = '- - - - - - - - -'

    def parse_file(self, ):
        """Parse all e-mails below ``self.file`` into a list of dictionaries.

        Returns:
            A list with one dict per e-mail, spam e-mails first, then ham.
            Each dict has the keys ``subject``, ``body`` and ``label``
            (``"spam"`` or ``"ham"``) and, when a forwarded message was
            found, ``forward_body`` and (if a subject line exists in the
            forwarded part) ``forward_subject``.
        """
        folder = self.file
        final_data = []
        for label in ("spam", "ham"):
            for lines in self._read_emails(os.path.join(folder, label)):
                if not lines:
                    # An empty file carries no subject or body; skip it
                    # instead of failing on the missing first line.
                    continue
                item = self._parse_email(lines)
                item['label'] = label
                final_data.append(item)
        return final_data

    @staticmethod
    def _read_emails(folder):
        """Yield the list of lines of every ``.txt`` file directly in ``folder``."""
        # os.listdir returns names in arbitrary order; sorting keeps the
        # sample order reproducible between runs and machines.
        for name in sorted(os.listdir(folder)):
            path = os.path.join(folder, name)
            if name.endswith('.txt') and os.path.isfile(path):
                with open(path, "r") as handle:
                    yield handle.readlines()

    @classmethod
    def _parse_email(cls, lines):
        """Split the (non-empty) lines of one e-mail into its named parts."""
        # partition keeps everything after the FIRST colon, so a subject such
        # as "subject: re: hello" is not cut off at its second colon, and a
        # first line without any colon yields an empty subject instead of
        # raising IndexError.
        item = {'subject': lines[0].partition(":")[2]}

        marker_at = next(
            (i for i, line in enumerate(lines) if line.startswith(cls._FORWARD_MARKER)),
            None,
        )
        if marker_at is None:
            # No forwarded message: the whole remainder is the body.
            item['body'] = "".join(lines[1:])
            return item

        item['body'] = "".join(lines[1:marker_at])
        forwarded = lines[marker_at:]  # starts with the marker line itself
        subject_at = next(
            (i for i, line in enumerate(forwarded) if line.startswith('subject')),
            None,
        )
        if subject_at is None:
            # Forwarded part without a subject line: keep its text as the
            # forwarded body and leave 'forward_subject' unset.
            item['forward_body'] = "".join(forwarded[1:])
        else:
            item['forward_subject'] = forwarded[subject_at].partition(":")[2]
            item['forward_body'] = "".join(forwarded[subject_at + 1:])
        return item

    def getSubjectval(self, item):
        """Return the subject text wrapped in a one-element list."""
        return [item['subject']]

    def getBodyval(self, item):
        """Return the body text wrapped in a one-element list."""
        return [item['body']]

    def getForwardSubjectval(self, item):
        """Return the forwarded subject in a list, or None if there is none."""
        if 'forward_subject' in item:
            return [item['forward_subject']]
        return None

    def getForwardBodyval(self, item):
        """Return the forwarded body in a list, or None if there is none."""
        if 'forward_body' in item:
            return [item['forward_body']]
        return None

    def getSpamval(self, item):
        """Return [1] when the e-mail is labelled spam, otherwise [0]."""
        return [1] if item['label'] == "spam" else [0]

    def getRegularval(self, item):
        """Return [1] when the e-mail is labelled ham (regular), otherwise [0]."""
        return [1] if item['label'] == "ham" else [0]

This class overrides the `parse_file` function to parse the data into a list of dictionaries, and then defines getter methods whose names provide the keywords used later by `ReaderSensor` to connect the data with our knowledge graph. Next we create instances of this class for the training and test samples.

In [4]:
import os

train_reader = EmailSpamReader(file=os.path.join(root, 'examples/Email_Spam/data/train'), type="folder")
test_reader = EmailSpamReader(file=os.path.join(root, 'examples/Email_Spam/data/test'), type="folder")

You can check your very first instance by calling `next` on an iterator over your reader.
! Make sure to re-initialize your reader if you call `next` on the test reader.

In [5]:
print(next(iter(train_reader)))

{'Body': ["the lowest life insurance quotes\nwithout the hassle !\ncompare rates from the\nnation ' s top insurance companies\nshop ,\ncompare and save\nfill out the simple form ,\nand you ' ll have the\n15 best custom quotes in 1 minute .\ncompare your current coverage\nto these sample 10 - year level term monthly\npremiums\n( 20 year , 30 year and smoker rates also available )\n$ 250 , 000\n$ 500 , 000\n$ 1 , 000 , 000\nage\nmale\nfemale\nmale\nfemale\nmale\nfemale\n30\n$ 12\n$ 11\n$ 19\n$ 15\n$ 31\n$ 27\n40\n$ 15\n$ 13\n$ 26\n$ 21\n$ 38\n$ 37\n50\n$ 32\n$ 24\n$ 59\n$ 43\n$ 107\n$ 78\n60\n$ 75\n$ 46\n$ 134\n$ 87\n$ 259\n$ 161\nclick here to compare !\nit ' s fast , easy and free !\n* all quotes shown are from insurance companies rated a - , a , a + or a + + by\na . m . best company ( a registered rating service ) and include all fees and commissions .\nactual premiums and coverage availability will vary depending upon age , sex , state\navailability , health history and recent tobacc

## Model Declaration
Now we start to connect the reader output data with our formatted domain knowledge defined in the graph.

In [6]:
from regr.sensor.pytorch.sensors import ReaderSensor

email['subject'] = ReaderSensor(keyword='Subject')
email['body'] = ReaderSensor(keyword="Body")
email['forward_subject'] = ReaderSensor(keyword="ForwardSubject")
email['forward_body'] = ReaderSensor(keyword="ForwardBody")

Next we read the labels for the `spam` and `regular` concepts

In [7]:
email[Spam] = ReaderSensor(keyword='Spam', label=True)
email[Regular] = ReaderSensor(keyword='Regular', label=True)

### Define a new sensor
Here we want to use spaCy to define a new sensor which gives us an averaged embedding tensor (the spaCy document vector) for a sentence

In [8]:
from regr.sensor.pytorch.sensors import FunctionalSensor
import spacy
from typing import Any
import torch

class SentenceRepSensor(FunctionalSensor):
    """Sensor that turns texts into spaCy document vectors.

    Args:
        *pres: names of the properties this sensor reads its input from.
        edges: forwarded unchanged to ``FunctionalSensor``.
        label: forwarded unchanged to ``FunctionalSensor``.
    """

    def __init__(self, *pres, edges=None, label=False):
        # Pass the caller's `edges` and `label` on to the base class; they
        # were previously replaced by hard-coded defaults and thus ignored.
        super().__init__(*pres, edges=edges, label=label)
        self.nlp = spacy.load('en_core_web_sm')

    def forward(self, text) -> Any:
        """Return a tensor holding one spaCy ``Doc.vector`` per input text.

        Args:
            text: an iterable of strings (fed to ``nlp.pipe``).

        Returns:
            A tensor on ``self.device`` with one row per text.
        """
        docs = list(self.nlp.pipe(text))
        return torch.tensor([doc.vector for doc in docs], device=self.device)

The input to this sensor would be a sentence. You can find the usage of this sensor in the following sections.

Next, we want to define a new sensor which gives us a tensor indicating whether the email has a forwarded message or not.

In [9]:
class ForwardPresenceSensor(FunctionalSensor):
    """Sensor flagging whether an e-mail carries a forwarded message."""

    def forward(self, forward_body) -> Any:
        """Return a 1x1 tensor of ones if `forward_body` is truthy, else zeros."""
        # Pick the tensor factory first, then build and move the result once.
        make_flag = torch.ones if forward_body else torch.zeros
        return make_flag(1, 1).to(self.device)

### Connecting new sensors to the graph 
We connect these sensors to the graph to create new properties on the concept `email`. We want to build new representations of the `subject` and `body` of the email, which is why those properties are passed as inputs to the defined sensors.

In [10]:
email['subject_rep'] = SentenceRepSensor('subject')
email['body_rep'] = SentenceRepSensor('body')
email['forward_presence'] = ForwardPresenceSensor('forward_body')

### Preparing input features for the learner
Now we concatenate all the generated features to make a new property on the graph which will provide input for the classifier of `spam` and `regular` concepts.
We can use the `FunctionalSensor` and pass the concatenation function as the `forward` parameter when initializing an instance.

In [11]:
def concat(*x):
    """Join all given tensors along their last dimension."""
    last_dim = -1
    joined = torch.cat(x, dim=last_dim)
    return joined
email['features'] = FunctionalSensor('subject_rep', 'body_rep', 'forward_presence', forward=concat)

### Define the learner
Here we define a learner and connect it to the concepts of `spam` and `regular`. This learner is a simple pytorch module of linear neural network.

In [12]:
from regr.sensor.pytorch.learners import ModuleLearner
from torch import nn

# Each concept gets its own, independently trained linear layer that reads the
# concatenated `features` property and produces 2 outputs.
# NOTE(review): the input size 193 presumably equals the width of
# `subject_rep` + `body_rep` + 1 (`forward_presence`); verify it against the
# vector size of the spaCy model if the model is changed.
email[Spam] = ModuleLearner('features', module=nn.Linear(193, 2))
email[Regular] = ModuleLearner('features', module=nn.Linear(193, 2))

### Make the learning model from the updated graph
Here we make an executable version of this graph that is able to trace the dependencies of the sensors and fill the data from the reader to run examples on the declared model.

In [13]:
from regr.program import LearningBasedProgram
from regr.program.model.pytorch import PoiModel
from regr.program.metric import MacroAverageTracker, PRF1Tracker
from regr.program.loss import NBCrossEntropyLoss

program = LearningBasedProgram(graph, PoiModel, loss=MacroAverageTracker(NBCrossEntropyLoss()), metric=PRF1Tracker())

We can change the level of logging from the program using the following setting.

In [14]:
# set logger level to see training and testing logs
import logging
logging.basicConfig(level=logging.INFO)

### Execute The program to train
We can use the `train` method to start training the model based on defined loss. We can specify the number of training epochs, the dataset and the optimizer.

In [15]:
program.train(train_reader, train_epoch_num=10, Optim=torch.optim.Adam, device='auto')


INFO:regr.program.program:Epoch: 0
INFO:regr.program.program:Training:
Epoch 0 Training: 100%|██████████| 10/10 [00:00<00:00, 10.84it/s]
INFO:regr.program.program: - loss:
INFO:regr.program.program:{'spam': tensor(0.7192), 'regular': tensor(0.6869)}
INFO:regr.program.program: - metric:
INFO:regr.program.program:{'spam': {'P': tensor(0.4667), 'R': tensor(0.7000), 'F1': tensor(0.5600)}, 'regular': {'P': tensor(0.6667), 'R': tensor(0.6000), 'F1': tensor(0.6316)}}
INFO:regr.program.program:Epoch: 1
INFO:regr.program.program:Training:
Epoch 1 Training: 100%|██████████| 10/10 [00:00<00:00, 15.49it/s]
INFO:regr.program.program: - loss:
INFO:regr.program.program:{'spam': tensor(0.6136), 'regular': tensor(0.5849)}
INFO:regr.program.program: - metric:
INFO:regr.program.program:{'spam': {'P': tensor(0.6154), 'R': tensor(0.8000), 'F1': tensor(0.6957)}, 'regular': {'P': tensor(0.7500), 'R': tensor(0.9000), 'F1': tensor(0.8182)}}
INFO:regr.program.program:Epoch: 2
INFO:regr.program.program:Training:

In order to test the performance on any dataset with the trained model, we should call `test` on the program and pass the dataset instance to it.

In [16]:
program.test(test_reader)

INFO:regr.program.program:Testing:
Testing: 100%|██████████| 10/10 [00:00<00:00, 11.96it/s]
INFO:regr.program.program: - loss:
INFO:regr.program.program:{'spam': tensor(0.6098), 'regular': tensor(0.6298)}
INFO:regr.program.program: - metric:
INFO:regr.program.program:{'spam': {'P': tensor(0.5455), 'R': tensor(0.6000), 'F1': tensor(0.5714)}, 'regular': {'P': tensor(0.5000), 'R': tensor(0.6000), 'F1': tensor(0.5455)}}


## Run the graph
Here we use populate to run the graph with the defined data from the reader

In [17]:
for datanode in program.populate(dataset=train_reader):
    print(datanode)

email 0
email 0
email 0
email 0
email 0
email 0
email 0
email 0
email 0
email 0


In [18]:
# Populate one datanode per test e-mail, show the raw model scores and then
# the labels chosen by ILP inference.
for datanode in program.populate(dataset=test_reader):
    print('datanode:', datanode)
    # Learner outputs for each concept, normalised with a softmax over the last dimension.
    print('Spam:', datanode.getAttribute(Spam).softmax(-1))
    print('Regular:', datanode.getAttribute(Regular).softmax(-1))
    # Run ILP inference on this datanode. `fun` maps a value to its softmax as
    # a plain Python list (presumably applied to each concept's scores before
    # they reach the solver -- confirm against the DomiKnowS docs).
    datanode.inferILPConstrains(fun=lambda val: torch.tensor(val).softmax(dim=-1).detach().cpu().numpy().tolist(), epsilon=None)
    # Results of the inference are read back under the 'ILP' key.
    print('inference spam:', datanode.getAttribute(Spam, 'ILP'))
    print('inference regular:', datanode.getAttribute(Regular, 'ILP'))

datanode: email 0
Spam: tensor([0.5562, 0.4438])
Regular: tensor([0.4286, 0.5714])
Log file for ilpOntSolver is in: c:\Users\timlm\Documents\GitHub\DomiKnowS\tutorials\ilpOntSolver.log
Academic license - for non-commercial use only - expires 2021-04-13
  if self.ifLog: self.myLogger.debug("%s constrain already created - doing nothing"(logicMethodName))
INFO:gurobipy.gurobipy:Academic license - for non-commercial use only - expires 2021-04-13
Using license file C:\Users\timlm\gurobi.lic
INFO:gurobipy.gurobipy:Using license file C:\Users\timlm\gurobi.lic
inference spam: tensor([0.], dtype=torch.float64)
inference regular: tensor([1.], dtype=torch.float64)
datanode: email 0
Spam: tensor([0.2690, 0.7310])
Regular: tensor([0.6692, 0.3308])
inference spam: tensor([1.], dtype=torch.float64)
inference regular: tensor([0.], dtype=torch.float64)
datanode: email 0
Spam: tensor([0.3849, 0.6151])
Regular: tensor([0.6318, 0.3682])
inference spam: tensor([1.], dtype=torch.float64)
inference regular: 

In [24]:
# Imported under an alias so that the `Graph` class imported from `regr.graph`
# earlier in this notebook is not rebound to graphviz's class.
from graphviz import Graph as DotGraph

# Draw the datanode left over from the last iteration of the previous loop
# together with the model scores of both concepts.
dot = DotGraph()
dot.node('Email', str(datanode))
# Nodes created from here on are drawn as squares.
dot.attr('node', shape='square')
# Both concept nodes show the softmax-normalised learner output.
dot.node('Spam', 'Spam: ' + str(datanode.getAttribute(Spam).softmax(-1)))
dot.node('Regular', 'Regular: ' + str(datanode.getAttribute(Regular).softmax(-1)))
# constraint='false' keeps these edges from influencing node ranking.
dot.edge('Email', 'Spam', constraint='false')
dot.edge('Email', 'Regular', constraint='false')
# Writes the DOT source to 'Graph', renders it and opens the result.
dot.render('Graph', view=True)

'Graph.pdf'