# Email Spam detection
This tutorial shows how to build a very simple learning program that also uses the Gurobi solver to apply constraints to a classification problem with two classes, `spam` and `regular`.

In [1]:
import os
# Print the current working directory so a relative `root` can be chosen against it.
print(os.getcwd())
# Set `root` to an absolute or relative path of your DomiKnowS checkout.
# A relative path is resolved against the working directory printed above.
# `root` is appended to `sys.path` and used to locate the example data further down.
root = '/home/hfaghihi/Framework/DomiKnowS'

/VL/space/guoquan/repos/RelationalGraph/tutorials


## The Graph
First we define the graph code that defines the domain knowledge for this problem.

In [2]:
import sys
sys.path.append(root)

from regr.graph import Graph, Concept # importing basic graph classes
from regr.graph.logicalConstrain import orL, andL, notL # importing basic constraint classes

# Reset Graph and Concept state before declaring the graph.
# NOTE(review): presumably this lets the cell be re-run without leftovers from
# a previous declaration -- confirm against the framework documentation.
Graph.clear()
Concept.clear()

with Graph('example') as graph:
    # Base concept: one e-mail message.
    email = Concept(name='email')

    # `spam` and `regular` are both declared by calling the `email` concept.
    Spam = email(name='spam')

    Regular = email(name='regular')

    # The constraint of not having regular and spam together:
    # (not Spam and Regular) or (not Regular and Spam), i.e. exactly one of
    # the two holds. NOTE(review): ('x', ) presumably names the variable the
    # concepts are applied to -- confirm.
    orL(andL(notL(Spam, ('x', )), Regular, ('x', )), andL(notL(Regular, ('x', )), Spam, ('x', )))





## Data and Data Reader
As our data is spread across different text files in different folders, we have to write a reader class that reads these entries into a list of Python dictionaries. Here we build on the default Reader class of the framework.


In [3]:
import os
from regr.data.reader import RegrReader

class EmailSpamReader(RegrReader):
    """Reader that turns a folder of e-mail text files into example dicts.

    ``self.file`` must be a folder containing a ``spam`` and a ``ham``
    sub-folder. Every ``*.txt`` file inside them is one e-mail: the first
    line carries the subject after a colon, the remaining lines are the body.
    A line starting with ``- - - - - - - - -`` marks the start of a forwarded
    message, whose own subject is the first following line that starts with
    ``subject``.

    The ``get<Keyword>val`` methods expose the parsed fields; in this
    tutorial the ``ReaderSensor`` keywords (``Subject``, ``Body``,
    ``ForwardSubject``, ``ForwardBody``, ``Spam``, ``Regular``) match the
    ``<Keyword>`` part of those method names.
    """

    # Prefix of the line that separates an e-mail from a forwarded message.
    _FORWARD_MARKER = '- - - - - - - - -'

    def parse_file(self, ):
        """Parse every e-mail under ``self.file`` into a list of dicts.

        Returns:
            All spam items followed by all ham items. Each item has the keys
            ``subject``, ``body`` and ``label`` (``"spam"`` or ``"ham"``),
            plus ``forward_body`` (and ``forward_subject`` when one is found)
            if the e-mail contains a forwarded message.
        """
        folder = self.file
        return self._read_split(folder, "spam") + self._read_split(folder, "ham")

    @classmethod
    def _read_split(cls, folder, label):
        """Read all ``*.txt`` files of ``folder/<label>`` and parse each one."""
        directory = os.path.join(folder, label)
        items = []
        # os.listdir returns names in arbitrary order; sorting keeps the
        # example order reproducible between runs and machines.
        for name in sorted(os.listdir(directory)):
            path = os.path.join(directory, name)
            if not (name.endswith('.txt') and os.path.isfile(path)):
                continue
            with open(path, "r") as handle:
                lines = handle.readlines()
            items.append(cls._build_item(lines, label))
        return items

    @classmethod
    def _build_item(cls, lines, label):
        """Convert the lines of one e-mail file into an example dict."""
        # An empty file yields an empty subject and body instead of crashing.
        item = {'subject': cls._after_colon(lines[0]) if lines else ''}

        marker = next(
            (i for i, line in enumerate(lines)
             if line.startswith(cls._FORWARD_MARKER)),
            None,
        )
        if marker is None:
            item['body'] = "".join(lines[1:])
        else:
            item['body'] = "".join(lines[1:marker])
            forward_subject_at = next(
                (i for i in range(marker, len(lines))
                 if lines[i].startswith('subject')),
                None,
            )
            if forward_subject_at is None:
                # Forwarded part without a subject line: keep its text so the
                # forward is still detectable, and leave the subject unset.
                item['forward_body'] = "".join(lines[marker + 1:])
            else:
                item['forward_subject'] = cls._after_colon(lines[forward_subject_at])
                item['forward_body'] = "".join(lines[forward_subject_at + 1:])

        item['label'] = label
        return item

    @staticmethod
    def _after_colon(line):
        """Return everything after the first ``:`` of ``line``.

        Unlike ``line.split(":")[1]`` this keeps subjects that themselves
        contain colons intact and returns ``''`` when there is no colon.
        """
        return line.partition(":")[2]

    def getSubjectval(self, item):
        """Return the subject text of ``item``."""
        return item['subject']

    def getBodyval(self, item):
        """Return the body text of ``item``."""
        return item['body']

    def getForwardSubjectval(self, item):
        """Return the forwarded message's subject, or ``None`` if absent."""
        return item.get('forward_subject')

    def getForwardBodyval(self, item):
        """Return the forwarded message's body, or ``None`` if absent."""
        return item.get('forward_body')

    def getSpamval(self, item):
        """Return ``[1]`` when ``item`` is labelled spam, else ``[0]``."""
        return [1] if item['label'] == "spam" else [0]

    def getRegularval(self, item):
        """Return ``[1]`` when ``item`` is labelled ham, else ``[0]``."""
        return [1] if item['label'] == "ham" else [0]

This class overrides the `parse_file` function to parse the data into a list of dictionaries and then defines some keywords to be used by `ReaderSensor` later in our program to connect the data with our knowledge graph. Next we create instances of this class for the training and test samples.

In [4]:
import os

# Build one reader per data split; `file` points at the split folder inside the DomiKnowS checkout.
# NOTE(review): `type="folder"` presumably tells the base reader that `file` is a directory -- confirm.
train_reader = EmailSpamReader(file=os.path.join(root, 'examples/Email_Spam/data/train'), type="folder")
test_reader = EmailSpamReader(file=os.path.join(root, 'examples/Email_Spam/data/test'), type="folder")

You can check your very first instance by calling `next` on an iterator over your reader.
! Make sure to re-initiate your reader if you do call `next` for testing.

In [5]:
# Peek at the first example produced by the training reader.
print(next(iter(train_reader)))

{'Body': 'take control of your life today\nand make the money that you deserve ! welcome to the last stop in your search for a true and total money making opportunity . we have\nstudied marketing for years and have finally come to where we are able to deliver to you the most complete package available today ! if you are looking for unlimited income\npotential then this is the chance for you . this is a real business , not multi - level - marketing , network marketing or any other type of business that you have seen in the past . it is a complete state of the art system that comes complete with everything that you need to succeed . find out how to capture the most exciting opportunity in years ! click here\nautomated\nremoval instructionsthis\nmessage is intended for individuals who have an interest in financial and money matters . if this message has reached you in error , and you want to be removed from our mail agents database , click here and type your email address in the subject h

## Model Declaration
Now we start to connect the reader output data with our formatted domain knowledge defined in the graph.

In [6]:
from regr.sensor.pytorch.sensors import ReaderSensor

# Attach the raw reader fields to the `email` concept as properties.
# Each keyword matches the `X` part of one of the reader's `getXval` methods.
email['subject'] = ReaderSensor(keyword='Subject')
email['body'] = ReaderSensor(keyword="Body")
email['forward_subject'] = ReaderSensor(keyword="ForwardSubject")
email['forward_body'] = ReaderSensor(keyword="ForwardBody")

Next we read the labels for the `spam` and `regular` concepts

In [7]:
# Read the values produced by `getSpamval` / `getRegularval` for the two concepts;
# `label=True` flags these readings as labels.
email[Spam] = ReaderSensor(keyword='Spam', label=True)
email[Regular] = ReaderSensor(keyword='Regular', label=True)

### Define a new sensor
Here we want to use spacy to define a new sensor which gives us an average glove embedding tensor for a sentence

In [8]:
from regr.sensor.pytorch.sensors import TorchSensor
import spacy
from typing import Any
import torch

class SentenceRepSensor(TorchSensor):
    """Sensor that embeds a text property with spaCy's ``en_core_web_lg``.

    The first input of the sensor is expected to be the text to embed.

    Args:
        *pres: Names of the properties this sensor reads, passed on to
            ``TorchSensor``.
        edges: Passed on to ``TorchSensor`` unchanged.
        label: Passed on to ``TorchSensor`` unchanged.
    """

    def __init__(self, *pres, edges=None, label=False):
        # Pass the caller's `edges` and `label` through; the previous code
        # hard-coded `edges=None, label=False` and silently dropped them.
        super().__init__(*pres, edges=edges, label=label)
        self.nlp = spacy.load('en_core_web_lg')

    def forward(self,) -> Any:
        """Return the spaCy document vector of the first input as a tensor.

        The tensor is moved to ``self.device``.
        """
        # Named `doc` rather than `email` to avoid shadowing the global
        # `email` concept declared in the graph.
        doc = self.nlp(self.inputs[0])
        return torch.from_numpy(doc.vector).to(device=self.device)

The input to this sensor would be a sentence. You can find the usage of this sensor in the following sections.

Next, we want to define a new sensor which gives us a tensor indicating whether the email has a forwarded message or not.

In [9]:
class ForwardPresenceSensor(TorchSensor):
    """Sensor that emits a one-element flag tensor for its first input."""

    def forward(self,) -> Any:
        """Return ``ones(1)`` for a truthy first input, ``zeros(1)`` otherwise.

        The resulting tensor is moved to ``self.device``.
        """
        make_flag = torch.ones if self.inputs[0] else torch.zeros
        return make_flag(1).to(self.device)

### Connecting new sensors to the graph 
We connect these sensors to the graph to make new properties on the concept `email`. We want to make new representations of the `subject` and `body` of the email, and that is why those properties are passed as input to the defined sensors.

In [10]:
# Derived properties: sentence representations of the subject and body, plus a
# flag computed from `forward_body` (absent for e-mails without a forward).
email['subject_rep'] = SentenceRepSensor('subject')
email['body_rep'] = SentenceRepSensor('body')
email['forward_presence'] = ForwardPresenceSensor('forward_body')

### Preparing input features for the learner
Now we concatenate all the generated features to make a new property on the graph which will provide input for the classifier of `spam` and `regular` concepts.

In [11]:
from regr.sensor.pytorch.sensors import ConcatSensor

# Combine the three derived properties into the single `features` property fed to the learners.
email['features'] = ConcatSensor('subject_rep', 'body_rep', 'forward_presence')

### Define the learner
Here we define a learner and connect it to the concepts of `spam` and `regular`. This learner is a simple PyTorch module: a single linear layer.

In [12]:
from regr.sensor.pytorch.learners import ModuleLearner
from torch import nn

# One separate linear classifier per concept, each producing 2 scores from `features`.
# NOTE(review): 601 is presumably 300 (subject vector) + 300 (body vector) + 1
# (forward flag) -- confirm against the vector width of the spaCy model in use.
email[Spam] = ModuleLearner('features', module=nn.Linear(601, 2))
email[Regular] = ModuleLearner('features', module=nn.Linear(601, 2))

### Make the learning model from the updated graph
Here we make an executable version of this graph that is able to trace the dependencies of the sensors and fill the data from the reader to run examples on the declared model.

In [13]:
from regr.program import LearningBasedProgram
from regr.program.model.pytorch import PoiModel
from regr.program.metric import MacroAverageTracker, PRF1Tracker
from regr.program.loss import NBCrossEntropyLoss

# Build the executable program from the graph, with `PoiModel` as the model class,
# `NBCrossEntropyLoss` wrapped in a `MacroAverageTracker` as the loss, and
# `PRF1Tracker` as the metric (the training log reports P, R and F1 per concept).
program = LearningBasedProgram(graph, PoiModel, loss=MacroAverageTracker(NBCrossEntropyLoss()), metric=PRF1Tracker())

In [14]:
# Configure the root logger at INFO level so the program's training and
# testing log records are displayed.
import logging
logging.basicConfig(level=logging.INFO)

In [15]:
# Train on the training reader for 30 epochs using the Adam optimizer.
# NOTE(review): `device='auto'` presumably selects CUDA when available (the
# logged metric tensors report `cuda:0`) -- confirm.
program.train(train_reader, train_epoch_num=30, Optim=torch.optim.Adam, device='auto')


INFO:regr.program.program:Epoch: 0
INFO:regr.program.program:Training:
Epoch 0 Training: 100%|██████████| 10/10 [00:01<00:00,  9.58it/s]
INFO:regr.program.program: - loss:
INFO:regr.program.program:{'spam': tensor(0.7216), 'regular': tensor(0.6997)}
INFO:regr.program.program: - metric:
INFO:regr.program.program:{'spam': {'P': tensor(0.3333, device='cuda:0'), 'R': tensor(0.3000, device='cuda:0'), 'F1': tensor(0.3158, device='cuda:0')}, 'regular': {'P': tensor(0.4444, device='cuda:0'), 'R': tensor(0.4000, device='cuda:0'), 'F1': tensor(0.4211, device='cuda:0')}}
INFO:regr.program.program:Epoch: 1
INFO:regr.program.program:Training:
Epoch 1 Training: 100%|██████████| 10/10 [00:00<00:00, 18.49it/s]
INFO:regr.program.program: - loss:
INFO:regr.program.program:{'spam': tensor(0.6018), 'regular': tensor(0.5835)}
INFO:regr.program.program: - metric:
INFO:regr.program.program:{'spam': {'P': tensor(0.7273, device='cuda:0'), 'R': tensor(0.8000, device='cuda:0'), 'F1': tensor(0.7619, device='cuda:

## Run the graph
Here we use populate to run the graph with the defined data from the reader

In [16]:
# Run every training example through the graph and print the datanode produced for it.
for datanode in program.populate(dataset=train_reader):
    print(datanode)

email 0
email 0
email 0
email 0
email 0
email 0
email 0
email 0
email 0
email 0


In [17]:
def _softmax_as_list(val):
    """Softmax `val` over its last dimension and return it as a plain list."""
    probabilities = torch.tensor(val).softmax(dim=-1)
    return probabilities.detach().cpu().numpy().tolist()


# For every test example: show the raw classifier probabilities, run ILP
# inference on the datanode, then show the inferred value for each concept.
for datanode in program.populate(dataset=test_reader):
    print('datanode:', datanode)
    for heading, target in (('Spam:', Spam), ('Regular:', Regular)):
        print(heading, datanode.getAttribute(target).softmax(-1))
    datanode.inferILPConstrains(fun=_softmax_as_list, epsilon=None)
    for heading, target in (('inference spam:', Spam), ('inference regular:', Regular)):
        print(heading, datanode.getAttribute(target, 'ILP'))

datanode: email 0
Spam: tensor([0.2630, 0.7370], device='cuda:0')
Regular: tensor([0.6415, 0.3585], device='cuda:0')
Log file for ilpOntSolver is in: /VL/space/guoquan/repos/RelationalGraph/tutorials/ilpOntSolver.log
Using license file /home/guoquan/gurobi.lic
INFO:gurobipy:Using license file /home/guoquan/gurobi.lic
Academic license - for non-commercial use only
INFO:gurobipy:Academic license - for non-commercial use only
inference spam: tensor([1.], device='cuda:0')
inference regular: tensor([0.], device='cuda:0')
datanode: email 0
Spam: tensor([0.3724, 0.6276], device='cuda:0')
Regular: tensor([0.6633, 0.3367], device='cuda:0')
inference spam: tensor([1.], device='cuda:0')
inference regular: tensor([0.], device='cuda:0')
datanode: email 0
Spam: tensor([0.4792, 0.5208], device='cuda:0')
Regular: tensor([0.5152, 0.4848], device='cuda:0')
inference spam: tensor([1.], device='cuda:0')
inference regular: tensor([0.], device='cuda:0')
datanode: email 0
Spam: tensor([0.2375, 0.7625], devic