# Email Spam detection
This tutorial is to show you how to make a very simple learning program that also utilizes gorubi solver to apply constraints on a multiclass classification for two classes `spam` and `regular`

In [1]:
import os,sys,inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
root = os.path.dirname(currentdir)
# print("root Folder Absoloute path: ", root)

import sys
sys.path.append(root)

import logging

logging.basicConfig(level=logging.INFO)

## The Graph
First we define the graph code that defines the domain knowledge for this problem.

In [2]:
from regr.graph import Graph, Concept # importing basic graph classes
from regr.graph.logicalConstrain import orL, andL, notL # importing basic constraint classes

Graph.clear()
Concept.clear()

with Graph('example') as graph:
    email = Concept(name='email')

    Spam = email(name='spam')

    Regular = email(name='regular')

    # The constraint of not having regular and spam together
    orL(andL(notL(Spam), Regular, andL(notL(Regular), Spam)))



Log file for dataNode is in: /home/hfaghihi/Framework/new/DomiKnowS/tutorials/datanode.log


## Data and Data Reader
As our data is located in different text files and in different folders, we have to write a reader class that reads this entries into a list of dictionaries in python. Here we use the default Reader class of the Framework.


In [3]:
import os
from regr.data.reader import RegrReader

class EmailSpamReader(RegrReader):
    def parse_file(self, ):
        folder = self.file
        data_spam = []
        data_ham = []
        for file in [f for f in os.listdir(folder + "/spam") if os.path.isfile(os.path.join(folder + "/spam", f)) and f.endswith('.txt')]:
            with open(folder + "/spam/" + file, "r") as f:
                x = []
                for i in f:
                    x.append(i)
            data_spam.append(x)
        for file in [f for f in os.listdir(folder + "/ham") if os.path.isfile(os.path.join(folder + "/ham", f)) and f.endswith('.txt')]:
            with open(folder + "/ham/" + file, "r") as f:
                x = []
                for i in f:
                    x.append(i)
            data_ham.append(x)
        final_data = []
        for dat in data_spam:
            item = {'subject': dat[0].split(":")[1]}
            index = [i for i, v in enumerate(dat) if v.startswith('- - - - - - - - -')]
            if len(index):
                index = index[0]
                item['body'] = "".join(dat[1:index])
                sub = [(i, v) for i, v in enumerate(dat[index:]) if v.startswith('subject')][0]
                item['forward_subject'] = sub[1].split(":")[1]
                item['forward_body'] = "".join(dat[index + sub[0] + 1:])
            else:
                item['body'] = item['body'] = ("").join(dat[1:])
            item['label'] = "spam"
            final_data.append(item)

        for dat in data_ham:
            item = {'subject': dat[0].split(":")[1]}
            index = [i for i, v in enumerate(dat) if v.startswith('- - - - - - - - -')]
            if len(index):
                index = index[0]
                item['body'] = "".join(dat[1:index])
                sub = [(i, v) for i, v in enumerate(dat[index:]) if v.startswith('subject')][0]
                item['forward_subject'] = sub[1].split(":")[1]
                item['forward_body'] = "".join(dat[index + sub[0] + 1:])
            else:
                item['body'] = item['body'] = ("").join(dat[1:])
            item['label'] = "ham"
            final_data.append(item)
        return final_data

    def getSubjectval(self, item):
        return item['subject']

    def getBodyval(self, item):
        return item['body']

    def getForwardSubjectval(self, item):
        if 'forward_subject' in item:
            return item['forward_subject']
        else:
            return None

    def getForwardBodyval(self, item):
        if 'forward_body' in item:
            return item['forward_body']
        else:
            return None

    def getSpamval(self, item):
        if item['label'] == "spam":
            return [1]
        else:
            return [0]

    def getRegularval(self, item):
        if item['label'] == "ham":
            return [1]
        else:
            return [0]

This class redefines the `parse_file` function to parse data into a list of dictionary and then defines some keywords to be used by `ReaderSensor` later in our program to connect data with our knowledge graph. Next we make an instance of this class on the training samples.

In [4]:
import os

train_reader = EmailSpamReader(file=os.path.join(root, 'examples/Email_Spam/data/train'), type="folder")
test_reader = EmailSpamReader(file=os.path.join(root, 'examples/Email_Spam/data/test'), type="folder")

You can check your very first instance by calling `next` and your reader. 
! Make sure to re-initiate your reader if you do call `next` for test.

In [5]:
print(next(iter(train_reader)))

{'Body': 'hi ,\nwe have a new offer for you . buy cheap viagra through our online store .\n- private online ordering\n- no prescription required\n- world wide shipping\norder your drugs offshore and save over 70 % !\nclick here : http : / / aamedical . net / meds /\nbest regards ,\ndonald cunfingham\nno thanks : http : / / aamedical . net / rm . html', 'ForwardBody': None, 'ForwardSubject': None, 'Regular': [0], 'Spam': [1], 'Subject': ' buy cheap viagra through us .\n'}


## Model Declaration
Now we start to connect the reader output data with our formatted domain knowledge defined in the graph.

In [6]:
from regr.sensor.pytorch.sensors import ReaderSensor

email['subject'] = ReaderSensor(keyword='Subject')
email['body'] = ReaderSensor(keyword="Body")
email['forward_subject'] = ReaderSensor(keyword="ForwardSubject")
email['forward_body'] = ReaderSensor(keyword="ForwardBody")

Next we read the labels for the `spam` and `regular` concepts

In [7]:
email[Spam] = ReaderSensor(keyword='Spam', label=True)
email[Regular] = ReaderSensor(keyword='Regular', label=True)

### Define a new sensor
Here we want to use spacy to define a new sensor which gives us an average glove embedding tensor for a sentence

In [8]:
from regr.sensor.pytorch.sensors import TorchSensor, FunctionalSensor
import spacy
from typing import Any
import torch

class SentenceRepSensor(FunctionalSensor):
    def __init__(self, *pres, **kwarg):
        super().__init__(*pres, **kwarg)
        self.nlp = spacy.load('en_core_web_lg')

    def forward(self, *inputs) -> Any:
        email = self.nlp(inputs[0])
        return torch.from_numpy(email.vector).to(device=self.device).unsqueeze(0)

The input to this sensor would be a sentence. You can find the usage of this sensor in the following sections.

Next, we want to define a new sensor which gives us a tensor indicating whether the email has a forwarded message or not.

In [9]:
def presence_detector(*inputs) -> Any:
    if inputs[0] != None:
        return torch.ones(1,1)
    else:
        return torch.zeros(1,1)

### Connecting new sensors to the graph 
We connect these sensors to the graph to make new properties on the concept `email`. We want to make new representations on the `subject` and `body` of the email and that why those properties are passed as input to the defined sensors.

In [10]:
email['subject_rep'] = SentenceRepSensor('subject')
email['body_rep'] = SentenceRepSensor('body')
email['forward_presence'] = FunctionalSensor('forward_body', forward=presence_detector)

### Preparing input features for the learner
Now we concatenate all the generated features to make a new property on the graph which will provide input for the classifier of `spam` and `regular` concepts.

In [11]:
from regr.sensor.pytorch.sensors import ConcatSensor

email['features'] = FunctionalSensor('subject_rep', 'body_rep', 'forward_presence', forward=lambda *x : torch.cat((x), dim=-1))

### Define the learner
Here we define a learner and connect it to the concepts of `spam` and `regular`. This learner is a simple pytorch module of linear neural network.

In [12]:
from regr.sensor.pytorch.learners import ModuleLearner
from torch import nn

email[Spam] = ModuleLearner('features', module=nn.Linear(601, 2))
email[Regular] = ModuleLearner('features', module=nn.Linear(601, 2))

### Make the learning model from the updated graph
Here we make an executable version of this graph that is able to trace the dependencies of the sensors and fill the data from the reader to run examples on the declared model.

In [13]:
from regr.program import POIProgram, IMLProgram, SolverPOIProgram
from regr.program.model.pytorch import PoiModel
from regr.program.metric import MacroAverageTracker, PRF1Tracker, PRF1Tracker, DatanodeCMMetric
from regr.program.loss import NBCrossEntropyLoss

program = SolverPOIProgram(graph, inferTypes=['ILP', 'local/argmax'], loss=MacroAverageTracker(NBCrossEntropyLoss()), metric={'ILP':PRF1Tracker(DatanodeCMMetric()),'argmax':PRF1Tracker(DatanodeCMMetric('local/argmax'))})


In [14]:
# set logger level to see training and testing logs
import logging
logging.basicConfig(level=logging.INFO)

In [15]:
program.train(train_reader, train_epoch_num=10, Optim=torch.optim.Adam, device='auto')


INFO:regr.program.program:Epoch: 1
INFO:regr.program.program:Training:
Epoch 1 Training:   0%|          | 0/10 [00:00<?, ?it/s]

Log file for ilpOntSolver is in: /home/hfaghihi/Framework/new/DomiKnowS/tutorials/ilpOntSolver.log
Academic license - for non-commercial use only - expires 2021-06-05


INFO:gurobipy.gurobipy:Academic license - for non-commercial use only - expires 2021-06-05


Using license file /home/hfaghihi/gurobi.lic


INFO:gurobipy.gurobipy:Using license file /home/hfaghihi/gurobi.lic
Epoch 1 Training: 100%|██████████| 10/10 [00:00<00:00, 18.98it/s]
INFO:regr.program.program: - loss:
INFO:regr.program.program:{'spam': tensor(0.7322), 'regular': tensor(0.6900)}
INFO:regr.program.program: - metric:
INFO:regr.program.program: - - ILP
INFO:regr.program.program:{'spam': {'P': tensor(0.4444), 'R': tensor(0.8000), 'F1': tensor(0.5714)}, 'regular': {'P': tensor(0.6667), 'R': tensor(0.8000), 'F1': tensor(0.7273)}}
INFO:regr.program.program: - - argmax
INFO:regr.program.program:{'spam': {'P': tensor(0.4444), 'R': tensor(0.8000), 'F1': tensor(0.5714)}, 'regular': {'P': tensor(0.6667), 'R': tensor(0.8000), 'F1': tensor(0.7273)}}
INFO:regr.program.program:Epoch: 2
INFO:regr.program.program:Training:
Epoch 2 Training: 100%|██████████| 10/10 [00:00<00:00, 22.00it/s]
INFO:regr.program.program: - loss:
INFO:regr.program.program:{'spam': tensor(0.6126), 'regular': tensor(0.5771)}
INFO:regr.program.program: - metric:


In [16]:
program.test(test_reader)

INFO:regr.program.program:Testing:
Testing: 100%|██████████| 10/10 [00:00<00:00, 18.38it/s]
INFO:regr.program.program: - loss:
INFO:regr.program.program:{'spam': tensor(0.5414), 'regular': tensor(0.5392)}
INFO:regr.program.program: - metric:
INFO:regr.program.program: - - ILP
INFO:regr.program.program:{'spam': {'P': tensor(0.8333), 'R': tensor(1.), 'F1': tensor(0.9091)}, 'regular': {'P': tensor(0.8000), 'R': tensor(0.8000), 'F1': tensor(0.8000)}}
INFO:regr.program.program: - - argmax
INFO:regr.program.program:{'spam': {'P': tensor(0.8333), 'R': tensor(1.), 'F1': tensor(0.9091)}, 'regular': {'P': tensor(0.8000), 'R': tensor(0.8000), 'F1': tensor(0.8000)}}


## Run the graph
Here we use populate to run the graph with the defined data from the reader

In [17]:
for datanode in program.populate(dataset=test_reader):
    print('datanode:', datanode)
    print('Spam:', datanode.getAttribute(Spam).softmax(-1))
    print('Regular:', datanode.getAttribute(Regular).softmax(-1))
#     datanode.inferILPResults(fun=lambda val: torch.tensor(val).softmax(dim=-1).detach().cpu().numpy().tolist(), epsilon=None)
    print('inference spam:', datanode.getAttribute(Spam, 'ILP'))
    print('inference regular:', datanode.getAttribute(Regular, 'ILP'))

datanode: email 0
Spam: tensor([0.3109, 0.6891], device='cuda:0')
Regular: tensor([0.6572, 0.3428], device='cuda:0')
inference spam: tensor([1.])
inference regular: tensor([0.])
datanode: email 0
Spam: tensor([0.3599, 0.6401], device='cuda:0')
Regular: tensor([0.6933, 0.3067], device='cuda:0')
inference spam: tensor([1.])
inference regular: tensor([0.])
datanode: email 0
Spam: tensor([0.3677, 0.6323], device='cuda:0')
Regular: tensor([0.5686, 0.4314], device='cuda:0')
inference spam: tensor([1.])
inference regular: tensor([0.])
datanode: email 0
Spam: tensor([0.2611, 0.7389], device='cuda:0')
Regular: tensor([0.7135, 0.2865], device='cuda:0')
inference spam: tensor([1.])
inference regular: tensor([0.])
datanode: email 0
Spam: tensor([0.4572, 0.5428], device='cuda:0')
Regular: tensor([0.4918, 0.5082], device='cuda:0')
inference spam: tensor([1.])
inference regular: tensor([1.])
datanode: email 0
Spam: tensor([0.5185, 0.4815], device='cuda:0')
Regular: tensor([0.4732, 0.5268], device='cu