# Email Spam detection
This tutorial shows you how to build a very simple learning program that also uses the Gurobi solver to apply constraints to a multiclass classification problem with two classes, `spam` and `regular`.

In [None]:
# Clone the DomiKnowS repository, move into it and check out `origin/Tasks`.
# NOTE(review): the relative data paths used later ('Email_Spam/data/...') presumably
# live in this checkout — confirm.
!git clone https://github.com/HLR/DomiKnowS.git
%cd DomiKnowS 
!git checkout origin/Tasks
# Install the DomiKnowS package.
!pip install DomiKnowS

# Show INFO-level log messages in the notebook output.
import logging
logging.basicConfig(level=logging.INFO)

# Give the interactive `__main__` module a `__file__` attribute.
# NOTE(review): presumably the framework reads `__main__.__file__`, which a notebook
# session does not define — confirm why this specific name is used.
import __main__
__main__.__file__="sentimentAnalysis.py"

In [None]:
import spacy.cli
# Download the spaCy model package that is imported as `en_core_web_lg` further below.
spacy.cli.download("en_core_web_lg")

✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')


## The Graph
First we define the graph code that defines the domain knowledge for this problem.

In [None]:
import os,sys,inspect

from domiknows.graph import Graph, Concept # importing basic graph classes
from domiknows.graph.logicalConstrain import orL, andL, notL # importing basic constraint classes

# Reset any graph/concept state left over from a previous run of this cell.
Graph.clear()
Concept.clear()

with Graph('example') as graph:
    # Base concept: every data item handled by this tutorial is an email.
    email = Concept(name='email')

    # Two concepts created from `email`, one per class label.
    Spam = email(name='spam')

    Regular = email(name='regular')

    # The constraint of not having regular and spam together: exactly one of
    # the two labels must hold, i.e.
    #   (not spam AND regular) OR (not regular AND spam).
    # Each alternative is its own `andL`, and both are operands of `orL`.
    orL(andL(notL(Spam), Regular), andL(notL(Regular), Spam))

## Data and Data Reader
As our data is located in different text files and in different folders, we have to write a reader class that reads these entries into a list of Python dictionaries. Here we build on the default reader class of the framework (`RegrReader`).


In [None]:
import os
from domiknows.data.reader import RegrReader

class EmailSpamReader(RegrReader):
    """Reader for a folder containing ``spam`` and ``ham`` sub-folders of ``.txt`` emails.

    ``parse_file`` turns every text file into a dictionary with the keys
    ``subject``, ``body``, ``label`` and, when the email contains a forwarded
    message, ``forward_subject`` / ``forward_body``.

    The ``get<Name>val`` methods expose single values of such a dictionary.
    NOTE(review): the base ``RegrReader`` presumably discovers these methods by
    name and publishes their results under ``<Name>`` (the tutorial output shows
    a ``'Body'`` key) — keep their names unchanged.
    """

    # Prefix of the separator line that introduces a forwarded message.
    _FORWARD_MARKER = '- - - - - - - - -'

    def parse_file(self):
        """Parse all emails below ``self.file`` into a list of dictionaries.

        Returns:
            A list with one dictionary per email: all spam emails first
            (``label == "spam"``), followed by all ham emails
            (``label == "ham"``).
        """
        folder = self.file
        final_data = []
        # Spam is processed before ham, as in the original two separate loops.
        for label in ("spam", "ham"):
            for lines in self._read_emails(os.path.join(folder, label)):
                item = self._parse_email(lines)
                item['label'] = label
                final_data.append(item)
        return final_data

    @staticmethod
    def _read_emails(directory):
        """Return, for each ``.txt`` file directly inside *directory*, its list of lines."""
        emails = []
        # `os.listdir` returns names in arbitrary order; sorting keeps the
        # order of the examples reproducible between runs and machines.
        for name in sorted(os.listdir(directory)):
            path = os.path.join(directory, name)
            if os.path.isfile(path) and name.endswith('.txt'):
                with open(path, "r") as handle:
                    emails.append(handle.readlines())
        return emails

    @classmethod
    def _parse_email(cls, lines):
        """Split the lines of one email into subject, body and optional forwarded parts.

        The first line is expected to be the ``subject: ...`` header. Everything
        after its *first* colon is kept, so subjects that contain a colon
        themselves (e.g. ``re : ...``) are not cut short.

        Raises:
            IndexError: if *lines* is empty or a subject header has no colon.
        """
        item = {'subject': lines[0].split(":", 1)[1]}

        marker_at = next(
            (i for i, line in enumerate(lines) if line.startswith(cls._FORWARD_MARKER)),
            None,
        )
        if marker_at is None:
            # No forwarded message: the whole remainder is the body.
            item['body'] = "".join(lines[1:])
            return item

        item['body'] = "".join(lines[1:marker_at])
        forwarded = lines[marker_at:]
        subject_at = next(
            (i for i, line in enumerate(forwarded) if line.startswith('subject')),
            None,
        )
        if subject_at is None:
            # Forwarded part without its own subject line: keep the text after
            # the marker as the forwarded body instead of failing.
            item['forward_body'] = "".join(forwarded[1:])
        else:
            item['forward_subject'] = forwarded[subject_at].split(":", 1)[1]
            item['forward_body'] = "".join(forwarded[subject_at + 1:])
        return item

    def getSubjectval(self, item):
        """Return the subject text of *item*."""
        return item['subject']

    def getBodyval(self, item):
        """Return the body text of *item*."""
        return item['body']

    def getForwardSubjectval(self, item):
        """Return the forwarded message's subject, or ``None`` if there is none."""
        return item.get('forward_subject')

    def getForwardBodyval(self, item):
        """Return the forwarded message's body, or ``None`` if there is none."""
        return item.get('forward_body')

    def getSpamval(self, item):
        """Return ``[1]`` when *item* is labelled ``"spam"``, otherwise ``[0]``."""
        return [1] if item['label'] == "spam" else [0]

    def getRegularval(self, item):
        """Return ``[1]`` when *item* is labelled ``"ham"``, otherwise ``[0]``."""
        return [1] if item['label'] == "ham" else [0]

This class redefines the `parse_file` function to parse the data into a list of dictionaries and then defines some keywords to be used by `ReaderSensor` later in our program to connect the data with our knowledge graph. Next we create instances of this class for the training and test samples.

In [None]:
import os

# One reader per data split, each pointed at the folder holding that split's emails.
train_reader = EmailSpamReader(file='Email_Spam/data/train', type="folder")
test_reader = EmailSpamReader(file='Email_Spam/data/test', type="folder")

You can check your very first instance by calling `next` on an iterator over your reader.
**Note:** Make sure to re-initialize your reader if you call `next` on the test reader.

In [None]:
# Take the first item produced by iterating over the training reader and print it.
print(next(iter(train_reader)))

{'Body': "i truely solicite your assistance for a business proposal .\ndear friend ,\nit is my absolute confidence to ensure this urgent and important businees proposal with you . i am greatly optimistic to forward you this note , as regards your assistance to enable me execute a venture of mutual benefit with you .\nmy name is mr daniel mutade , a senior employee with the central bank of zimbabwe . during the last national election held by president robert mugabe , i and my colleagues worked out twenty million united states dollars ( us $ 20 m ) as over - invoiced and inflated payment for election materials and the fund is now deposited with a security company in europe for safe keeping . we are not sure of the future of our country zimbabwe , due to the cry of more sanctions by world leaders in and around the world as a result of the brutal take over of white farmers land and residents in zimbabwe by the mugabe ' s administration .\nmy colleagues and i are now seeking to secure and i

## Model Declaration
Now we start to connect the reader output data with our formatted domain knowledge defined in the graph.

In [None]:
from domiknows.sensor.pytorch.sensors import ReaderSensor

# Attach one reader-backed property per text field to the `email` concept.
# Each keyword is a key of the reader's output dictionary (e.g. 'Body').
email['subject'] = ReaderSensor(keyword='Subject')
email['body'] = ReaderSensor(keyword="Body")
email['forward_subject'] = ReaderSensor(keyword="ForwardSubject")
email['forward_body'] = ReaderSensor(keyword="ForwardBody")

Next we read the labels for the `spam` and `regular` concepts

In [None]:
# `label=True` marks these reader values as the labels of the two concepts.
email[Spam] = ReaderSensor(keyword='Spam', label=True)
email[Regular] = ReaderSensor(keyword='Regular', label=True)

### Define a new sensor
Here we want to use spaCy to define a new sensor which gives us an average GloVe embedding tensor for a sentence.

In [None]:
from domiknows.sensor.pytorch.sensors import TorchSensor, FunctionalSensor
import spacy
from typing import Any
import torch
import en_core_web_lg


class SentenceRepSensor(FunctionalSensor):
    """Functional sensor that maps a piece of text to one spaCy vector."""

    def __init__(self, *pres, **kwarg):
        """Pass all arguments on to ``FunctionalSensor`` and load the spaCy pipeline."""
        super().__init__(*pres, **kwarg)
        # Every sensor instance loads and keeps its own pipeline object.
        self.nlp = en_core_web_lg.load()

    def forward(self, *inputs) -> Any:
        """Return the vector of the first input as a tensor with a leading axis of size 1.

        Only ``inputs[0]`` is used; it is run through ``self.nlp`` and the
        resulting ``.vector`` is moved to ``self.device``.
        """
        text = inputs[0]
        doc = self.nlp(text)
        vector = torch.from_numpy(doc.vector)
        on_device = vector.to(device=self.device)
        return on_device.unsqueeze(0)

The input to this sensor would be a sentence. You can find the usage of this sensor in the following sections.

Next, we want to define a new sensor which gives us a tensor indicating whether the email has a forwarded message or not.

In [None]:
def presence_detector(*inputs) -> Any:
    """Return a ``(1, 1)`` tensor flagging whether the first input is present.

    Args:
        *inputs: Sensor inputs; only ``inputs[0]`` is inspected.

    Returns:
        ``torch.ones(1, 1)`` if ``inputs[0]`` is not ``None``, otherwise
        ``torch.zeros(1, 1)``.
    """
    # Identity check on purpose: `!= None` would call the input's own `__ne__`,
    # whose result depends on the input's type rather than on its presence.
    if inputs[0] is not None:
        return torch.ones(1, 1)
    return torch.zeros(1, 1)

### Connecting new sensors to the graph 
We connect these sensors to the graph to create new properties on the concept `email`. We want to build new representations of the `subject` and `body` of the email, and that is why those properties are passed as input to the defined sensors.

In [None]:
# Derived properties: a vector representation of the subject and of the body,
# plus a flag computed from `forward_body` by `presence_detector`.
email['subject_rep'] = SentenceRepSensor('subject')
email['body_rep'] = SentenceRepSensor('body')
email['forward_presence'] = FunctionalSensor('forward_body', forward=presence_detector)

### Preparing input features for the learner
Now we concatenate all the generated features to make a new property on the graph which will provide input for the classifier of `spam` and `regular` concepts.

In [None]:
# Concatenate the three input properties along the last dimension into one feature tensor.
# NOTE(review): the learners below expect 601 features — presumably 300 + 300 + 1; confirm.
email['features'] = FunctionalSensor('subject_rep', 'body_rep', 'forward_presence', forward=lambda *x : torch.cat((x), dim=-1))

### Define the learner
Here we define a learner and connect it to the concepts `spam` and `regular`. This learner is a simple PyTorch module: a single linear layer.

In [None]:
from domiknows.sensor.pytorch.learners import ModuleLearner
from torch import nn

# One independent linear layer per concept, both reading the `features` property.
# Input size 601 must match the width of `features`; each layer outputs 2 scores.
# These are assigned to the same properties that already received the label sensors.
email[Spam] = ModuleLearner('features', module=nn.Linear(601, 2))
email[Regular] = ModuleLearner('features', module=nn.Linear(601, 2))

### Make the learning model from the updated graph
Here we make an executable version of this graph that is able to trace the dependencies of the sensors and fill the data from the reader to run examples on the declared model.

In [None]:
from domiknows.program import POIProgram, IMLProgram, SolverPOIProgram
from domiknows.program.model.pytorch import PoiModel
from domiknows.program.metric import MacroAverageTracker, PRF1Tracker, PRF1Tracker, DatanodeCMMetric
from domiknows.program.loss import NBCrossEntropyLoss

# Build the executable program from the graph: it runs both ILP and local
# argmax inference and tracks a separate PRF1 metric for each of them.
program = SolverPOIProgram(
    graph,
    inferTypes=['ILP', 'local/argmax'],
    loss=MacroAverageTracker(NBCrossEntropyLoss()),
    metric={
        'ILP': PRF1Tracker(DatanodeCMMetric()),
        'argmax': PRF1Tracker(DatanodeCMMetric('local/argmax')),
    },
)


In [None]:
# Set the logger level to see training and testing logs.
# NOTE: `logging.basicConfig` does nothing when the root logger already has handlers,
# so this call only takes effect if logging was not configured earlier in the session.
import logging
logging.basicConfig(level=logging.INFO)

In [None]:
# Train for 10 epochs on the training reader with the Adam optimizer, then evaluate on the test reader.
# NOTE(review): `device='auto'` presumably lets the framework choose the device — confirm.
program.train(train_reader, train_epoch_num=10, Optim=torch.optim.Adam, device='auto')
program.test(test_reader)

Epoch 1 Training:  10%|█         | 1/10 [00:00<00:04,  2.00it/s]

Log file for ilpOntSolver is in: /content/DomiKnowS/logs/ilpOntSolver.log
Log file for ilpOntSolverTime is in: /content/DomiKnowS/logs/ilpOntSolver.log


Epoch 1 Training: 100%|██████████| 10/10 [00:01<00:00,  5.58it/s]
Epoch 2 Training: 100%|██████████| 10/10 [00:01<00:00,  7.02it/s]
Epoch 3 Training: 100%|██████████| 10/10 [00:00<00:00, 12.10it/s]
Epoch 4 Training: 100%|██████████| 10/10 [00:00<00:00, 12.17it/s]
Epoch 5 Training: 100%|██████████| 10/10 [00:00<00:00, 12.41it/s]
Epoch 6 Training: 100%|██████████| 10/10 [00:00<00:00, 12.16it/s]
Epoch 7 Training: 100%|██████████| 10/10 [00:00<00:00, 12.53it/s]
Epoch 8 Training: 100%|██████████| 10/10 [00:00<00:00, 12.28it/s]
Epoch 9 Training: 100%|██████████| 10/10 [00:00<00:00, 12.63it/s]
Epoch 10 Training: 100%|██████████| 10/10 [00:00<00:00, 12.43it/s]
Testing: 100%|██████████| 10/10 [00:00<00:00, 10.03it/s]


## Run the graph
Here we use populate to run the graph with the defined data from the reader

In [None]:
# Iterate over the datanodes that `populate` yields for the test reader.
for datanode in program.populate(dataset=test_reader):
    print('datanode:', datanode)
    # Values stored for each concept, normalized with a softmax over the last dimension.
    print('Spam:', datanode.getAttribute(Spam).softmax(-1))
    print('Regular:', datanode.getAttribute(Regular).softmax(-1))
#     datanode.inferILPResults(fun=lambda val: torch.tensor(val).softmax(dim=-1).detach().cpu().numpy().tolist(), epsilon=None)
    # Values stored under the 'ILP' key for each concept
    # (presumably the result of the constrained ILP inference — confirm).
    print('inference spam:', datanode.getAttribute(Spam, 'ILP'))
    print('inference regular:', datanode.getAttribute(Regular, 'ILP'))

2it [00:00, 17.00it/s]

datanode: email 0
Spam: tensor([0.0329, 0.9671])
Regular: tensor([0.8905, 0.1095])
inference spam: tensor([1.])
inference regular: tensor([0.])
datanode: email 0
Spam: tensor([0.3142, 0.6858])
Regular: tensor([0.4669, 0.5331])
inference spam: tensor([1.])
inference regular: tensor([1.])
datanode: email 0
Spam: tensor([0.4158, 0.5842])
Regular: tensor([0.4903, 0.5097])
inference spam: tensor([1.])
inference regular: tensor([1.])


6it [00:00, 11.47it/s]

datanode: email 0
Spam: tensor([0.3978, 0.6022])
Regular: tensor([0.9004, 0.0996])
inference spam: tensor([1.])
inference regular: tensor([0.])
datanode: email 0
Spam: tensor([0.0504, 0.9496])
Regular: tensor([0.9122, 0.0878])
inference spam: tensor([1.])
inference regular: tensor([0.])
datanode: email 0
Spam: tensor([0.4070, 0.5930])
Regular: tensor([0.4993, 0.5007])
inference spam: tensor([1.])
inference regular: tensor([1.])
datanode: email 0
Spam: tensor([0.3122, 0.6878])
Regular: tensor([0.8245, 0.1755])
inference spam: tensor([1.])
inference regular: tensor([0.])


10it [00:00, 13.26it/s]

datanode: email 0
Spam: tensor([0.7905, 0.2095])
Regular: tensor([0.2728, 0.7272])
inference spam: tensor([0.])
inference regular: tensor([1.])
datanode: email 0
Spam: tensor([0.3657, 0.6343])
Regular: tensor([0.8031, 0.1969])
inference spam: tensor([1.])
inference regular: tensor([0.])
datanode: email 0
Spam: tensor([0.9054, 0.0946])
Regular: tensor([0.0896, 0.9104])
inference spam: tensor([0.])
inference regular: tensor([1.])



