In [3]:
import pandas as pd

import spacy
from spacy import displacy

from spacy.matcher import Matcher
from spacy.matcher import DependencyMatcher
from spacy.tokens import Token

Token.set_extension("ignore", default=False, force=True)

import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

nlp = spacy.load("en_core_web_trf")

# Train Classifier

In [2]:
sentences = pd.read_excel('../data/demo_data_augmented.xlsx')

In [16]:
sentences['Original Text EN'] = sentences['Original Text EN'].astype(str)

In [17]:
dataset = list(sentences[["Original Text EN", "Tags/Kategorie"]].sample(frac=1).itertuples(index=False, name=None))
train_data = dataset[:3000]
dev_data = dataset[3000:3200]
test_data = dataset[3200:3374]
print(f"Total: {len(dataset)} - Train:  {len(train_data)} - Dev: {len(dev_data)} - Test: {len(test_data)}")

Total: 3375 - Train:  3000 - Dev: 200 - Test: 174


In [18]:
labels_test = sentences['Tags/Kategorie'].apply(lambda x : x.split(","))

all_labels = []
for li in labels_test:
    for label in li:
        label = label.strip()
        all_labels.append(label)

all_labels = set(all_labels)
print(all_labels)
all_labels = list(all_labels)
all_labels


{'ERP', 'Python', 'API', 'DWH', 'Zertifikate', 'SQL', 'Kubernetes', 'Machine Learning', 'SAP', 'Grafana', 'CRM', 'HR', 'Allgemeine Definitionen', 'MLFlow', 'DeepL'}


['ERP',
 'Python',
 'API',
 'DWH',
 'Zertifikate',
 'SQL',
 'Kubernetes',
 'Machine Learning',
 'SAP',
 'Grafana',
 'CRM',
 'HR',
 'Allgemeine Definitionen',
 'MLFlow',
 'DeepL']

In [20]:
def convert(data, outfile):
    db = spacy.tokens.DocBin()
    docs = []
    for doc, label in nlp.pipe(data, as_tuples=True):
        for l in all_labels:
            if l in label: 
                doc.cats[l] = 1
            else:
                doc.cats[l] = 0
        db.add(doc)

        print(doc.cats)
    
    db.to_disk(outfile)

In [21]:
convert(train_data, "./train.spacy")
convert(dev_data, "./dev.spacy")
convert(test_data, "./test.spacy")

{'ERP': 0, 'Python': 0, 'API': 0, 'DWH': 0, 'Zertifikate': 0, 'SQL': 0, 'Kubernetes': 1, 'Machine Learning': 0, 'SAP': 0, 'Grafana': 1, 'CRM': 0, 'HR': 0, 'Allgemeine Definitionen': 0, 'MLFlow': 0, 'DeepL': 0}
{'ERP': 0, 'Python': 0, 'API': 0, 'DWH': 0, 'Zertifikate': 0, 'SQL': 0, 'Kubernetes': 0, 'Machine Learning': 0, 'SAP': 0, 'Grafana': 0, 'CRM': 0, 'HR': 1, 'Allgemeine Definitionen': 0, 'MLFlow': 0, 'DeepL': 0}
{'ERP': 0, 'Python': 0, 'API': 0, 'DWH': 0, 'Zertifikate': 1, 'SQL': 0, 'Kubernetes': 0, 'Machine Learning': 0, 'SAP': 0, 'Grafana': 0, 'CRM': 0, 'HR': 0, 'Allgemeine Definitionen': 0, 'MLFlow': 1, 'DeepL': 0}
{'ERP': 0, 'Python': 0, 'API': 0, 'DWH': 1, 'Zertifikate': 0, 'SQL': 1, 'Kubernetes': 0, 'Machine Learning': 0, 'SAP': 0, 'Grafana': 0, 'CRM': 0, 'HR': 0, 'Allgemeine Definitionen': 0, 'MLFlow': 0, 'DeepL': 0}
{'ERP': 0, 'Python': 0, 'API': 0, 'DWH': 0, 'Zertifikate': 0, 'SQL': 0, 'Kubernetes': 0, 'Machine Learning': 0, 'SAP': 1, 'Grafana': 0, 'CRM': 0, 'HR': 1, 'Allg

In [22]:
!python -m spacy init config --lang pt --pipeline textcat_multilabel --optimize efficiency --force config.cfg

ℹ Generated config template specific for your use case
- Language: pt
- Pipeline: textcat_multilabel
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
✔ Auto-filled config with all values
✔ Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [26]:
!python -m spacy train config.cfg --paths.train ./train.spacy  --paths.dev ./dev.spacy --output model --verbose

ℹ Saving to output directory: model
ℹ Using CPU
[1m
✔ Initialized pipeline
[1m
ℹ Pipeline: ['textcat_multilabel']
ℹ Initial learn rate: 0.001
E    #       LOSS TEXTC...  CATS_SCORE  SCORE 
---  ------  -------------  ----------  ------
  0       0           0.25       54.19    0.54
  0     200          12.30       99.94    1.00
  0     400           2.07      100.00    1.00
  0     600           0.64      100.00    1.00
  0     800           0.34      100.00    1.00
  0    1000           0.24      100.00    1.00
  1    1200           0.24      100.00    1.00
  1    1400           0.12      100.00    1.00
  2    1600           0.10      100.00    1.00
  3    1800           0.08      100.00    1.00
  3    2000           0.07      100.00    1.00
✔ Saved pipeline to output directory
model\model-last


[2022-12-07 19:20:39,582] [DEBUG] Config overrides from CLI: ['paths.train', 'paths.dev']
[2022-12-07 19:20:39,992] [INFO] Set up nlp object from config
[2022-12-07 19:20:40,003] [DEBUG] Loading corpus from path: dev.spacy
[2022-12-07 19:20:40,004] [DEBUG] Loading corpus from path: train.spacy
[2022-12-07 19:20:40,004] [INFO] Pipeline: ['textcat_multilabel']
[2022-12-07 19:20:40,007] [INFO] Created vocabulary
[2022-12-07 19:20:40,008] [INFO] Finished initializing nlp object
[2022-12-07 19:20:41,565] [INFO] Initialized pipeline components: ['textcat_multilabel']
[2022-12-07 19:20:41,576] [DEBUG] Loading corpus from path: dev.spacy
[2022-12-07 19:20:41,577] [DEBUG] Loading corpus from path: train.spacy


In [27]:
!python -m spacy evaluate ./model/model-best/ ./test.spacy

ℹ Using CPU
[1m

TOK                   99.90 
TEXTCAT (macro AUC)   100.00
SPEED                 367962

[1m

                               P        R        F
ERP                       100.00   100.00   100.00
Python                    100.00   100.00   100.00
API                       100.00   100.00   100.00
DWH                       100.00   100.00   100.00
Zertifikate               100.00   100.00   100.00
SQL                       100.00   100.00   100.00
Kubernetes                100.00   100.00   100.00
Machine Learning          100.00   100.00   100.00
SAP                       100.00   100.00   100.00
Grafana                   100.00   100.00   100.00
CRM                       100.00   100.00   100.00
HR                        100.00   100.00   100.00
Allgemeine Definitionen   100.00   100.00   100.00
MLFlow                    100.00   100.00   100.00
DeepL                     100.00   100.00   100.00

[1m

                          ROC AUC
ERP                          1.

# Load and Classify

In [24]:
text = "Hi Niklas, Stefan said that you had an Azure DevOps license for your practice deployment at ITFB. What was the process there? Did you write an email to PE-AS? Hi Jonas, yes exactly, I still called Vanessa at that time and asked her for a license, she then passed the request to ITOW (I think that's the name of the group) and they then unlocked me. I think if you write a mail to PE-AS, it should work."

In [11]:
nlp = spacy.load('./model/model-best/')

In [25]:
doc = nlp(text)

In [32]:
cats = doc.cats
dl_predicted_tags = list(dict(filter(lambda x: x[1] > 0.5, cats.items())).keys())
cats

['HR']