In [2]:

import pandas as pd
print(f"Pandas: {pd.__version__}")
import numpy as np
print(f"Numpy: {np.__version__}")

import tensorflow as tf
print(f"Tensorflow: {tf.__version__}")
from tensorflow import keras
print(f"Keras: {keras.__version__}")
import sklearn
print(f"Sklearn: {sklearn.__version__}")


import spacy
print(f'spaCy: {spacy.__version__}')
from spacy import displacy
import random
from spacy.matcher import PhraseMatcher
import plac
from pathlib import Path


import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="ticks", color_codes=True)


from tqdm.notebook import tqdm
# Register `progress_apply` / `progress_map` on pandas objects.
# `pandas` is a classmethod, so it is called on the class itself:
# `tqdm().pandas()` would first instantiate an empty progress bar that gets
# rendered ("0it [00:00, ?it/s]") and is never closed.
tqdm.pandas()

import collections
import yaml
import pickle


# Load the intents mapping from YAML (printed further down in this cell).
# NOTE(review): FullLoader can build more than plain YAML types; the value
# printed in the recorded output is a plain dict of string lists, so
# `yaml.safe_load` would presumably suffice -- confirm, and never load
# untrusted files with this loader.
with open(r'objects/intents.yml') as file:
    intents = yaml.load(file, Loader=yaml.FullLoader)


# tqdm is already imported and registered with pandas earlier in this cell,
# so the repeated import/registration that used to sit here (and its second
# stray "0it" progress bar) is dropped.
# `display` and `HTML` come from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated and emits a warning.
from IPython.display import display, HTML
    

# Load the labelled training data; the preview printed below shows an
# `Utterance` column (token lists) and an `Intent` column.
# NOTE(review): unpickling can execute arbitrary code -- only read pickle
# files that were produced by a trusted step of this project.
train = pd.read_pickle('objects/train.pkl')

# Plain-text preview of the first rows, followed by the intents mapping.
print(train.head())
print(f'\nintents:\n{intents}')

# Data whose 'Processed Inbound' column is joined into strings in a later cell.
processed = pd.read_pickle('objects/processed.pkl')

Pandas: 1.4.1
Numpy: 1.22.3
Tensorflow: 2.8.0
Keras: 2.8.0
Sklearn: 1.0.2
spaCy: 3.2.4


0it [00:00, ?it/s]

0it [00:00, ?it/s]

  from IPython.core.display import display, HTML


                                           Utterance           Intent
0  [update, phone, io, battery, life, bad, consta...          battery
1                                 [forgot, my, pass]  forgot_password
2                            [payment, not, through]          payment
3                                     [want, update]           update
4                                [need, information]             info

intents:
{'battery': ['battery', 'power'], 'forgot_password': ['password', 'account', 'login'], 'payment': ['credit', 'card', 'payment', 'pay'], 'repair': ['repair', 'fix', 'broken'], 'update': ['update']}


In [3]:


# Keyword vocabulary per entity type. The same mapping is persisted to
# objects/entities.yml right below.
# NOTE(review): 'TV' is the only upper-case keyword -- confirm it is meant to
# be matched with that exact casing.
entities = {
    'hardware': [
        'macbook pro',
        'iphone',
        'iphones',
        'mac',
        'ipad',
        'watch',
        'TV',
        'airpods',
        'macbook',
    ],
    'apps': [
        'app store',
        'garageband',
        'books',
        'calendar',
        'podcasts',
        'notes',
        'icloud',
        'music',
        'messages',
        'facetime',
        'catalina',
        'maverick',
    ],
}

# Write the mapping in block (non-flow) YAML style.
with open('objects/entities.yml', 'w') as outfile:
    yaml.dump(entities, stream=outfile, default_flow_style=False)

In [4]:
# Show the keyword mapping.
# NOTE(review): the recorded output below lacks 'macbook', which the literal
# that defines `entities` contains -- the output looks stale; re-run this cell.
entities

{'hardware': ['macbook pro',
  'iphone',
  'iphones',
  'mac',
  'ipad',
  'watch',
  'TV',
  'airpods'],
 'apps': ['app store',
  'garageband',
  'books',
  'calendar',
  'podcasts',
  'notes',
  'icloud',
  'music',
  'messages',
  'facetime',
  'catalina',
  'maverick']}

In [8]:
def offsetter(lbl, doc, matchitem):
    '''Convert a token-level match into a character-level entity span.

    PhraseMatcher reports matches as ``(match_id, start, end)`` *token*
    indices, whereas the annotations built in this notebook are
    ``(start_char, end_char, label)`` tuples.

    Args:
        lbl: Entity label to attach to the span (e.g. ``'HARDWARE'``).
        doc: The spaCy ``Doc`` in which the match was found.
        matchitem: ``(match_id, start_token, end_token)``; the end index is
            exclusive.

    Returns:
        tuple: ``(start_char, end_char, lbl)`` with an exclusive ``end_char``.
    '''
    span = doc[matchitem[1]:matchitem[2]]
    # Read the offsets spaCy already tracks instead of adding up string
    # lengths and assuming exactly one space before the match: that
    # assumption shifts the span by one character whenever the preceding
    # token has no trailing whitespace (e.g. "(iphone)") or the text
    # contains irregular whitespace.
    return (span.start_char, span.end_char, lbl)


# Sanity check: the match on token 1 ('macbooks') maps to characters 4-12.
# NOTE(review): `nlp` is created in the following cell (In [7]), which was
# executed before this one; a top-to-bottom run presumably fails here until
# the two cells are reordered.
offsetter('HARDWARE', nlp('hmm macbooks are great'),(2271554079456360229, 1, 2))

(4, 12, 'HARDWARE')

In [7]:

# Small English pipeline; its vocab and tokenizer back the PhraseMatcher
# used by spacify_row.
nlp = spacy.load('en_core_web_sm')

# Make sure an NER component exists, adding one only when it is absent.
if not nlp.has_pipe("ner"):
    nlp.add_pipe("ner")


def spacify_row(document, label, entity_keywords):
    '''Annotate one text with the character spans of every keyword match.

    Args:
        document: Raw text to search.
        label: Entity label attached to every match (e.g. ``'HARDWARE'``);
            it is also used as the matcher key.
        entity_keywords: Iterable of keyword phrases to look for. Matching
            uses the PhraseMatcher default, i.e. the exact token text.

    Returns:
        tuple: ``(document, {'entities': [(start_char, end_char, label), ...]})``.
    '''
    matcher = PhraseMatcher(nlp.vocab)

    # Only tokenization is needed to match on token text, so `make_doc`
    # is used instead of running the whole pipeline on every keyword.
    patterns = [nlp.make_doc(keyword) for keyword in entity_keywords]
    if patterns:
        # spaCy v3 signature: add(key, docs). The former
        # `add(label, None, doc)` call is the legacy v2 form.
        matcher.add(label, patterns)

    # Tokenize the text the same way as the patterns; tagger/parser/NER output
    # is never read here, so the rest of the pipeline is skipped.
    nlp_document = nlp.make_doc(document)
    matches = matcher(nlp_document)

    # NOTE(review): overlapping keywords (e.g. 'macbook pro' / 'macbook')
    # presumably yield overlapping spans here; confirm the consumer of these
    # annotations accepts them, or filter them (spacy.util.filter_spans).
    entity_list = [offsetter(label, nlp_document, match) for match in matches]

    return (document, {'entities': entity_list})

In [9]:

# Collapse each row of 'Processed Inbound' into one space-separated string.
string_utterance = processed['Processed Inbound'].progress_apply(" ".join)

# Smoke test on a hand-written sentence.
# NOTE(review): the recorded output tags only 'iphone' although 'macbook' is
# part of the hardware keywords in the `entities` literal -- it looks stale.
spacify_row(
    document='I love my macbook and my iphone',
    label='HARDWARE',
    entity_keywords=entities.get('hardware'),
)

  0%|          | 0/76062 [00:00<?, ?it/s]

('I love my macbook and my iphone', {'entities': [(25, 31, 'HARDWARE')]})

In [10]:
# Tag every utterance with HARDWARE keyword spans (one spacify_row call per row).
entity_train = string_utterance.progress_apply(
    spacify_row,
    label='HARDWARE',
    entity_keywords=entities.get('hardware'),
)

  0%|          | 0/76062 [00:00<?, ?it/s]

In [11]:

# Keep only the utterances in which at least one hardware keyword was found.
hardware_train = [
    (text, annotations)
    for text, annotations in entity_train
    if annotations['entities'] != []
]

print(f'{len(hardware_train)} out of {len(entity_train)} Tweets contain a hardware entity')

# Persist the examples. The context manager closes (and thereby flushes) the
# file; previously the handle was opened and never closed.
with open('objects/hardware_train.pkl', 'wb') as pickle_out:
    pickle.dump(hardware_train, pickle_out)

21155 out of 76062 Tweets contain a hardware entity


In [12]:
# Tag every utterance with APP keyword spans.
# NOTE: this reuses the name `entity_train`, replacing the hardware results
# computed in the earlier cell.
entity_train = string_utterance.progress_apply(spacify_row,
                label = 'APP',
                entity_keywords = entities.get('apps'))

# Keep only the utterances in which at least one app keyword was found.
app_train = [
    (text, annotations)
    for text, annotations in entity_train
    if annotations['entities'] != []
]

# Persist the examples. The context manager closes (and thereby flushes) the
# file; previously the handle was opened and never closed.
with open('objects/app_train.pkl', 'wb') as pickle_out:
    pickle.dump(app_train, pickle_out)

print(f'{len(app_train)} out of {len(entity_train)} Tweets contain an app entity')

  0%|          | 0/76062 [00:00<?, ?it/s]

5147 out of 76062 Tweets contain an app entity


In [13]:
# Preview the first five (text, annotations) pairs of the hardware set.
hardware_train[:5]

[('iphone yes io checked update none available swipe close app several time restart',
  {'entities': [(0, 6, 'HARDWARE')]}),
 ('phone app work thank update iphone ipod',
  {'entities': [(28, 34, 'HARDWARE')]}),
 ('word even mean thing iphone owner update late software drop call history glitch apps randomly opening ihelp',
  {'entities': [(21, 27, 'HARDWARE')]}),
 ('watchos make watch pointless browsing music phone via watch reason buying it useless',
  {'entities': [(13, 18, 'HARDWARE'), (54, 59, 'HARDWARE')]}),
 ('question iphone dy quick charge time day iphone battery faulty could',
  {'entities': [(9, 15, 'HARDWARE'), (41, 47, 'HARDWARE')]})]

In [14]:
# Preview the first five (text, annotations) pairs of the app set.
app_train[:5]

[('show music store phone like want store music phone',
  {'entities': [(5, 10, 'APP'), (39, 44, 'APP')]}),
 ('watchos make watch pointless browsing music phone via watch reason buying it useless',
  {'entities': [(38, 43, 'APP')]}),
 ('update io iphone icloud backup greyed can not turn say last backup never',
  {'entities': [(17, 23, 'APP')]}),
 ('bug calendar app fix upgraded io still crash io',
  {'entities': [(4, 12, 'APP')]}),
 ('do apple music apple podcast regardless save phone stream',
  {'entities': [(9, 14, 'APP')]})]

In [25]:
# Print every (start_char, end_char, label) span stored in the hardware set,
# one per line, in example order.
for ent in (span
            for _, annotations in hardware_train
            for span in annotations.get('entities')):
    print(ent)

(0, 6, 'HARDWARE')
(28, 34, 'HARDWARE')
(21, 27, 'HARDWARE')
(13, 18, 'HARDWARE')
(54, 59, 'HARDWARE')
(9, 15, 'HARDWARE')
(41, 47, 'HARDWARE')
(10, 16, 'HARDWARE')
(20, 26, 'HARDWARE')
(58, 64, 'HARDWARE')
(0, 6, 'HARDWARE')
(41, 47, 'HARDWARE')
(21, 27, 'HARDWARE')
(0, 6, 'HARDWARE')
(55, 61, 'HARDWARE')
(27, 32, 'HARDWARE')
(51, 57, 'HARDWARE')
(13, 18, 'HARDWARE')
(7, 11, 'HARDWARE')
(10, 16, 'HARDWARE')
(20, 26, 'HARDWARE')
(0, 6, 'HARDWARE')
(11, 16, 'HARDWARE')
(18, 24, 'HARDWARE')
(42, 48, 'HARDWARE')
(41, 47, 'HARDWARE')
(29, 35, 'HARDWARE')
(7, 14, 'HARDWARE')
(27, 33, 'HARDWARE')
(39, 45, 'HARDWARE')
(0, 6, 'HARDWARE')
(13, 20, 'HARDWARE')
(7, 13, 'HARDWARE')
(36, 42, 'HARDWARE')
(22, 28, 'HARDWARE')
(9, 15, 'HARDWARE')
(13, 18, 'HARDWARE')
(52, 57, 'HARDWARE')
(29, 35, 'HARDWARE')
(0, 5, 'HARDWARE')
(33, 39, 'HARDWARE')
(13, 19, 'HARDWARE')
(0, 6, 'HARDWARE')
(13, 24, 'HARDWARE')
(10, 16, 'HARDWARE')
(21, 27, 'HARDWARE')
(40, 46, 'HARDWARE')
(34, 40, 'HARDWARE')
(7, 13, 'HA