In [16]:
import json

import pandas as pd
import numpy as np
import tensorflow as tf
import coremltools as ct
import keras
import regex as re

from keras.models import Sequential
from keras.layers import Dense, TextVectorization, Dropout, Flatten, Embedding, Conv1D, MaxPool1D, GlobalMaxPool1D
from keras.utils import plot_model
from tensorflow.keras import losses

from sklearn.model_selection import train_test_split
print("tf", tf.__version__)
print("keras", keras.__version__)
print("ct", ct.__version__)
print("np", np.__version__)

tf 2.14.1
keras 2.14.0
ct 7.2
np 1.26.4


In [2]:
labels = {
    'addStory': 0,
    'askQuestion': 1,
    'searchYourContacts': 2,
    'unknown': 3,
    'searchYourStories': 4
}

original = pd.read_csv('./data/preprocessed.csv')
#generated = pd.read_csv('./data/testing_dataset_10000_samples.csv')
#original['label'] = original['type'].map(labels)

In [3]:
original.head()

Unnamed: 0,prompt,label
0,How do I get to the nearest gas station?,1
1,Add a note about the time I visited Seattle.,0
2,Add a journal about the time I visited Seattle.,0
3,Add a note about the time I visited Chicago.,0
4,I need to note that I finished our job today.,0


In [4]:
def vectorize_labels(labels, dimension=5):
    results = np.zeros((len(labels), dimension))
    for i, label in enumerate(labels):
        results[i, label] = 1

    return results

text_train, text_test, y_train, y_test = train_test_split(original['prompt'].tolist(), original['label'].tolist(), test_size=0.05, random_state=42)
y_train = vectorize_labels(y_train)
y_test = vectorize_labels(y_test)

print(text_train)
print(y_train)

['Give me the latest tech trends in a detailed manner.', "Please share Give me meeting details with Lucky Cement' event", 'Search this contacts for Emma Jamesson.', 'A car discovered a historical figure in a dream. Argentina is located in the southern part of South America.', 'Give me the latest tech trends with references.', 'Why do the population of Tokyo in a humorous way.', 'A stranger noticed an old friend at the top of a mountain. The Nile is the longest river in the world.', 'Add a record about the time I visited Miami.', 'A neighbor learned an old friend in a dream. Argentina is located in the southern part of South America.', 'A dog had lunch with the meaning of life in a bar. The Great Wall of China is visible from space.', 'A neighbor had lunch with a mysterious artifact in an ancient temple. Cars can go both slow and fast.', 'A stranger visited a rare animal at the local library. Cars can go both slow and fast.', 'Write a joke using analogies.', 'Tell me details about photo

In [5]:
MAX_SEQUENCE_LENGTH = 300

word_list = []
for prompt in text_train:
    word_parts = prompt.split()
    for word in word_parts:
        if word.__contains__("?"):
            if "?" not in word_list:
                word_list.append("?")

        part = re.sub(r'[^\w\s\d]', '', word.lower())
        if part.isnumeric() or "".__eq__(part):
            continue

        if part not in word_list:
            word_list.append(part)

print(word_list)

['give', 'me', 'the', 'latest', 'tech', 'trends', 'in', 'a', 'detailed', 'manner', 'please', 'share', 'meeting', 'details', 'with', 'lucky', 'cement', 'event', 'search', 'this', 'contacts', 'for', 'emma', 'jamesson', 'car', 'discovered', 'historical', 'figure', 'dream', 'argentina', 'is', 'located', 'southern', 'part', 'of', 'south', 'america', 'references', 'why', 'do', 'population', 'tokyo', 'humorous', 'way', 'stranger', 'noticed', 'an', 'old', 'friend', 'at', 'top', 'mountain', 'nile', 'longest', 'river', 'world', 'add', 'record', 'about', 'time', 'i', 'visited', 'miami', 'neighbor', 'learned', 'dog', 'had', 'lunch', 'meaning', 'life', 'bar', 'great', 'wall', 'china', 'visible', 'from', 'space', 'mysterious', 'artifact', 'ancient', 'temple', 'cars', 'can', 'go', 'both', 'slow', 'and', 'fast', 'rare', 'animal', 'local', 'library', 'write', 'joke', 'using', 'analogies', 'tell', 'photosynthesis', 'without', 'jargon', 'scientist', 'secret', 'recipe', 'amazon', 'largest', 'rainforest', 

In [6]:
word_dict = dict(zip(word_list, range(len(word_list))))
print(word_dict)

with open('dictData.json', 'w') as f:
    jsonDictData = list([{
        "key": int(idx),
        "word": x
    } for idx, x in enumerate(word_dict)])



    json.dump(jsonDictData, f)

{'give': 0, 'me': 1, 'the': 2, 'latest': 3, 'tech': 4, 'trends': 5, 'in': 6, 'a': 7, 'detailed': 8, 'manner': 9, 'please': 10, 'share': 11, 'meeting': 12, 'details': 13, 'with': 14, 'lucky': 15, 'cement': 16, 'event': 17, 'search': 18, 'this': 19, 'contacts': 20, 'for': 21, 'emma': 22, 'jamesson': 23, 'car': 24, 'discovered': 25, 'historical': 26, 'figure': 27, 'dream': 28, 'argentina': 29, 'is': 30, 'located': 31, 'southern': 32, 'part': 33, 'of': 34, 'south': 35, 'america': 36, 'references': 37, 'why': 38, 'do': 39, 'population': 40, 'tokyo': 41, 'humorous': 42, 'way': 43, 'stranger': 44, 'noticed': 45, 'an': 46, 'old': 47, 'friend': 48, 'at': 49, 'top': 50, 'mountain': 51, 'nile': 52, 'longest': 53, 'river': 54, 'world': 55, 'add': 56, 'record': 57, 'about': 58, 'time': 59, 'i': 60, 'visited': 61, 'miami': 62, 'neighbor': 63, 'learned': 64, 'dog': 65, 'had': 66, 'lunch': 67, 'meaning': 68, 'life': 69, 'bar': 70, 'great': 71, 'wall': 72, 'china': 73, 'visible': 74, 'from': 75, 'space

In [7]:
def vectorize_sample(sample):
    vector = np.zeros(MAX_SEQUENCE_LENGTH, dtype=np.int32)
    current_word_parts = sample.split()

    for (idx, current_part) in enumerate(current_word_parts):
        has_question_mark = False
        
        if idx == MAX_SEQUENCE_LENGTH:
            break
        
        if current_part.__contains__("?"):
            has_question_mark = True

        current_part = re.sub(r'[^\w\s\d]', '', current_part.lower())
        if current_part.isnumeric() or "".__eq__(current_part):
            continue

        if current_part in word_dict:
            word_index = word_dict[current_part]
            vector[idx] = word_index
            if has_question_mark:
                word_index = word_dict["?"]
                vector[idx + 1] = word_index
        else:
            vector[idx] = 0

    return vector

In [8]:
x_test = list(map(vectorize_sample, text_test))
x_train = list(map(vectorize_sample, text_train))

In [9]:
x_test = tf.convert_to_tensor(x_test)
x_train = tf.convert_to_tensor(x_train)

In [10]:
print(len(x_train), x_train.shape)
print(len(y_train))
print(len(x_test), x_train.shape)
print(len(y_test))

8736 (8736, 300)
8736
460 (8736, 300)
460


In [11]:
print(x_train[1])

tf.Tensor(
[10 11  0  1 12 13 14 15 16 17  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0], shape=(300,), dtype=int32)


In [86]:
model = Sequential([
    Embedding(input_dim=len(word_dict) + 1, output_dim=2048, input_length=MAX_SEQUENCE_LENGTH, mask_zero=True),
    Dropout(0.2),
    Conv1D(1024, 3, padding='same', activation='relu'),
    GlobalMaxPool1D(),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(5, activation='softmax')
])

model.compile(
    loss=losses.CategoricalCrossentropy(),
    optimizer='adam',
    metrics=['accuracy'])

model_history = model.fit(
    x=x_train,
    y=y_train,
    epochs=1,
    validation_data=(
        x_test, y_test
    )
)



In [20]:
print(vectorize_sample('Who was present at the product launch in New York City two weeks ago?'))

[ 130  292  372   49    2  636  260    6  163  669  177  524 2548 1094
  133    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

In [87]:
labels_inv = dict((v,k) for k,v in labels.items())

all_strings = [
    "Who should I follow up with after the conference last month?",
    "What introductions should I make between my contacts to add value?",
    "Who do I know at XYZ company that I can reach out to?",
    "Who are the key players in my network who specialize in AI?",
    "Who did I meet with at the last tech conference?",
    "What notes do I have from my last meeting with the Board of Directors?",
    "Who in my network is looking for investment opportunities in healthcare?",
    "Who is currently raising capital for a startup in my network?",
    "Who are the emerging leaders in my contacts list that I should mentor?",
    "Who in my network recently changed jobs or got promoted?",
    "Who are the most influential people I met at the latest conference?",
    "Who can introduce me to the CEO of ABC Corporation?",
    "Who in my network has experience with international expansions?",
    "Who should I connect with from my LinkedIn network to expand my influence in Asia?",
    "Who did I discuss the future of renewable energy with at the last seminar?",
    "Who has expertise in digital marketing that could help with our new campaign?",
    "Who are the potential partners in my network for our new project?",
    "Who in my contacts list has a background in cybersecurity?",
    "Who can provide insights into the latest trends in fintech?",
    "Who did I last discuss M&A opportunities with?",
    "Who in my network has connections to venture capital firms?",
    "Who do I need to send a follow-up email to after our last meeting?",
    "Who mentioned they were looking for a job in data science recently?",
    "Who should I introduce to our new COO to explore synergies?",
    "Who has a background in biotechnology within my network?",
    "Who are the people I should thank for their help with the recent product launch?",
    "Who did I have a conversation with about blockchain at the startup event?",
    "Who has been a valuable mentor to me from my contacts?",
    "Who do I know who specializes in leadership coaching?",
    "Who did I meet with in Toledo, Ohio, last year?",
    "Give me a bulleted list of important notes from the convention at McCormick Center in Chicago last month.",
    "Who was at the meeting in London last July about the new investment strategy?",
    "What events have I attended at the Westin Hotel in Boston?",
    "Who was present at the product launch in New York City two weeks ago?",
    "Who joined the breakfast meeting at Café Milano in D.C. last month?",
    "What were the key takeaways from the workshop at the Innovation Center in San Francisco last week?",
    "Who attended the annual fundraiser at the Ritz Carlton in November?",
    "Who did I meet with at the Marriott in San Diego earlier this year?",
    "Where did I have lunch meetings with prospective clients last month?",
    "What meetings did I have in Austin, Texas, during the SXSW festival?",
    "Where was the strategy session held with the marketing team last quarter?",
    "What events did I attend at the Hilton in Los Angeles last year?",
    "Who participated in the brainstorming session at the downtown office last Tuesday?",
    "Where was the panel discussion I spoke at during the tech conference in Las Vegas?",
    "What workshops did I attend at the convention center last month?",
    "Who attended the charity event at the Grand Hotel in December?",
    "Where did I meet with the legal team to discuss the merger?",
    "What was the location of the product demo I attended in March?",
    "Who did I have coffee with in the lobby of the Four Seasons last week?",
    "What events have I attended at the headquarters of XYZ Company?",
    "Where did I hold the annual strategy retreat last year?",
    "Who attended the networking event at the downtown club last Friday?",
    "Where was the last off-site meeting with the executive team held?",
    "What key meetings did I have at the client’s office last quarter?",
    "Where was the investor pitch event held last September?",
    "Who joined the dinner meeting at the Italian restaurant on Main Street last month?",
    "Where did I have a meeting about the new CRM tool last Tuesday?",
    "What client meetings have I had in San Francisco this year?",
    "Met with Jane Doe from ABC Corp to discuss potential partnership opportunities.",
    "Had lunch with John Smith who is looking to raise money for his fintech startup.",
    "Spoke with Lisa Ray about her new AI startup and potential synergies.",
    "Attended a workshop led by Mike Johnson on digital marketing trends.",
    "Discussed M&A strategies with Sarah Lee at the networking event.",
    "Met Carlos at a tech conference; he's a great contact for blockchain technology.",
    "Jane Doe mentioned she is interested in expanding her business internationally.",
    "Noted that John Smith has a wealth of experience in AI and machine learning.",
    "Attended the annual fundraiser where I reconnected with several key contacts.",
    "Had a brainstorming session with the team about our new product launch strategy.",
    "John mentioned he knows someone looking to invest in renewable energy projects.",
    "Met a venture capitalist, Susan Brown, who specializes in early-stage startups.",
    "Discussed potential mentorship opportunities with a rising star in our industry.",
    "Attended a roundtable on fintech innovations, met several key players.",
    "Met with the legal team to discuss the upcoming merger; important points to note.",
    "Dinner with CEO of XYZ Corporation at the Italian restaurant on Main Street; discussed partnership possibilities.",
    "Noted Jane's expertise in digital marketing could benefit our new campaign.",
    "Met with Carlos at the Netshets Lounge in Scotland who owns a distillery; discussed potential distribution.",
    "Discussed the future of AI in healthcare with Sarah Lee.",
    "Had coffee with John Smith in the lobby of the Four Seasons; talked about potential collaborations.",
    "Discussed cybersecurity trends with a contact from DEF Corp; potential speaker for our next conference.",
    "Attended a panel discussion at the tech conference in Las Vegas; met several influential people.",
    "Attended a workshop on leadership skills at the convention center; learned valuable tips.",
    "Met with the marketing team for a strategy session last quarter; key takeaways included.",
    "Attended an investor pitch event last September; met potential investors interested in our space.",
    "Lunch meeting with prospective clients last month; important follow-up needed.",
    "Discussed expansion plans with Lisa Ray who recently joined DEF Corp.",
    "Met with a potential partner, Michael Green, who has expertise in digital transformation.",
    "Talked to John about his new project in digital health; might be a good fit for our new initiative.",
    "Attended a charity event at the Grand Hotel; made valuable connections for our upcoming fundraising campaign.",
    "Discussed blockchain applications with Carlos, who owns the dune buggy place in the Dominican Republic; potential for collaboration on new projects."
]


samples_for_testing = tf.convert_to_tensor(list(map(vectorize_sample, all_strings)))
results = model.predict(samples_for_testing)

for idx, x in enumerate(results):
    label = x.argmax()
    print(all_strings[idx], ' | ', labels_inv[label])

Who should I follow up with after the conference last month?  |  askQuestion
What introductions should I make between my contacts to add value?  |  searchYourContacts
Who do I know at XYZ company that I can reach out to?  |  askQuestion
Who are the key players in my network who specialize in AI?  |  askQuestion
Who did I meet with at the last tech conference?  |  askQuestion
What notes do I have from my last meeting with the Board of Directors?  |  askQuestion
Who in my network is looking for investment opportunities in healthcare?  |  askQuestion
Who is currently raising capital for a startup in my network?  |  askQuestion
Who are the emerging leaders in my contacts list that I should mentor?  |  searchYourContacts
Who in my network recently changed jobs or got promoted?  |  askQuestion
Who are the most influential people I met at the latest conference?  |  askQuestion
Who can introduce me to the CEO of ABC Corporation?  |  askQuestion
Who in my network has experience with internation

In [88]:
input_shape = ct.Shape(shape=(1,300))
output_shape = ct.Shape(shape=(1, 5))

input_type = ct.TensorType(shape=input_shape, dtype=np.int32)
output_type = ct.TensorType(dtype=np.float32)

mlmodel = ct.convert(model, convert_to="mlprogram", source="tensorflow", inputs=[input_type], outputs=[output_type])

Running TensorFlow Graph Passes: 100%|██████████| 6/6 [00:00<00:00, 28.37 passes/s]
Converting TF Frontend ==> MIL Ops: 100%|██████████| 31/31 [00:00<00:00, 6136.07 ops/s]
Running MIL frontend_tensorflow2 pipeline: 100%|██████████| 7/7 [00:00<00:00, 3544.20 passes/s]
Running MIL default pipeline: 100%|██████████| 78/78 [00:00<00:00, 162.05 passes/s]
Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 1184.69 passes/s]


In [89]:
mlmodel.save(f"CNN-Intent-Classifier-0.0.2.mlpackage")
