In [2]:
import pandas as pd

# Get the data
* Find the most common 500 queries
* Find all the requests that use the common queries
* Find all the events clicked on from those requests
* Find the name and the description of those events

In [3]:
data = pd.read_csv('data/tagging_with_searches_1.tsv', names=['id','queries', 'title', 'body'], sep='\t')
data.head(20)

Unnamed: 0,id,queries,title,body
0,44792068,blockchain,Blockchain Smart Panels,"<h3 class=""MsoNormal""><br /></h3>\r\n<h3 class..."
1,45819106,"makeup classes,beauty",NARS Summer Mega Event,<P><SPAN>Join NARS Cosmetics at Nordstrom Cent...
2,45052684,"wine tasting events,wine tasting",Italian Wine Tasting with Uggiano Winery,"<P CLASS=""MsoNormal"">Nestled in the countrysid..."
3,44801981,soca,TURN UP FRIDAYS @ KINANM,<P>Turn up fridays @kinanm lounge... $100 2-4-...
4,45775216,blockchain,Raw Haus: Design x Blockchain: Identity Manage...,"<P>In the third of our<SPAN> series,</SPAN> we..."
5,45614048,"wine tasting events,wine tasting","APPELLATION - Wine Tasting, Meet the Makers",<P><SPAN>There is really nothing more divisive...
6,43253968,street fairs,Alma Street Fair 2018-Stall reservations,"<p style=""text-align: center;""><span style=""fo..."
7,45610786,volunteer events,TPS Cooking for Hope at Ronald McDonald House ...,<P><SPAN>Back by popular demand! </SPAN><SPAN>...
8,45344993,yoga,Sunday Yoga,<P>Join us for a complimentary yoga practice w...
9,45639611,yoga,Rooftop Yoga benefitting the Alzheimer's Assoc...,<P><SPAN>Join members of the Alzheimer's Assoc...


# Create training set

* Remove XML and then set asside a test set.

In [4]:
import re
import numpy as np

from sklearn.model_selection import train_test_split

XML_RE = re.compile(r'<[^>]+>|&\w+;')

titles = data['title']
bodies = data['body'].apply(lambda d: XML_RE.sub(' ', d))
tagses = data['queries']

titles_train, titles_test, bodies_train, bodies_test, tagses_train, tagses_test = train_test_split(
    titles,
    bodies,
    tagses,
    test_size=100,
    random_state=42,
)  

## Prepare the feature vectors, `X`
* Process `title` and `body` fields into sparse TF*IDF vectors.

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

title_processor = Pipeline([
    ('vect', CountVectorizer(stop_words='english', min_df=19)),
    ('tfidf', TfidfTransformer(use_idf=True)),
])
tfidf_titles_train = title_processor.fit_transform(titles_train)
tfidf_titles_test = title_processor.transform(titles_test)

body_processor = Pipeline([
    ('vect', CountVectorizer(stop_words='english', min_df=9)),
    ('tfidf', TfidfTransformer(use_idf=True)),
])
tfidf_bodies_train = body_processor.fit_transform(bodies_train)
tfidf_bodies_test = body_processor.transform(bodies_test)

* Convert them into dense vectors suitable for training Neural Networks.

In [6]:
dense_tfidf_titles_train = tfidf_titles_train.todense()
dense_tfidf_titles_test = tfidf_titles_test.todense()
dense_tfidf_bodies_train = tfidf_bodies_train.todense()
dense_tfidf_bodies_test = tfidf_bodies_test.todense()

* Concatenate `title` and `body` vectors into a single vector.

In [7]:
X_train = np.concatenate((dense_tfidf_titles_train, dense_tfidf_bodies_train), axis=1)
X_test = np.concatenate((dense_tfidf_titles_test, dense_tfidf_bodies_test), axis=1)
X_test_raw = np.core.defchararray.add(np.core.defchararray.add(list(titles_test), '\n'), bodies_test)

Let's take a look at the results.

In [8]:
print('X_train.shape:', X_train.shape, '\n')
print('num non-zero elements:', np.count_nonzero(X_train[0]), '\n')
row_i, col_i = np.nonzero(X_train[0])
print('non-zero elements:\n', X_train[0][row_i, col_i])

X_train.shape: (14422, 13877) 

num non-zero elements: 54 

non-zero elements:
 [[0.25926518 0.59145486 0.47701804 0.46712164 0.37042393 0.05991176
  0.10517516 0.12397836 0.10993582 0.11624109 0.09743092 0.1096012
  0.09387606 0.14281727 0.21403938 0.13223437 0.08779476 0.13270574
  0.09329427 0.16406408 0.07798627 0.09499815 0.13404607 0.03956563
  0.11790491 0.0859697  0.0881039  0.13549669 0.16867474 0.1439813
  0.086571   0.10815557 0.12839434 0.57204539 0.12462782 0.11430594
  0.12944188 0.17296817 0.17734905 0.06020619 0.06602291 0.09752642
  0.09932067 0.07767455 0.14521064 0.09534196 0.16971504 0.07612504
  0.10226218 0.10854766 0.1193775  0.07632421 0.15624812 0.06046931]]


## Prepare target vectors, `y`
* Process the queries into sparse binary vectors.
* Convert them to dense vectors.

In [9]:
tags_processor = CountVectorizer(analyzer=lambda q: q.split(','))
vect_tagses_train = tags_processor.fit_transform(tagses_train)
vect_tagses_test = tags_processor.transform(tagses_test)

y_train = vect_tagses_train.todense()
y_test = vect_tagses_test.todense()
y_test_raw = tagses_test

Let's take a look at the results.

In [10]:
print('y_train.shape:', y_train.shape, '\n')
print('num non-zero elements:', np.count_nonzero(y_train[2]), '\n')
row_i, col_i = np.nonzero(y_train[2])
print('non-zero elements:\n', y_train[2][row_i, col_i])

y_train.shape: (14422, 449) 

num non-zero elements: 2 

non-zero elements:
 [[1 1]]


# Build Model

In [11]:
from keras.layers import Input, Dense
from keras.models import load_model, Model

model_file = 'models/tagging_with_searches_1.mdl'
try:
    model = load_model(model_file)
except OSError:
    inputs = Input(
        shape=(X_train.shape[1],), 
        dtype='float', 
        name='inputs',
    )
    middle = Dense(y_train.shape[1], name='middle', activation='relu')(inputs)
    outputs = Dense(y_train.shape[1], name='outputs')(middle)
    
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='rmsprop', loss='mean_squared_error')
    
    model.fit(X_train, y_train, epochs=20)
    model.save(model_file)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# See how well it works!

In [12]:
def sample(i, limit=None, top_n=None):
    if limit:
        predicted = sorted(list(tags_processor.inverse_transform((model.predict(X_test[i])>limit)+0.0))[0])
    elif top_n: 
        pass
    else:
        raise Exception('Either limit or top_n must be specified')
    print('predicted: {}'.format(predicted))
    print('truth: {}'.format(list(y_test_raw)[i]))
    print('original text:\n{}\n'.format(list(X_test_raw)[i]))
    print('----------------------------------------------')
 
for i in range(100):
    sample(i, limit = 0.2)

predicted: ['beauty']
truth: beauty
original text:
Around The World Beauty "Beauty + Travel Meet-up"
   L et's  meet-up after hours to celebra

----------------------------------------------
predicted: ['art', 'business', 'meditation', 'mindfulness']
truth: leadership seminars
original text:
The Art of Mindful Leadership
  M indfulness has become a buzz word, and for good reason. T

----------------------------------------------
predicted: ['vendor', 'vendors needed']
truth: bridal show
original text:
Bakersfield&#39;s Premier Bridal Show 
 Join us for Bakersfield's Premier Bridal Show!  
  

----------------------------------------------
predicted: ['5k run']
truth: 5k run
original text:
Flamingo Day 5K -Phoenix
 FLAMINGO DAY 5K 
 *THIS IS A VIRTUAL RACE! 
 
     
 Did you 

----------------------------------------------
predicted: ['memorial day']
truth: memorial day events
original text:
Keepin It Grown & Sexy
  Houston's Ultimate Couples Day  Party!     This event will be at F


predicted: ['fashion', 'vegan']
truth: fashion
original text:
Bare Fashion
   Award-winning magazine  Vegan Food   Living   is teaming up with the fabulo

----------------------------------------------
predicted: ['business networking']
truth: networking
original text:
New Tampa Business Networking 
  The only thing better than connecting top professionals in

----------------------------------------------
predicted: ['venture capital']
truth: venture capital
original text:
New York 2018 Venture Capital World Summit
     
  These events are   the Venture Capital 

----------------------------------------------
predicted: []
truth: entrepreneur
original text:
Entrepreneurship: The Highs, The Lows (and The Pivots!)
 Thinking about starting your own b

----------------------------------------------
predicted: ['swinger party', 'swingers party']
truth: swinger party
original text:
Swinger 101 - Couples Only
 New to the lifestyle?  Just curious about being a swinger? 
 S

----------------