In [1]:
import pandas as pd
import numpy as np

# Get the data

## Exemplar query look up
* 681-x-2649 matrix
* rows are query clusters
* columns are broader set of common queries
* values are the "strength" of the query within the cluster

In [2]:
# Load the exemplar lookup matrix and use its 'node_a' column as the row index.
exemplar_query_csv = 'data/masked_exemplar_query_mat.csv'
masked_exemplar_query_mat = pd.read_csv(exemplar_query_csv).set_index('node_a')

## Query, click count, name, description
* This is our training set.
* Similar to first set, but `cnt_list` is a new field that tells how many clicks each query generated.
* Do you know why?

In [3]:
# Load the training events; column names are supplied explicitly via `names`.
data = pd.read_csv(
    'data/tagging_with_searches_2.csv',
    names=['id', 'queries', 'cnt_list', 'name', 'description'],
)
# Seeded preview so the displayed rows are identical on every Restart & Run All.
data.sample(20, random_state=42)

Unnamed: 0,id,queries,cnt_list,name,description
15281,46051274602,"bbq,memorial day",138,MEMORIAL DAY BBQ,<P>Join us at memorial day bbq with Chef Segun...
2276,43212769557,sneaker,27,SneakerManiaDC Summer 2018,<P>SneakerManiaDC is back all summer long!! Sn...
2008,41137402077,marketing,4,Online Marketing Masterclass,"<div>\r\n<p style=""font-family: arial, verdana..."
14833,44650707468,singles party,3,2018 Summer White Party,"<P CLASS=""MsoNormal""><SPAN><SPAN>Come dressed ..."
14824,46244930833,"islam,muslim",33,FREE IFTAR: JOURNEY OF THE SOUL,<P>Al Hikma Presents...JOURNEY OF THE SOUL: Fo...
7576,40100012216,architecture,4,"Drawing Perspective, Buildings and Architectur...","<p><span style=""font-family: arial, helvetica,..."
13021,46000519793,"house party,memorial day weekend events,memori...",33816,DAYLIGHT - DAY PARTY - MEMORIAL MONDAY -,<P>RL-ENTERTAINMENT PRESENTS</P>\n<P>DAYLIGHT ...
11824,30863348105,"anime convention,adult expo,anime,expo,anime expo",310513191,Anime Expo 2018 Registration,"<p style=""text-align: center;""><span style=""fo..."
7762,46419141903,model casting open call,4,AMERICAN EAGLE NATIONAL CASTING CALL FOR SPRIN...,"<P><SPAN><SPAN>From Thursday, May 24</SPAN><SP..."
16810,43801725139,kids,3,City Kids of San Francisco Black and White Gla...,"<P CLASS=""MsoNormal""><SPAN>Don’t miss the even..."


# Process label data

## create a matrix `raw_event_labels` 
* `num_events` long 
* `num_popular_queries` wide
* values are the number of times that event was clicked due to that query

In [4]:
# takes 10 s
# Build a (num_events x num_popular_queries) matrix of click counts.
# Columns follow the column order of masked_exemplar_query_mat.
popular_queries = masked_exemplar_query_mat.columns
shape = (data.shape[0], len(popular_queries))
raw_counts = np.zeros(shape)

# enumerate() yields the positional row number, so the fill is correct even if
# `data` does not carry a plain 0..n-1 index.
for pos, (_, row) in enumerate(data.iterrows()):
    queries = row['queries'].split(',')
    # str() also covers a cell that pandas parsed as a bare integer (single count).
    cnt_list = [int(cnt) for cnt in str(row['cnt_list']).split(',')]
    assert len(queries) == len(cnt_list), queries

    # Translate query names to column positions; -1 marks a query that is not a column.
    cols = popular_queries.get_indexer(queries)
    if (cols < 0).any():
        unknown = [q for q, c in zip(queries, cols) if c < 0]
        raise KeyError(
            'queries missing from masked_exemplar_query_mat columns: {}'.format(unknown)
        )

    # Write straight into the array. The previous chained form
    # `raw_event_labels.iloc[i][queries] = cnt_list` assigns through a temporary
    # row object and is silently dropped when pandas returns a copy (Copy-on-Write).
    raw_counts[pos, cols] = cnt_list

raw_event_labels = pd.DataFrame(raw_counts, columns=popular_queries)

Here's a subset: events with more than two queries, at least one of which is a cryptocurrency-related query.

In [5]:
queries = ['bitcoin', 'blockchain', 'crypto', 'cryptocurrency']


def _is_crypto_example(qs):
    """Return True when at least one query is in `queries` and there are more than two queries."""
    return any(q in queries for q in qs) and len(qs) > 2


# np.where on the boolean Series returns a 1-tuple holding the matching row positions.
split_queries = data['queries'].str.split(',')
ml_rows = np.where(split_queries.apply(_is_crypto_example))
raw_event_labels.iloc[ml_rows][queries]

Unnamed: 0,bitcoin,blockchain,crypto,cryptocurrency
672,0.0,7.0,0.0,0.0
1933,9.0,9.0,0.0,3.0
2111,0.0,5.0,0.0,0.0
3222,0.0,3.0,0.0,0.0
3694,0.0,13.0,3.0,0.0
4058,0.0,5.0,0.0,0.0
4080,0.0,3.0,0.0,0.0
5377,3.0,5.0,3.0,0.0
5567,0.0,5.0,0.0,0.0
6680,9.0,0.0,0.0,9.0


## collapse popular queries down to exemplar vocabulary

How:
* For each row, get the counts associated with each query.
* Look up each query in the `masked_exemplar_query_mat` and find its "exemplar query" and "strength".
* Multiply the click counts and strengths together and sum up everything that belongs to the same exemplar.

Sounds like a lot of nested `for` loops. But ***by the power of math*** it's a one liner matrix multiplication.

In [6]:
# Step 1: (exemplars x popular queries) @ (popular queries x events)
#         -> for every exemplar, the strength-weighted sum of clicks per event.
exemplar_clicks = masked_exemplar_query_mat @ raw_event_labels.T
# Step 2: transpose back to one row per event and apply log(x + 1),
#         which leaves zero entries at exactly zero.
event_labels = np.log(exemplar_clicks.T + 1)

Check whether the label data makes sense.

In [7]:
# Spot-check the labels: print the non-zero exemplar labels for the first few
# multi-query events whose own queries are not themselves exemplar names.
num_queries = 2  # an event needs at least this many queries to be shown
n = 0            # number of events printed so far
raw_queries = list(masked_exemplar_query_mat.index)
exemplar_lookup = set(raw_queries)  # same membership answers as the list
for i in range(data.shape[0]):
    if n >= 7:
        break
    queries = data['queries'][i].split(',')
    skip = (
        len(queries) < num_queries
        or any(q in exemplar_lookup for q in queries)
        or i == 12  # NOTE(review): row 12 is excluded by hand; the reason is not recorded here
    )
    if skip:
        continue

    n += 1
    row_labels = event_labels.iloc[i]
    print(i)
    print('original queries: {}'.format(queries))
    print()
    print('labels:\n{}'.format(row_labels[row_labels > 0]))
    print('-------------')

33
original queries: ['yoga retreat', 'free yoga', 'yoga events']

labels:
node_a
yoga    2.510882
Name: 33, dtype: float64
-------------
39
original queries: ['vendors', 'vendor opportunities']

labels:
node_a
vendor    2.079442
Name: 39, dtype: float64
-------------
267
original queries: ['third space', 'third space fitstival']

labels:
node_a
lululemon    2.647592
Name: 267, dtype: float64
-------------
289
original queries: ['ozios', 'ozio']

labels:
node_a
rosebar    4.533943
Name: 289, dtype: float64
-------------
374
original queries: ['business networking', 'oil and gas']

labels:
node_a
energy        2.564949
networking    2.045074
Name: 374, dtype: float64
-------------
394
original queries: ['wedding expo', 'bloomingdales']

labels:
node_a
bridal show    2.696877
Name: 394, dtype: float64
-------------
504
original queries: ['blossom', 'haiti']

labels:
node_a
haitian    2.733221
shine      2.302585
Name: 504, dtype: float64
-------------


# Build training set
* Split into training and test sets.
* Features (X) work just like before.
* Labels (y) are different: instead of a binary vector, each label is now a vector of floating-point numbers.

In [8]:
# takes 5 seconds
import re
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Matches markup tags (<...>) and character entities (&name;) so they can be blanked out.
XML_RE = re.compile(r'<[^>]+>|&\w+;')

titles = data['name']
# str(d) lets non-string descriptions (e.g. missing values) pass through the regex.
bodies = data['description'].apply(lambda d: XML_RE.sub(' ', str(d)))
vect_tagses = event_labels

# Split titles, bodies and labels in one call so the three stay row-aligned.
titles_train, titles_test, bodies_train, bodies_test, vect_tagses_train, vect_tagses_test = train_test_split(
    titles,
    bodies,
    vect_tagses,
    test_size=1000,
    random_state=42,
)

# Bag-of-words + TF-IDF over titles; fitted on the training split only.
title_processor = Pipeline([
    ('vect', CountVectorizer(stop_words='english', min_df=19)),
    ('tfidf', TfidfTransformer(use_idf=True)),
])
tfidf_titles_train = title_processor.fit_transform(titles_train)
tfidf_titles_test = title_processor.transform(titles_test)

# Same pipeline shape for bodies, with a lower document-frequency cut-off.
body_processor = Pipeline([
    ('vect', CountVectorizer(stop_words='english', min_df=9)),
    ('tfidf', TfidfTransformer(use_idf=True)),
])
tfidf_bodies_train = body_processor.fit_transform(bodies_train)
tfidf_bodies_test = body_processor.transform(bodies_test)

# .todense() returns np.matrix objects, so row slices such as X_test[i] stay 2-D;
# the `sample` helper further down indexes X_test that way.
dense_tfidf_titles_train = tfidf_titles_train.todense()
dense_tfidf_titles_test = tfidf_titles_test.todense()
dense_tfidf_bodies_train = tfidf_bodies_train.todense()
dense_tfidf_bodies_test = tfidf_bodies_test.todense()
dense_vect_tagses_train = vect_tagses_train
dense_vect_tagses_test = vect_tagses_test

# Feature layout: title TF-IDF columns first, then body TF-IDF columns.
X_train = np.concatenate((dense_tfidf_titles_train, dense_tfidf_bodies_train), axis=1)
X_test = np.concatenate((dense_tfidf_titles_test, dense_tfidf_bodies_test), axis=1)
# np.char.add is the public name for the element-wise string concatenation that
# used to be reached through the private np.core.defchararray module.
X_test_raw = np.char.add(np.char.add(list(titles_test), '\n'), bodies_test)

y_train = dense_vect_tagses_train
y_test = dense_vect_tagses_test

# Build model

In [9]:
# https://keras.io/getting-started/faq/#what-does-sample-batch-epoch-mean
from keras.layers import Input, Dense
import keras.models
from keras.models import Model

model_file = 'data/tagging_with_searches_2/model'
try:
    # Reuse a previously saved model when one can be loaded.
    model = keras.models.load_model(model_file)
except Exception as load_error:
    # Catch Exception rather than using a bare except, so Ctrl-C / SystemExit
    # are not mistaken for "no saved model" and do not kick off a training run.
    print('Could not load {} ({!r}); training a new model.'.format(model_file, load_error))

    inputs = Input(
        shape=(X_train.shape[1],),
        dtype='float',
        name='inputs',
    )
    # One hidden ReLU layer and a linear output layer, each as wide as the label vector.
    middle = Dense(y_train.shape[1], activation='relu')(inputs)
    outputs = Dense(y_train.shape[1], name='outputs')(middle)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='rmsprop', loss='mean_squared_error')

    try:
        model.fit(
            X_train,
            y_train,
            epochs=250,
            batch_size=1000,
        )
    except KeyboardInterrupt:
        # Presumably the original catch-all existed so training could be stopped
        # early by hand; keep that path, but let genuine training errors propagate
        # instead of silently continuing with an unfitted model.
        print('Training interrupted; keeping the weights learned so far.')

    # Only reached after a completed or manually interrupted fit.
    save_response = input("Shall we save this?")
    if save_response in ('Y','y'):
        model.save(model_file)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Examine Results

In [10]:
def sample(i, limit):
    """Summarise one test event: its text, its true labels and the predicted tags.

    Args:
        i: positional index into the test split (titles_test, bodies_test,
            X_test and y_test are all indexed with it).
        limit: threshold; a tag is reported when the model output for it is
            strictly greater than this value.

    Returns:
        dict with the keys 'title', 'queries', 'predicted_tags' and 'body', in
        that order. 'queries' holds the event_labels column names whose target
        value for this event is non-zero.
    """
    title = titles_test.iloc[i]
    body = bodies_test.iloc[i].replace('\n',' ') + "..."

    # True labels: columns with a non-zero target for this event.
    tag_vect = y_test.iloc[i]
    queries = list(event_labels.columns[np.where(tag_vect)[0]])

    # predict() expects a 2-D batch; reshaping makes the single row explicit so
    # this works whether X_test is an np.matrix or a plain ndarray.
    features = np.asarray(X_test[i]).reshape(1, -1)
    predicted_tag_vect = model.predict(features)>limit
    predicted_tags = list(event_labels.columns[predicted_tag_vect[0]])
    #TODO print out values associated w/ predictions

    output = dict(
        title=title,
        queries=queries,
        predicted_tags=predicted_tags,
        body=body,
    )

    return output

# Tags whose predicted value exceeds this threshold are reported.
limit = 0.6
separator = '---------------------------------------------------------------------\n'

# Show the first 100 test events, one "key=value" line per field followed by a blank line.
for i in range(100):
    output = sample(i, limit)

    for field, value in output.items():
        print('{}={}'.format(field, value))
        print()
    print(separator)

title=2018 Foster Love 5K & 10K -Boston

queries=['5k run']

predicted_tags=['5k run']

body= FOSTER LOVE 5K   10K   *THIS IS A VIRTUAL RACE!             May is National Foster Care Month, and we want to help spread awareness with our Foster Love 5K   10K because Every Child Deserves A Family!  We are grateful for all of those families that open their homes to foster children and hope to help make a difference in making sure that all children have a place to call home!      We will also be donating at least 15% of each registration to  Foster Care to Success , which is the largest provider of college funding and support services for foster youth in the nation! We d love to raise enough to pay for a whole year of college for a foster youth!    *For a limited time get your official t-shirt and/or hoodie  HERE!      What:  This is a 5K  and 10K virtual run (or walk) so you choose your own course and time yourself.       When:  Complete your race any time in May 2018!      *Medals

title=The Gospel of St Michael's

queries=['gospel']

predicted_tags=['gospel']

body= Following on from their very sucessful Christmas 2017 concert, come and join the Gospel on The Rise community choir for their second event.    This evening promises to be exciting and inspirational and will feature guest artiste Sharlene-Monique. You wont want to miss it!       When: Saturday 26th May 2018   Time: Concert starts at 7PM (Doors open at 6.30 PM)   Price: £10.00  (+£1.85 Eventbrite booking Fee and 20% tax)         SEATING FOR THIS EVENT IS LIMITED SO WE ADVISE PURCHASING YOUR TICKET AS SOON AS POSSIBLE TO AVOID DISAPPOINTMENT.               FAQs         Do I have to bring my printed ticket to the event?    You can bring your printed ticket or show your ticket on a mobile device        Is my registration fee or ticket transferrable?    Yes you can transfer your ticket     ...

---------------------------------------------------------------------

title=Tipsy Trap Yoga: Cinco de Mayo Editi

title=Mermaid Swim & Cocktail Party

queries=['sex party']

predicted_tags=['memorial day weekend', 'memorial day weekend events', 'pool party']

body=      Adult Swim   Cocktail Party for Mermaid Weekend   Hosted by The Mermaid Atlantis  www.MermaidAtlantis.com     Friday May 18th    7-9 PM   Hyatt Regency Pool   1209 L St, Sacramento, CA 95814   18+ to attend, 21+ to purchase drinks     Tickets $20     Watch poolside entertainment, swim with mermaids, take a Shell-fie with a mermaid, and order food   cocktails from the Hyatt bar.     Food and cocktails available for purchace from the adjoing Hyatt restaurant and bar.   Special guests include: The Mermaid Atlantis  Merman Jax       Dark Tide Productions     FB event page        https://www.facebook.com/events/166320654075644/?active_tab=about     Pool info   https://    sacramento.regency.hyatt.co    m/en/hotel/our-hotel/   pool.html   Parking info   https://    sacramento.regency.hyatt.co    m/en/hotel/our-hotel/   map-and-directions

title=Dinner & a Comedy Show

queries=['stand up comedy']

predicted_tags=['comedy']

body= What's better than dinner   a movie? Dinner and a Comedy Show of course!    We are selling tickets to enjoy a delicious meal at our restaurant and attend the HideOut Comedy show at our underground bar. The  ticket costs $20  and includes a  $25 gift certificate  to our restaurant and  1   pass  to our 8pm HideOut Comedy show. Don't miss this awesome opportunity to dine in one of the oldest restaurants in Boston and experience the best basement comedy show in Faneuil Hall!   In the HideOut (bar underneath Durgin-Park Restaurant) every Friday and Sunday is HideOut Comedy, a stand-up comedy show featuring several local comedians. Each week is a new show with different comedians you'll be sure to enjoy. For more information on the comedy show follow them on Facebook at HideOut Comedy, Twitter @HideOutComedy or Instagram @HideOutComedy.   For more information on the event, please email: emotta@arkres

title=Legends Tribute Party

queries=['mothers day']

predicted_tags=['day party', 'party', 'sex party']

body= Let's celebrate the lives of some of the greatest artists in our time. Join us for a Legends Tribute, Mother's Day Weekend. All of your favorite songs from Michael Jackson, Prince, 2Pac, Biggie, Whitney Houston and the hottest 80's   90's music all night! In addition there will be a special performance by Verse Rare Laflare in celebration of his birthday! Entry is free before midnight! Early arrival is a must. Don't miss your chance to win tickets to see Charlie Wilson in Concert! Hosted by Beau Jones and Music by Tek 9 Movements.    Cigars   Hookah provided by Majestic Smokers of the QC ...

---------------------------------------------------------------------

title=The First Annual Fresno UNTZ Festival pre-party!

queries=['edm', 'rave party']

predicted_tags=['rave party']

body=  RaveForThought X The UNTZ present      The first annual Fresno UNTZ festival pre-party!     

title=Springtime Cherry Noir - Friday, June 15th, 2018 - BDSM and Swinger Fun

queries=['nude', 'sex']

predicted_tags=['day party', 'nude', 'sex']

body=   Mistress Zeneca Presents:  Cherry Noir, a sex-positive BDSM Club and Swingers Play Party       Friday, June 15th, 2018 9pm-3am         Class at 11pm - ANAL SEX: HOW TO MAKE IT FEEL GOOD            Our teacher for both days of Cherry Noir, SentientCanvas, will adjust the lesson plan based on the make up of the audience that shows up- like if everyone there has little to no anal experience then we are going to go over a lot of basic stuff. Our teacher is going to talk about anatomy, lubrication, toys, analingus, fingering, penetration. If there’s something specific you want to discuss, bring it up. Anything you ever wanted to know about the BUTT.                   Cherry Noir 'Classic' is now day 2 of our new 2-Night Cherry Noir events. All the kinky and sexy swinger adventures you can handle. We have our non-gendered Thursdays and C