In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_colwidth',1000)

# Get the data
`node_a` and `node_b` are query strings and `edge_weight` is the number of times that they occurred in the same session

#### _Note: method here adapted from work by colleague Ryan Carr (thanks!)_

In [2]:
synonym_graph = pd.read_csv('data/synonyms_by_session.csv', names=['node_a', 'node_b', 'edge_weight'])
synonym_graph.head(20)

Unnamed: 0,node_a,node_b,edge_weight
0,memorial day weekend,memorial day weekend events,4976
1,memorial day weekend events,memorial day weekend,4976
2,memorial day events,memorial day weekend events,3164
3,memorial day weekend events,memorial day events,3164
4,memorial day events,memorial day weekend,1969
5,memorial day weekend,memorial day events,1969
6,job fair,job fairs,1331
7,job fairs,job fair,1331
8,car show,car shows,1287
9,car shows,car show,1287


# Normalize the edge weight
Goal: create a notion of edge_weight that discounts trivial "popular query" relationships.

`norm_edge_weight = edge_weight / node_b_count`

In [3]:
num_times_node_b_is_with_another_query = synonym_graph.groupby('node_a').agg({'node_b':'count'})
synonym_graph_norm = synonym_graph.set_index('node_a')\
    .join(num_times_node_b_is_with_another_query, rsuffix='_count')\
    .reset_index()
synonym_graph_norm['norm_edge_weight'] = synonym_graph_norm.edge_weight / (synonym_graph_norm.node_b_count)
synonym_graph_norm.sort_values('norm_edge_weight', ascending=False).head(10)

Unnamed: 0,node_a,node_b,edge_weight,node_b_count,norm_edge_weight
8431,memorial day events,memorial day weekend events,3164,10,316.4
4596,earth day festival,earth day events,412,2,206.0
8821,mother,mothers day,617,3,205.666667
8432,memorial day events,memorial day weekend,1969,10,196.9
9045,mothers day brunch,mothers day,534,3,178.0
5229,father,fathers day,174,1,174.0
14369,yog,yoga,159,1,159.0
2474,carnival dates,carnival,158,1,158.0
3793,curl fest,curlfest,149,1,149.0
2977,cinco de mayo festival,cinco de mayo events,742,5,148.4


# Create adjacency matrix of query text
Rows and columns corresponding to every term. Values corresponding to the normalized edge weight.

In [4]:
query_mat = pd.crosstab(
    synonym_graph_norm['node_a'],
    synonym_graph_norm['node_b'],
    synonym_graph_norm['norm_edge_weight'],
    aggfunc='sum',
).fillna(0)
query_mat.shape

(2649, 2649)

Let's look at a subset of the matrix.

In [5]:
queries = ['memorial day weekend events', 'memorial day weekend', 'memorial day events', 'mother', 'mothers day', 'mothers day brunch']
query_mat.loc[queries][queries]

node_b,memorial day weekend events,memorial day weekend,memorial day events,mother,mothers day,mothers day brunch
node_a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
memorial day weekend events,0.0,105.87234,67.319149,0.0,1.234043,0.0
memorial day weekend,73.176471,0.0,28.955882,0.0,0.794118,0.0
memorial day events,316.4,196.9,0.0,0.0,2.1,0.0
mother,0.0,0.0,0.0,0.0,205.666667,20.666667
mothers day,0.271028,0.252336,0.098131,2.883178,0.0,2.495327
mothers day brunch,0.0,0.0,0.0,20.666667,178.0,0.0


# Identify query clusters using "Affinity Propagation"

In [6]:
from sklearn.cluster import AffinityPropagation
from sklearn.externals import joblib

query_affinity_file = 'data/query_string_clusters/query_affinity.mdl'
try:
    aff = joblib.load(query_affinity_file)
    labels = aff.labels_
except Exception as e: 
    aff = AffinityPropagation(
        damping=.8,  
        max_iter=200, 
        convergence_iter=20, 
        affinity='precomputed',
    )
    labels = aff.fit_predict(query_mat)
    joblib.dump(aff, query_affinity_file)
    
print('labels:', labels)

labels: [138   3   4 ... 679 680  41]


every query string gets a cluster number that is stored in `labels`

let's collect all the queries together according to their label

In [7]:
query_families = pd.DataFrame(
    list((zip(labels, query_mat.index))),
    columns=['cluster_number', 'queries'],
).groupby(
    'cluster_number'
).agg({'queries':lambda x: list(x)})
query_families.sample(10)

Unnamed: 0_level_0,queries
cluster_number,Unnamed: 1_level_1
172,"[continuing education for social workers, social work ceu mn]"
365,"[jazz concert, jazz festival, jazz music]"
459,"[fashion week casting call, league of legends, lol, msi]"
667,"[empowerment, female, feminism, feminist, gender, girls, ladies, woman, wome, women, women business, women conference, women empowerment, women in business, womens, womens conference, womens conferences, womens empowerment, womens events, womens health, womens retreat]"
497,"[pizza zoo, pizzazoo]"
581,"[spill fest, spillfest]"
511,"[pride festival, pride weekend]"
603,"[ios, swift]"
593,[sts]
61,"[baauer, bauuer]"


`aff.cluster_centers_indices_` indicates which query is the "center" or "exemplar" of each cluster.

Pull the exemplar cluster into a new column.

Notice:
* The queries in the cluster make sense.
* The exemplars are the "best" of the cluster.

In [12]:
# add in colums for the number of queries
query_families['num_queries'] = query_families['queries'].apply(lambda x: len(x))
query_families['exemplar'] = query_mat.index[aff.cluster_centers_indices_]

query_families.sample(10)

Unnamed: 0_level_0,queries,num_queries,exemplar
cluster_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
68,"[beaut, beauty, nails, selfridges, skin, skincare, spa]",7,beauty
53,"[babies, baby, baby expo, infant, mom]",5,baby
402,"[lib, lightning, lightning in a bottle]",3,lightning in a bottle
256,"[essence festival all white party, essence music festival all white party, metropolitan all white party]",3,essence festival all white party
394,"[human rights, law, lawyer, legal]",4,law
658,"[cannibus, i71, puff, smoke, wee, weed]",6,weed
230,[drag world],1,drag world
51,"[autograph, meet greet]",2,autograph
316,"[balayage, braids, hair, wig]",4,hair
208,"[decades, decades dc, decades sunday]",3,decades


# Reshaping the data to build a better tagging model
We see that the affinity analysis appears to be working, but we need to reshape the data so that we can use it.

Requirement: given a *raw query string* we need to know 2 things
1. What is the exemplar query string for this query?
2. How "strong" is this query in relation to it's exemplar?

In [9]:
# get portion of query_mat that corresponds to the exemplars
exemplar_query_mat = query_mat.iloc[aff.cluster_centers_indices_]

# get artifical max score for each query (TODO improve)
query_score = synonym_graph_norm.groupby('node_a').agg({'norm_edge_weight': 'max'}) * 1.1

# create version of exemplar_query_mat that zeros out all the values that don't correspond to clustered queries
masked_exemplar_query_mat = exemplar_query_mat.copy()*0

for i, row in query_families.iterrows():
    masked_exemplar_query_mat.loc[row['exemplar']][row['queries']] = 1
 
masked_exemplar_query_mat = masked_exemplar_query_mat * exemplar_query_mat

for i, row in query_families.iterrows():
    query_text = row['exemplar']
    masked_exemplar_query_mat.loc[query_text][query_text] = query_score.loc[query_text]

In [10]:
queries = ['machine learning', 'artificial intelligence', 'bitcoin', 'block', 'block chain', 'blockchai', 'deep learning', 'python', 'blockchain', 'blockchain week', 'tensorflow', 'consensus', 'crypto','cryptocurrency', 'ethereum']

masked_exemplar_query_mat.loc[queries][queries]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  This is separate from the ipykernel package so we can avoid doing imports until


node_b,machine learning,artificial intelligence,bitcoin,block,block chain,blockchai,deep learning,python,blockchain,blockchain week,tensorflow,consensus,crypto,cryptocurrency,ethereum
node_a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
machine learning,11.733333,10.666667,0.0,0.0,0.0,0.0,6.8,6.4,0.0,0.0,1.8,0.0,0.0,0.0,0.0
artificial intelligence,,,,,,,,,,,,,,,
bitcoin,,,,,,,,,,,,,,,
block,,,,,,,,,,,,,,,
block chain,,,,,,,,,,,,,,,
blockchai,,,,,,,,,,,,,,,
deep learning,,,,,,,,,,,,,,,
python,,,,,,,,,,,,,,,
blockchain,0.0,0.0,6.0,0.361905,0.6,0.466667,0.0,0.0,9.774286,0.295238,0.0,0.438095,8.885714,4.47619,1.171429
blockchain week,,,,,,,,,,,,,,,


save the matrices for later

In [11]:
import os.path

masked_exemplar_query_mat_file = 'data/masked_exemplar_query_mat.csv'
exemplar_query_mat_file = 'data/exemplar_query_mat.csv'

if not os.path.isfile(masked_exemplar_query_mat_file):
    masked_exemplar_query_mat.to_csv(masked_exemplar_query_mat_file)
if not os.path.isfile(exemplar_query_mat_file):
    exemplar_query_mat.to_csv(exemplar_query_mat_file)