In [1]:
import requests
import json

In [2]:
analysisSettings={}
mappingSettings={}

### Extract movies from tmbd.json

In [4]:
def extract():
    f = open('tmdb.json')
    if f:
        return json.loads(f.read());

### Indexing with Bulk API

In [8]:
def reindex(analysisSettings={}, mappingSettings={}, movieDict={}):
    settings = {
        "number_of_ shards": 1,
        "index": {
            "analysis": analysisSettings,
        }
    }
    if mappingSettings:
        settings['mappings'] = mappingSettings
        
    resp = requests.delete('http://192.168.32.10:9200/tmdb')
    resp = requests.put('http://192.168.32.10:9200/tmdb', data=json.dumps(settings))
    bulkMovies = ""
    for id, movie in movieDict.iteritems():
        addCmd = {"index": {"_index": "tmdb",
                           "_type": "movie",
                           "_id": movie["id"]}}
        bulkMovies += json.dumps(addCmd) + "\n" + json.dumps(movie) + "\n"
    resp = requests.post("http://192.168.32.10:9200/_bulk", data=bulkMovies)

### Pulling data into Elasticsearch

In [9]:
movieDict = extract()
reindex(movieDict=movieDict)

### Search Function

In [11]:
def search(query):
    url = 'http://192.168.32.10:9200/tmdb/movie/_search'
    httpResp = requests.get(url, data=json.dumps(query))
    searchHits = json.loads(httpResp.text)['hits']
    print "Num\tRelevance Score \t\tMovie Title"
    for idx, hit in enumerate(searchHits['hits']):
        print "%s\t%s\t\t%s" % (idx + 1, hit['_score'], hit['_source']['title'])

### Search query

In [16]:
usersSearch = 'william shatner patrick stewart'
query = {
    "query": {
        "multi_match": {
            "query": usersSearch,
            "fields": ["overview", "title", "directors.name", "cast.name"],
            "type": "cross_fields"
        }
    }
}
search(query)

In [17]:
search(query)

Num	Relevance Score 		Movie Title
1	0.92778635		Star Trek V: The Final Frontier
2	0.7915544		Star Trek: Generations
3	0.4553907		Dark Skies
4	0.42914188		Showtime
5	0.3941992		Osmosis Jones
6	0.3442495		Bill & Ted's Bogus Journey
7	0.34294215		The Wild
8	0.33406225		Star Trek: Nemesis
9	0.3218564		Miss Congeniality 2: Armed and Fabulous
10	0.29619905		Miss Congeniality


### 7.2 Boosting with additional Boolean clause

In [19]:
usersSearch = 'william shatner patrick stewart'
query = {
    "query": {
        "bool": {
            "should": [
                {"multi_match": {
                        "query": usersSearch,
                        "fields": ["overview", "title", "directors.name", "cast.name"],
                        "type": "cross_fields"
                    }},
                    {
                    "match_phrase": {
                        "title": {
                            "query": "star trek",
                            "boost": 0.1
                        }}}
                    ]
        }
    },
}
search(query)

Num	Relevance Score 		Movie Title
1	1.3519554		Star Trek V: The Final Frontier
2	1.2357156		Star Trek: Generations
3	0.9266564		Star Trek: Nemesis
4	0.6770382		Star Trek: First Contact
5	0.6341759		Star Trek
6	0.6201064		Star Trek: Insurrection
7	0.618581		Star Trek: The Motion Picture
8	0.6083424		Star Trek II: The Wrath of Khan
9	0.5362432		Star Trek IV: The Voyage Home
10	0.5362432		Star Trek III: The Search for Spock


### 7.3 Applying a mutliplier for Star Trek Movies

In [24]:
usersSearch = 'william shatner patrick stewart'
query = {
    "query": {
        "function_score": {
            "query": {
                "multi_match": {
                        "query": usersSearch,
                        "fields": ["overview", "title", "directors.name", "cast.name"],
                        "type": "cross_fields"
                    }
                },
                "functions": [
                    {
                        "weight": 2.5,
                        "filter": {
                            "query": {
                                "match_phrase": {
                                    "title": "star trek"
                                }
                            }
                        }
                    }
                ]
            }
        }
    }



search(query)

Num	Relevance Score 		Movie Title
1	2.3194659		Star Trek V: The Final Frontier
2	1.978886		Star Trek: Generations
3	0.8351556		Star Trek: Nemesis
4	0.68567824		Star Trek II: The Wrath of Khan
5	0.5713986		Star Trek: The Motion Picture
6	0.5599847		Star Trek IV: The Voyage Home
7	0.5599847		Star Trek III: The Search for Spock
8	0.50876653		Star Trek: Insurrection
9	0.4571188		Star Trek VI: The Undiscovered Country
10	0.4553907		Dark Skies


### 7.4 Filtering instead of boosting Star Trek results

In [25]:
usersSearch = 'william shatner patrick stewart'
query = {
    "query": {
        "bool": {
            "should": [{
                "multi_match": {
                        "query": usersSearch,
                        "fields": ["overview", "title", "directors.name", "cast.name"],
                        "type": "cross_fields"
                    }
                }],
            "filter": [{
                    "query": {
                        "match_phrase": {
                            "title": "star_trek"
                        }
                    }
                }]
            }
        }
    }



search(query)

Num	Relevance Score 		Movie Title


### 7.5 Change Reindexation

In [41]:
def reindex(analysisSettings={}, mappingSettings={}, movieDict={}):
    settings = {
        "number_of_ shards": 1,
        "index": {
            "analysis": analysisSettings,
        }
    }
    if mappingSettings:
        settings['mappings'] = mappingSettings
        
    resp = requests.delete('http://192.168.32.10:9200/tmdb')
    resp = requests.put('http://192.168.32.10:9200/tmdb', data=json.dumps(settings))
    bulkMovies = ""
    for id, movie in movieDict.iteritems():
        esDoc = movie
        transform(esDoc)
        addCmd = {"index": {"_index": "tmdb",
                           "_type": "movie",
                           "_id": movie["id"]}}
        bulkMovies += json.dumps(addCmd) + "\n" + json.dumps(movie) + "\n"
    resp = requests.post("http://192.168.32.10:9200/_bulk", data=bulkMovies)

In [40]:
SENTINEL_BEGIN = 'SENTINEL_BEGIN'
SENTINEL_END = 'SENTINEL_END'
def transform(esDoc):
    esDoc['title_exact_match'] = SENTINEL_BEGIN + ' ' + \
                                    esDoc['title'] + ' ' + SENTINEL_END
    esDoc['names_exact_match'] = []
    for person in esDoc['cast'] + esDoc['directors']:
        esDoc['names_exact_match'].append(SENTINEL_BEGIN + ' ' + person['name'] + ' ' + SENTINEL_END)

In [42]:
reindex(analysisSettings, mappingSettings, movieDict)

### 7.6 Isolated testing of your exact-match signal

In [36]:
usersSearch = "star trek"
query = {
    "query": {
        "match_phrase": {
            "title_exact_match": {
                "query": SENTINEL_BEGIN + ' ' + \
                            usersSearch + ' ' + SENTINEL_END,
                "boost": 0.1
            }
        }
    }
}
search(query)

Num	Relevance Score 		Movie Title
1	7.101176		Star Trek


### 7.8 Boolean Boost on exact-title matching

In [51]:
usersSearch = "Good will hunting"
query = {
    "query": {
        "bool": {
            "disable_coord": True,
            "should": [
               { "match_phrase": {
                    "title_exact_match": {
                        "query": SENTINEL_BEGIN + ' ' + \
                        usersSearch + ' ' + SENTINEL_END,
                        "boost": 1000
                    }
                }},
                {"multi_match": {
                        "query": usersSearch,
                        "fields": ["overview", "title", "directors.name", "cast.name"],
                        "type": "cross_fields"
                    }                  
                },
            ]
        }
    }
}
search(query)

Num	Relevance Score 		Movie Title
1	9.215305		Good Will Hunting
2	0.0003364306		As Good as It Gets
3	0.00025581344		Good Night, and Good Luck.
4	0.00020672847		A Good Year
5	0.00020672847		The Good Shepherd
6	0.00020672847		The Good Lie
7	0.00017033581		Good Morning, Vietnam
8	0.00017033581		All Good Things
9	0.00017033581		A Few Good Men
10	0.00016794368		Good People


### 7.9 Adding a clause for bigrammed matches (base query not shown)

In [53]:
usersSearch = "star trek"
query = {
    "query": {
        "bool": {
            "should": [
               { "multi_match": {
                        "query": usersSearch,
                        "fields": ["overview", "title", "directors.name", "cast.name"],
                        "type": "cross_fields"
                }},
                {"multi_match": {
                        "query": usersSearch,
                        "fields": ["directors.name.bigrammed", "cast.name.bigrammed"],
                        "type": "cross_fields",
                        "boost": 100
                    }                  
                },
            ]
        }
    }
}
search(query)

Num	Relevance Score 		Movie Title
1	0.013334134		Star Trek
2	0.011406989		Star Trek: Nemesis
3	0.010667307		Star Trek: First Contact
4	0.009775436		Star Trek: Insurrection
5	0.009775436		Star Trek Into Darkness
6	0.008882249		Star Trek: Generations
7	0.008355431		Star Trek V: The Final Frontier
8	0.007771968		Star Trek: The Motion Picture
9	0.0073315767		Star Trek IV: The Voyage Home
10	0.0073315767		Star Trek III: The Search for Spock
