In [16]:
# Before running the code below, make sure that Elasticsearch is running on your machine.
# There are also two helper shell scripts:
# 1. "return_all.sh", which returns all the data for the index
# 2. "del_indices.sh", which removes all the data

# We will run "del_indices.sh" manually to clear out all the indexes.

In [17]:
import requests
import json
import pandas as pd

In [18]:
# Load the previously scraped thriller-movie list; the CSV path is relative to the working directory.
df = pd.read_csv("imdb_thriller_movies_list.csv", encoding="utf-8")

# clean the year column
def parse_year(v):
    """Return ``v`` with every "(" and ")" character removed.

    The result is still a string; converting it to a number is left to the caller.
    """
    return v.replace("(", "").replace(")", "")

# Strip the parentheses from every year value, then store the column as integers.
df['movie_year'] = df['movie_year'].apply(parse_year)
df['movie_year'] = df['movie_year'].astype(int)

In [19]:
# Show the column labels of the loaded frame.
df.columns

Index([u'image_url', u'movie_cast', u'movie_description', u'movie_director',
       u'movie_hero', u'movie_rating', u'movie_title', u'movie_votes',
       u'movie_year', u'rank'],
      dtype='object')

In [20]:
# To keep things simple, work with a frame that holds only the columns
# that get inserted into Elasticsearch below.
df_subset = df[[
    'movie_title',
    'movie_year',
    'movie_description',
    'movie_hero',
    'movie_director',
    'movie_votes',
]]

In [21]:
# Lets do some insertion
# Function which insert docs into elasticsearch
def create_doc(uri, doc_data=None):
    """Create a new document by POSTing ``doc_data`` as JSON to ``uri``.

    Parameters
    ----------
    uri : str
        URL the document is posted to.
    doc_data : dict, optional
        JSON-serialisable document body. ``None`` (the default) sends an
        empty JSON object.

    Returns
    -------
    requests.Response
        The response to the POST request, so callers can inspect it if they
        want to; existing callers that ignore the return value are unaffected.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    if doc_data is None:
        # Build a fresh dict per call; a ``{}`` default would be one object
        # shared by every call.
        doc_data = {}
    query = json.dumps(doc_data)
    # Declare the body as JSON explicitly instead of relying on the server
    # to guess the content type.
    response = requests.post(
        uri,
        data=query,
        headers={"Content-Type": "application/json"},
    )
    # Surface rejected inserts instead of silently dropping them.
    response.raise_for_status()
    return response

In [22]:
# uri_create points to my localhost where movies is the index and explore is the type
uri_create = 'http://localhost:9200/movies/explore/'
# iterrows() already yields each row as a Series, so the row is used directly
# rather than being looked up again through the deprecated ``.ix`` indexer
# (removed in pandas 1.0).
for idx, row in df_subset.iterrows():
    create_doc(uri_create, row.to_dict())

In [23]:
# create a function which helps to loop through json and prints the movie title alone
def format_results(results):
    """Print the ``movie_title`` of every hit in a parsed search response.

    Parameters
    ----------
    results : dict
        Parsed response. The hits are read from ``results['hits']['hits']``,
        and each hit's title from ``hit['_source']['movie_title']``.

    Raises
    ------
    KeyError
        If any of those keys is missing from the response.
    """
    for doc in results['hits']['hits']:
        # A single-argument print(...) call behaves the same under Python 2
        # and Python 3, unlike the bare print statement.
        print(doc['_source']['movie_title'])

In [24]:
# print all the movies
# Request up to 1000 hits in one response. The former "pretty=1>" parameter
# (note the stray ">") is dropped: the body is parsed with json.loads, so
# pretty-printing it has no effect on the result.
all_movies_response = requests.get("http://localhost:9200/movies/_search?size=1000")
all_movies = json.loads(all_movies_response.text)
format_results(all_movies)

The Dark Knight Rises
Chinatown
Mystic River
The Sixth Sense
Apocalypto
V for Vendetta
127 Hours
North by Northwest
Gravity
Munich
The Devil's Advocate
Captain Phillips
Elephant
Saw
Jackie Brown
Blood Diamond
Blue Velvet
Panic Room
Shutter Island
The Fugitive
Ghost Dog: The Way of the Samurai
Oldboy
Run Lola Run
Twelve Monkeys
Mad Max: Fury Road
The Girl with the Dragon Tattoo
Casino Royale
The Untouchables
Vertigo
The Lives of Others
Die Hard
The Unknown Woman
Blade Runner
Death Proof
Heat
Mulholland Dr.
Se7en
Sin City
The Others
Collateral
Jaws
The Revenant
Prisoners
Fargo
Lost Highway
JFK
The Secret in Their Eyes
Ronin
Sherlock Holmes
Desperado
A History of Violence
Nightcrawler
Inception
The Departed
Léon: The Professional
Kill Bill: Vol. 1
No Country for Old Men
Crash
Minority Report
The Prestige
The Usual Suspects
The Game
The Ghost Writer
Children of Men
Sweeney Todd: The Demon Barber of Fleet Street
Taken
Mr. Brooks
The Dark Knight
The Bourne Ultimatum
Black Swan
The Silence of

In [25]:
# Lets do some searching
# create a simple search function where key: value combination will be the query
def search(uri, col_name, term):
    """Run a single-field ``match`` query and return the parsed JSON response.

    Parameters
    ----------
    uri : str
        URL of the search endpoint the query is sent to.
    col_name : str
        Name of the document field to match against.
    term
        JSON-serialisable value to match in ``col_name``.

    Returns
    -------
    dict
        The response body decoded from JSON.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    query = json.dumps({"query": {"match": {col_name: term}}})
    # The query travels in the request body, so declare it as JSON explicitly.
    response = requests.get(
        uri,
        data=query,
        headers={"Content-Type": "application/json"},
    )
    # Fail here with the HTTP error rather than later with a confusing
    # KeyError when an error payload has no 'hits' entry.
    response.raise_for_status()
    return json.loads(response.text)

In [26]:
# Search endpoint for the "movies" index and "explore" type; "_search" is the keyword
uri_search = 'http://localhost:9200/movies/explore/_search'

# movies whose movie_year matches 1991
nineties_movies = search(uri=uri_search, col_name="movie_year", term=1991)
format_results(nineties_movies)

JFK
Cape Fear
The Silence of the Lambs


In [27]:
# get all movies directed by Christopher Nolan
# NOTE(review): the search term below is spelled "Christoper"; presumably the
# hits still come back because the "Nolan" part matches - confirm before
# correcting the literal, since that may change which movies are returned.
director_movies = search(uri=uri_search, col_name="movie_director", term="Christoper Nolan")
format_results(director_movies)

The Dark Knight Rises
The Dark Knight
Insomnia
Memento
Inception
The Prestige


In [28]:
# movies whose movie_hero field matches Leonardo DiCaprio
actor_movies = search(uri=uri_search, col_name="movie_hero", term="Leonardo DiCaprio")
format_results(actor_movies)

The Revenant
Blood Diamond
Shutter Island
Inception
The Departed


In [29]:
# So we created three functions: one for inserting documents, one for searching, and one for printing the results.
# Use del_indices.sh to delete all the contents and start fresh.

# todo
# aggregate queries