# Demo : ETL Elasticsearch (Transform-Load)

Import libraries. At this point, you should have already run the previous lessons and have the libraries installed.

In [1]:
import os
from pprint import pprint

import pandas as pd
import psycopg2
from elasticsearch import Elasticsearch

Open a connection to PostgreSQL.

In [2]:
# Open the PostgreSQL connection and a cursor for SQL interaction.
# Connection settings are read from environment variables so that no
# credentials are stored in the notebook: PG_PASSWORD is required, while
# PG_HOST / PG_DBNAME / PG_USER fall back to the course defaults.
try:
    conn = psycopg2.connect(
        host=os.environ.get("PG_HOST", "34.101.229.192"),
        dbname=os.environ.get("PG_DBNAME", "postgres"),
        user=os.environ.get("PG_USER", "postgres"),
        password=os.environ["PG_PASSWORD"],
    )
    # Autocommit: every statement is committed immediately.
    conn.set_session(autocommit=True)
    cur = conn.cursor()
except Exception as e:
    print("Error: cannot open cursor for SQL interaction")
    print(e)
    # Re-raise so the notebook stops here instead of failing in a later cell
    # with a NameError on `conn`.
    raise

Open a connection to Elasticsearch.

In [3]:
# Create the Elasticsearch client.
# The password is read from the ES_PASSWORD environment variable (required) so
# that it is not stored in the notebook; ES_HOST / ES_USER fall back to the
# course defaults.
try:
    es = Elasticsearch(
        hosts=[
            os.environ.get(
                "ES_HOST",
                "https://data-engineering.es.asia-southeast1.gcp.elastic-cloud.com:9243",
            )
        ],
        http_auth=(os.environ.get("ES_USER", "elastic"), os.environ["ES_PASSWORD"]),
    )
except Exception as e:
    print("Error: cannot open cursor for Elasticsearch interaction")
    print(e)
    # Re-raise so the notebook stops here instead of failing in a later cell
    # with a NameError on `es`.
    raise

Read the data from the staging table.

In [4]:
# Query that selects every column and row of the staging table.
sql_movie_principals = "SELECT * FROM staging.stg_es_movie_principals"

In [5]:
# Run the staging query over the open PostgreSQL connection and load the
# result set into a DataFrame.
df_movie_principals = pd.read_sql(sql=sql_movie_principals, con=conn)

Define the index name; it will be used in later cells.

In [6]:
# Name of the Elasticsearch index that the cells below create, fill and query.
es_index_name = "movie-principals"

Optionally, create the index with manual settings.

In [7]:
# Index settings as a JSON string: 3 shards and 2 replicas.
es_index_body = """
{
  "settings": {
    "number_of_shards": 3,
    "number_of_replicas": 2
  }
}
"""

try:
    # Only create the index when it does not exist yet, so re-running this
    # cell is a no-op rather than a printed "already exists" error.
    # API arguments are passed by keyword; positional arguments are deprecated
    # in the 7.x client.
    if not es.indices.exists(index=es_index_name):
        es.indices.create(index=es_index_name, body=es_index_body)
except Exception as e:
    print(e)

For the demo, just take a random sample of 10,000 rows.

In [8]:
# Take at most 10,000 random rows. The size is capped at the number of
# available rows because DataFrame.sample raises ValueError when n exceeds the
# row count (sampling without replacement). A fixed random_state makes the
# sample reproducible across runs.
df_movie_principals = df_movie_principals.sample(
    n=min(10000, len(df_movie_principals)),
    random_state=42,
)

Note that we might have data like this from staging.

| title_id  | name_id  | category  | job  | character  |
|---|---|---|---|---|
| title01  | name99  | AAA  |   |   |
| title01  | name99  |   | BBB  |   |
| title01  | name99  |   | CCC  |   |
| title01  | name99  |   |   | DDD  |

So the only transformation we need is to aggregate such rows into one row per (`title_id`, `name_id`) pair. In the example above, the multiple values in column `job` are joined by commas, so we have

| title_id  | name_id  | category  | job  | character  |
|---|---|---|---|---|
| title01  | name99  | AAA  | BBB, CCC  | DDD  |


In [9]:
def f_agg(values):
    """Join the distinct non-null entries of one grouped column with ', '."""
    distinct = values.dropna().unique()
    return ', '.join(distinct)


# Collapse all rows sharing the same (title_id, name_id) pair into one row,
# applying f_agg to every remaining column.
df_movie_principals = (
    df_movie_principals
    .groupby(['title_id', 'name_id'], as_index=False)
    .agg(f_agg)
)

See a sample of the aggregated data.

In [10]:
# Display five randomly chosen rows of the aggregated frame.
df_movie_principals.sample(n=5)

Unnamed: 0,title_id,name_id,title,original_title,production_company,description,name,category,job,character
5665,tt0467110,nm0227759,Underdog - Storia di un vero supereroe,Underdog,Have No Fear Productions,A Beagle must use his newly-bestowed superpowe...,Peter Dinklage,actor,,
6853,tt1450320,nm0752117,Días de gracia,Días de gracia,ARP Sélection,"Mexico City. 2002, 2006, 2010. A cop. A hostag...",David Rutsala,,additional writing,
2822,tt0094169,nm0322946,I duri non ballano,Tough Guys Don't Dance,Golan-Globus Productions,"Writer, ex-con and 40-something bottle-baby Ti...",Yoram Globus,producer,,
8622,tt4183372,nm5667278,De Onde Eu Te Vejo,De Onde Eu Te Vejo,Bossa Nova Films,"After 20 years of marriage, Ana Lúcia and her ...",Leonardo Moreira,writer,,
5240,tt0382810,nm0675476,Little Fish - Fuga dall'incubo,Little Fish,Porchlight Films,Set in the Little Saigon district outside of S...,Jacquelin Perske,,written by,


Load the aggregated data into Elasticsearch, using our own document id (`title_id`-`name_id`).

In [11]:
# Index every aggregated row as one Elasticsearch document.
for _, rec in df_movie_principals.iterrows():
    # Document id is "<title_id>-<name_id>", with name_id lower-cased and
    # stripped of spaces; re-indexing the same pair overwrites the document
    # instead of creating a duplicate.
    key = "{}-{}".format(rec["title_id"], rec["name_id"].lower().replace(" ", ""))
    # Keyword arguments only: positional API arguments are deprecated in the
    # 7.x client.
    es.index(index=es_index_name, body=rec.to_json(), id=key)

Simple query demo

In [13]:
# Query DSL, kept as a JSON string: a `match` query for "love" on the
# `title` field.
search_body = '''
{
    "query": {
        "match": {
            "title": "love"
        }
    }
}
'''

# Run the search against the index named by es_index_name and pretty-print the
# raw response.
res = es.search(index=es_index_name, body=search_body)
pprint(res)

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 3, 'total': 3},
 'hits': {'hits': [{'_id': 'tt0392360-nm0006795',
                    '_index': 'movie-principals',
                    '_score': 7.18776,
                    '_source': {'category': '',
                                'character': 'Prithvi',
                                'description': 'Prithvi and Maggie fall in '
                                               'love and decide to get '
                                               "married, but Maggie's mother "
                                               "objects looking at Prithvi's "
                                               'past.',
                                'job': '',
                                'name': 'Salman Khan',
                                'name_id': 'nm0006795',
                                'original_title': 'Love',
                                'production_company': 'Unknown',
                                'title': 'Love