In [None]:
!pip install elasticsearch

In [None]:
import os

import pandas as pd
from elasticsearch import Elasticsearch, RequestsHttpConnection
from elasticsearch import helpers

# Filler tokens (legal suffixes, stop words and truncated spellings of them)
# that the search helpers in this notebook drop from a name before querying.
EXCLUDED_WORDS = [
    'limited', 'private', 'and', 'ltd', 'pvt',
    '&', 'company', 'ltd.', 'pvt.', 'pvt.ltd.',
    'limited.', 'co', 'limite', 'a', 'of',
    'ed', 'limi', 'lim', 'lmtd', 'the',
]

# Shared client used by every cell below: a node on localhost, with HTTP
# compression enabled and timeout=60.
es = Elasticsearch(
    [{'host': 'localhost'}],
    http_compress=True,
    timeout=60,
)

In [None]:
# Index definition for "match_keywords".
#
# `my_custom_analyzer` strips every run of whitespace (space_removal char
# filter), keeps the remaining text as one token (keyword tokenizer), then
# applies the lowercase and asciifolding token filters. It is attached to the
# `keyword_2.variation` sub-field only; `keyword_2` itself stays a plain text
# field.
body = {
    "settings": {
        "analysis": {
            "analyzer": {
                "my_custom_analyzer": {
                    "char_filter": ["space_removal"],
                    "tokenizer": "keyword",
                    "filter": ["lowercase", "asciifolding"],
                },
            },
            "char_filter": {
                "space_removal": {
                    "type": "pattern_replace",
                    "pattern": "\\s+",
                    "replacement": "",
                },
            },
        },
    },
    "mappings": {
        "properties": {
            "keyword_2": {
                "type": "text",
                "fields": {
                    "variation": {
                        "type": "text",
                        "analyzer": "my_custom_analyzer",
                    },
                },
            },
            "entry_id": {"type": "long"},
            "keyword_1": {
                "type": "text",
                "fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
            },
            "keyword_3": {
                "type": "text",
                "fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
            },
            "keyword_4": {
                "type": "text",
                "fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
            },
        },
    },
}

# creating index
# ignore=400 asks the client not to raise on an HTTP 400 response
# (presumably the "index already exists" case when the cell is re-run - confirm).
es.indices.create(index="match_keywords", ignore=400, body=body)

In [None]:
# Fields copied from each source row into the indexed document.
COLUMNS = [
    'entry_id',
    'keyword_1',
    'keyword_2',
    'keyword_3',
    'keyword_4',
    'status',
]


def filterKeys(document):
    """Return a new dict holding only the COLUMNS entries of ``document``.

    ``document`` must support ``document[key]`` for every name in COLUMNS;
    a missing key propagates as the container's own lookup error.
    """
    selected = {}
    for column in COLUMNS:
        selected[column] = document[column]
    return selected

def doc_generator(df):
    """Yield one bulk-index action per row of ``df``, then raise TypeError.

    Each action targets the 'match_keywords' index with type "_doc", uses the
    row's ``entry_id`` formatted as a string for ``_id`` and the filterKeys()
    projection of the row as ``_source``.

    Raises:
        TypeError: always, once every row has been yielded (immediately for
            an empty frame). The upload loop in this notebook catches it as
            its end-of-page signal.
            NOTE(review): an exception raised from inside the iterable may
            stop a buffering consumer before it flushes what it has buffered
            - confirm this signal is safe to pass straight to helpers.bulk.
    """
    for _, row in df.iterrows():
        action = {
            "_index": 'match_keywords',
            "_type": "_doc",
            "_id": f"{row['entry_id']}",
            "_source": filterKeys(row),
        }
        yield action
    raise TypeError

In [None]:
import psycopg2

# Connection settings for the source PostgreSQL database.
# Each value is taken from the matching libpq-style environment variable when
# it is set, so real credentials do not have to be saved in the notebook; the
# literals are only fallbacks.
DB = {
    'database': os.environ.get("PGDATABASE", "***"),
    'user': os.environ.get("PGUSER", "***"),
    'password': os.environ.get("PGPASSWORD", "***"),
    'port': os.environ.get("PGPORT", "5432"),
    'host': os.environ.get("PGHOST", "***"),
}


def get_data_offset(query, connection, offset, limit=1000):
    """Fetch one page of ``query`` results as a DataFrame.

    Args:
        query: SELECT statement without a paging clause or trailing semicolon.
        connection: open database connection accepted by ``pandas.read_sql``.
        offset: number of leading rows to skip.
        limit: maximum number of rows in the page. Callers paging through a
            result set must advance ``offset`` by this same amount.

    Returns:
        DataFrame with at most ``limit`` rows; it is empty once ``offset`` is
        past the last row.

    Raises:
        Exception: whatever ``pandas.read_sql`` raises when the single retry
            fails as well.
    """
    # int() keeps anything but plain numbers out of the string-built SQL.
    # "limit ... offset ..." is used because that clause order is accepted by
    # PostgreSQL and by other common engines alike.
    query_with_offset = f"{query} limit {int(limit)} offset {int(offset)};"
    try:
        return pd.read_sql(query_with_offset, connection)
    except Exception as exc:  # not a bare except: Ctrl-C must still interrupt
        print(f"Error ({exc!r}), retrying once")
        return pd.read_sql(query_with_offset, connection)

# Base SELECT for the export. get_data_offset() appends the paging clause and
# the terminating semicolon, so neither belongs here. Ordering by entry_id
# gives the offset-based paging a deterministic row order (assuming entry_id
# is unique - confirm).
# NOTE(review): `table` looks like a placeholder for the real relation name.
query ="""select 
        entry_id,keyword_1,keyword_2,keyword_3,keyword_4,status 
        from table 
        order by entry_id"""

# uploading bulk data to es from psql

PAGE_SIZE = 1000  # must equal the number of rows get_data_offset() reads per call


def _connect_to_db():
    """Open a new PostgreSQL connection from the DB settings."""
    return psycopg2.connect(database=DB['database'], user=DB['user'],
                            password=DB['password'], host=DB['host'],
                            port=DB['port'])


def _collect_actions(df):
    """Materialise doc_generator(df) into a list of bulk actions.

    doc_generator finishes by raising TypeError instead of returning.
    helpers.bulk groups actions into chunks and sends the last, partially
    filled chunk only after its input is exhausted normally, so an exception
    coming out of the generator makes it drop that chunk. Draining the
    generator here keeps the signal away from the helper.
    NOTE: a genuine TypeError raised while building a row is swallowed too.
    """
    actions = []
    try:
        for action in doc_generator(df):
            actions.append(action)
    except TypeError:
        pass  # end-of-rows signal from doc_generator
    return actions


# Connect explicitly instead of relying on a NameError for `conn` being caught.
conn = _connect_to_db()
offset = 0
try:
    while True:
        try:
            df = get_data_offset(query, conn, offset)
        except Exception:
            # The connection may have dropped: reconnect once and retry the
            # same page; a second failure propagates.
            print("Connecting Again")
            conn.close()
            conn = _connect_to_db()
            df = get_data_offset(query, conn, offset)
        if len(df) == 0:
            break  # past the last row - nothing left to index
        helpers.bulk(es, _collect_actions(df))
        offset += PAGE_SIZE
        print(offset)
finally:
    conn.close()

print("Indexing done")

In [None]:
from pprint import pprint

def get_top_match_inactive(emp_name):
    """Search 'match_keywords' for up to 4 matches whose status is not "active".

    The name is lowercased and every token listed in EXCLUDED_WORDS is
    dropped. The query excludes documents with the term status="active",
    term-filters keyword_4 on the first character of the cleaned name, and
    scores with a plain match on keyword_2 plus a fuzzy (AUTO) match on
    keyword_2.variation.

    Returns:
        The es.search() response, or {} when no token survives the cleaning.
    """
    tokens = [word for word in emp_name.lower().split() if word not in EXCLUDED_WORDS]
    cleaned = " ".join(tokens)
    if not cleaned:
        return {}

    # First character of the first remaining token.
    initial = tokens[0][0]

    status_exclusion = {"term": {"status": "active"}}
    initial_filter = {"term": {"keyword_4": initial}}
    scoring_clauses = [
        {"match": {"keyword_2": cleaned}},
        {"match": {"keyword_2.variation": {"query": cleaned, "fuzziness": "AUTO"}}},
    ]
    search_body = {
        "size": 4,
        "query": {
            "bool": {
                "must_not": status_exclusion,
                "filter": initial_filter,
                "should": scoring_clauses,
            }
        },
    }
    return es.search(index="match_keywords", body=search_body)

def get_top_match_active(emp_name):
    """Search 'match_keywords' for up to 4 matches whose status is "active".

    The name is lowercased and every token listed in EXCLUDED_WORDS is
    dropped. Hits are filtered on two terms - keyword_4 equal to the first
    character of the cleaned name and status equal to "active" - and scored
    with a plain match on keyword_2 plus a fuzzy (AUTO, operator "and") match
    on keyword_2.variation.

    Returns:
        The es.search() response, or {} when no token survives the cleaning.
    """
    emp = " ".join(elem for elem in emp_name.lower().split() if elem not in EXCLUDED_WORDS)
    if not emp:
        return {}

    # emp starts with its first token, so emp[0] is that token's first character.
    first_letter = emp[0]

    body = {
        "size": 4,
        "query": {
            "bool": {
                # Both conditions must be separate clauses in a list: written
                # as one dict with two "term" keys, the later key silently
                # replaces the earlier and the keyword_4 filter is lost.
                "filter": [
                    {"term": {"keyword_4": first_letter}},
                    {"term": {"status": "active"}},
                ],
                "should": [
                    {"match": {"keyword_2": emp}},
                    {
                        "match": {
                            "keyword_2.variation": {
                                "query": emp,
                                "fuzziness": "AUTO",
                                "operator": "and",
                            }
                        }
                    },
                ],
            }
        },
    }
    return es.search(index="match_keywords", body=body)

In [None]:
def get_suggestions(emp_name):
    """Run a keyword_2 match query together with a suggester named "term_suggester".

    The name is lowercased and every token listed in EXCLUDED_WORDS is
    dropped; the cleaned text is used both as the match query and as the
    suggester text.

    NOTE(review): ``max_errors`` and ``collate`` are documented as options of
    the phrase suggester - confirm Elasticsearch accepts them under ``term``.

    Returns:
        The es.search() response, or {} when no token survives the cleaning.
    """
    cleaned = " ".join(
        word for word in emp_name.lower().split() if word not in EXCLUDED_WORDS
    )
    if not cleaned:
        return {}

    # Template query run per suggestion; prune is set to True.
    collate = {
        "query": {
            "inline": {
                "match_phrase": {
                    "{{field_name}}": {
                        "query": "{{suggestion}}",
                        "slop": 1,
                    }
                }
            }
        },
        "params": {"field_name": "keyword_2"},
        "prune": True,
    }
    request = {
        "query": {"match": {"keyword_2": cleaned}},
        "suggest": {
            "term_suggester": {
                "text": cleaned,
                "term": {
                    "field": "keyword_2",
                    "max_errors": 2,
                    "collate": collate,
                },
            }
        },
    }
    return es.search(index="match_keywords", body=request)

In [None]:
def get_connections(emp_name):
    """Search 'match_keywords' for up to 4 hits using an AND-joined form of the name.

    The name is lowercased and every token listed in EXCLUDED_WORDS is
    dropped. The remaining tokens are combined as "(a) AND (b) AND ..." and
    that text is matched against keyword_2 and, fuzzily, against
    keyword_2.variation; the plain space-joined text is matched against
    keyword_2 as a third clause.

    NOTE(review): a ``match`` query analyzes its text instead of parsing
    boolean operators, so "AND" is probably treated as just another word -
    confirm whether ``query_string`` or ``"operator": "and"`` was intended.

    Returns:
        The es.search() response, or {} when no token survives the cleaning.
    """
    emp_names = [elem for elem in emp_name.lower().split() if elem not in EXCLUDED_WORDS]
    if not emp_names:
        return {}
    emp = " ".join(emp_names)

    # Join every token. The previous loop used an f-string without braces
    # around the accumulator, so the literal text "should_cond_text" was
    # sent and all tokens but the last were dropped.
    should_cond_text = " AND ".join(f"({name})" for name in emp_names)

    should_cond = [
        {"match": {"keyword_2": should_cond_text}},
        {
            "match": {
                "keyword_2.variation": {
                    "query": should_cond_text,
                    "fuzziness": "AUTO",
                }
            }
        },
        {"match": {"keyword_2": emp}},
    ]
    body = {
        "size": 4,
        "query": {
            "bool": {
                "should": should_cond,
            }
        },
    }
    return es.search(index="match_keywords", body=body)

In [None]:
# comparing 4 variations

# Only the last expression of a cell is displayed, so the four responses are
# gathered into one dict (keyed by function name) and that dict is the cell's
# final expression. The functions are called in the original order.
sample_name = "tata lockheedmartin"
{
    matcher.__name__: matcher(sample_name)
    for matcher in (
        get_top_match_inactive,
        get_top_match_active,
        get_suggestions,
        get_connections,
    )
}

In [None]:
# Display the mapping Elasticsearch reports for the "match_keywords" index.
es.indices.get_mapping(index="match_keywords")

In [None]:
# Check what my_custom_analyzer produces by analyzing through the sub-field
# that uses it. The mapping created above has no field named
# "text.my_custom_analyzer"; per the Analyze API documentation a request
# without a resolvable field analyzer falls back to the index default, so that
# field name would not exercise the custom analyzer.
res = es.indices.analyze(index="match_keywords", body = {
  "field": "keyword_2.variation",
  "text": "tata lock heed martin "
})
# Print each token the analyzer emitted for the sample text.
for i in res['tokens']:
    print(i['token'])