In [1]:
from elasticsearch import Elasticsearch
import json
import os

In [2]:
with open("config.json", 'r') as config_file:
    config = json.load(config_file)
    INDEX_PORT = config['port']
    INDEX_HOST = config['host']
    INDEX_USER = config['username']
    INDEX_PASS = config['psw']
    INDEX_NAME = config['surname']
    INDEX_URL = 'http://{}:{}/'.format(INDEX_HOST, INDEX_PORT)

In [3]:
def index_create():
    es = Elasticsearch(INDEX_URL, http_auth=(INDEX_USER, INDEX_PASS))
    if es.indices.exists(index=INDEX_NAME):
        es.indices.delete(index=INDEX_NAME)
    es.indices.create(index=INDEX_NAME)
    return es

In [4]:
def insert_text_examples():
    docs = ["Trump u.s.a. NATO", "trump usa N.A.T.O.", "the cat sleeps"]
    for line in docs:
        document = {'line_content': line.strip()}
        es.index(index=INDEX_NAME, body=document)

In [5]:
es = index_create()
insert_text_examples()

In [6]:
input_query = input('Insert a query: ').strip()
query_body = {'query': {'match': {'line_content': input_query}}}

res = es.search(index=INDEX_NAME, body=query_body)
for hit in res['hits']['hits']:
    print('score: {} - line: {}'.format(hit['_score'], hit['_source']['line_content']))

Insert a query: trump
score: 0.4700036 - line: Trump u.s.a. NATO
score: 0.4700036 - line: trump usa N.A.T.O.


In [7]:
def example_queries():
    queries = ["She is sleeping", "I am sleeping", "I live in the u.s.a.", "TRUMP"]
    for query in queries:
        query_body = {'query': {'match': {'line_content': query.strip()}}}

        res = es.search(index=INDEX_NAME, body=query_body)
        print("QUERY \"{}\":".format(query))
        for hit in res['hits']['hits']:
            print('score: {} - line: {}'.format(hit['_score'], hit['_source']['line_content']))
        print("================================================================================")
            
example_queries()

QUERY "She is sleeping":
QUERY "I am sleeping":
QUERY "I live in the u.s.a.":
score: 0.9808291 - line: Trump u.s.a. NATO
score: 0.9808291 - line: the cat sleeps
QUERY "TRUMP":
score: 0.4700036 - line: Trump u.s.a. NATO
score: 0.4700036 - line: trump usa N.A.T.O.


## Second Part

In [8]:
es = index_create()
mapping =  {
    "properties": { 
        "line_content": {
            "type": "text",
            "analyzer": "english"
        }      
    }    
}
es.indices.put_mapping(index=INDEX_NAME, body=mapping)

{'acknowledged': True}

In [9]:
es = index_create()
insert_text_examples()

In [10]:
input_query = input('Insert a query: ').strip()
query_body = {'query': {'match': {'line_content': input_query}}}

res = es.search(index=INDEX_NAME, body=query_body)
for hit in res['hits']['hits']:
    print('score: {} - line: {}'.format(hit['_score'], hit['_source']['line_content']))

Insert a query: trump
score: 0.4700036 - line: trump usa N.A.T.O.
score: 0.4700036 - line: Trump u.s.a. NATO


In [11]:
example_queries()

QUERY "She is sleeping":
QUERY "I am sleeping":
QUERY "I live in the u.s.a.":
score: 0.9808291 - line: Trump u.s.a. NATO
score: 0.9808291 - line: the cat sleeps
QUERY "TRUMP":
score: 0.4700036 - line: trump usa N.A.T.O.
score: 0.4700036 - line: Trump u.s.a. NATO


## Third Part

In [12]:
es = index_create()
mapping = {
    "properties":{
        "maintext": {
            "type": "text",
            "analyzer": "english"
        },
        "source": {
            "type": "text",
            "analyzer": "whitespace"
        }      
    }        
}
es.indices.put_mapping(index=INDEX_NAME, body=mapping)

{'acknowledged': True}

In [13]:
dir = "texts"
for filename in os.listdir(dir):
    f = os.path.join(dir, filename)
    with open(f, 'r') as article_file:
        text = json.load(article_file)
        document = {"maintext": text["maintext"], "source": text["source"]}
        es.index(index=INDEX_NAME, body=document)

unique sources: "The New York Times", "The Herald-ir"
some words: "Leclerc", "leclerc", "the", "aircraft"

In [14]:
source = input("Insert a news source: ").strip()
terms = input("Insert text terms: ").strip()
query_body = {
    "query": {
        "bool": {
            "should": [{"match": {"article_body": terms}}, {"match": {"source" : source}}]
            }      
        }        
    }
res = es.search(index=INDEX_NAME, body=query_body)
print ("Found {} results.".format(res['hits']['total']['value']))
for hit in res['hits']['hits']:
    print("=====================================================================")
    print ("score: {} source: {}".format(hit["_score"], hit["_source"]["source"]))
    print ("body: {}".format(hit["_source"]["maintext"])[:100])

Insert a news source: The New York Times
Insert text terms: Leclerc
Found 5 results.
score: 3.3360603 source: The New York Times
body: The revival of supersonic passenger travel, thought to be long dead with the demise of Concord
score: 0.09337806 source: The Herald-ir
body: Antonio Conte. Pic: PA
Head coach Antonio Conte does not think Chelsea are in the race to sign
score: 0.09337806 source: The Herald-ir
body: Hamid Sanambar
Gardai are hunting for a gunman who opened fire on a car in north Dublin - just
score: 0.09337806 source: The Herald-ir
body: Luke O'Reilly with his mother Janet O'Brien Luke O'Reilly Jack Hall Ellis The Metro One Bar in
score: 0.09337806 source: The Herald-ir
body: Charles Leclerc
Charles Leclerc registered the maiden win of his Formula One career after romp


## Fourth Part

In [15]:
es = index_create()
mapping = {
    "properties":{
        "maintext": {
            "type": "text",
            "analyzer": "english"
        },
        "source": {
            "type": "text",
            "analyzer": "whitespace"
        },
        "pub-date": {
            "type": "date",
             "format": "yyyy-MM-dd"
        }
    }        
}
es.indices.put_mapping(index=INDEX_NAME, body=mapping)

{'acknowledged': True}

In [16]:
dir = "texts"
for filename in os.listdir(dir):
    f = os.path.join(dir, filename)
    with open(f, 'r') as article_file:
        text = json.load(article_file)
        document = {"maintext": text["maintext"], "source": text["source"], "pub-date": text["date"]}
        es.index(index=INDEX_NAME, body=document)

In [17]:
source = input("Insert a news source: ").strip()
terms = input("Insert text terms: ").strip()
query_body = {
    "query": {
        "bool": {
            "should": [{"match": {"maintext": terms}}, {"match": {"source": source}}],
            "minimum_should_match": 1,
            "must": [{"range": {"pub-date": {"gt":"2022-01-01"}}}]
        }      
    }        
}

res = es.search(index=INDEX_NAME, body=query_body)
print ("Found {} results.".format(res['hits']['total']['value']))
for hit in res['hits']['hits']:
    print ("score: {} source: {}".format(hit["_score"], hit["_source"]["source"]))
    print ("body: {}".format(hit["_source"]["maintext"])[:100])

Insert a news source: The New York Times
Insert text terms: 
Found 1 results.
score: 4.3360605 source: The New York Times
body: The revival of supersonic passenger travel, thought to be long dead with the demise of Concord
