In [1]:
from txtai import Embeddings
import os
import pandas as pd
import re

In [2]:
# Notebook configuration.
# EMBEDDINGS_MODEL is passed as the "path" setting when the Embeddings object
# is created; EMBEDDINGS_PATH is where the built index is saved and reloaded.
EMBEDDINGS_MODEL = "sentence-transformers/nli-mpnet-base-v2"
EMBEDDINGS_PATH = "./text_embeddings"
# The input CSV is read from DATA_DIR/FILE_NAME.
FILE_NAME = "Articles.csv"
DATA_DIR = "../datasets"

In [3]:
# Load the data
# Fail fast with a clear message when the CSV is missing; otherwise `df` would
# never be assigned and the cell would die on `df.head()` with an unrelated
# NameError.
data_path = os.path.join(DATA_DIR, FILE_NAME)
if not os.path.isfile(data_path):
    raise FileNotFoundError(f"Dataset not found: {data_path}")
# The file is decoded as latin1, as in the original load call.
df = pd.read_csv(data_path, encoding='latin1')

df.head()

Unnamed: 0,Article,Date,Heading,NewsType
0,KARACHI: The Sindh government has decided to b...,1/1/2015,sindh govt decides to cut public transport far...,business
1,HONG KONG: Asian markets started 2015 on an up...,1/2/2015,asia stocks up in new year trad,business
2,HONG KONG: Hong Kong shares opened 0.66 perce...,1/5/2015,hong kong stocks open 0.66 percent lower,business
3,HONG KONG: Asian markets tumbled Tuesday follo...,1/6/2015,asian stocks sink euro near nine year,business
4,NEW YORK: US oil prices Monday slipped below $...,1/6/2015,us oil prices slip below 50 a barr,business


In [4]:
# Data pre-processing
# Each article starts with a dateline such as "KARACHI: "; keep only the text
# after the first colon (and any spaces that follow it).
# Entries without a colon are dropped, as are non-string cells (e.g. the NaN
# pandas produces for a missing Article), which would otherwise raise a
# TypeError on the `":" in text` check.
# NOTE(review): the split is on the FIRST colon wherever it occurs, so an
# article without a dateline but with a later colon loses its leading text.
data = [
    # maxsplit is passed by keyword: the positional form is deprecated
    # since Python 3.13.
    re.split(": *", text, maxsplit=1)[1]
    for text in df['Article'].tolist()
    if isinstance(text, str) and ":" in text
]
data[:5]

['The Sindh government has decided to bring down public transport fares by 7 per cent due to massive reduction in petroleum product prices by the federal government, Geo News reported.Sources said reduction in fares will be applicable on public transport, rickshaw, taxi and other means of traveling.Meanwhile, Karachi Transport Ittehad (KTI) has refused to abide by the government decision.KTI President Irshad Bukhari said the commuters are charged the lowest fares in Karachi as compare to other parts of the country, adding that 80pc vehicles run on Compressed Natural Gas (CNG). Bukhari said Karachi transporters will cut fares when decrease in CNG prices will be made.                        \r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n',
 'Asian markets started 2015 on an upswing in limited trading on Friday, with mainland Chinese stocks surging in Hong Kong on speculation Beijing may ease monetary policy to boost slowing growth.Hong Kong rose 1.07 percent, closing 252.78 points higher at

In [5]:
# Create the txtai Embeddings instance backed by EMBEDDINGS_MODEL.
# NOTE(review): "content" presumably enables storing the original text with
# the index (search results further down include a 'text' field) - confirm
# against the txtai configuration docs.
embeddings = Embeddings(dict(path=EMBEDDINGS_MODEL, content=True))

In [6]:
# Build the embeddings index from `data`, the list of article texts produced
# by the pre-processing cell.
# NOTE(review): no explicit ids are supplied; ids appear to be auto-assigned
# (the search output shows ids such as '791') - confirm against txtai docs.
embeddings.index(data)

In [7]:
# Persist the built index to EMBEDDINGS_PATH so it can be reloaded later
# without re-indexing the articles.
embeddings.save(EMBEDDINGS_PATH)

In [8]:
# Reload the index from EMBEDDINGS_PATH, the same location the save cell
# writes to.
# NOTE(review): directly after a save this looks redundant; presumably it is
# kept so a later session can skip the indexing cell - confirm.
embeddings.load(EMBEDDINGS_PATH)

In [9]:
# Run a single semantic search against the index and display the matches.
# `num_results` is passed as the second argument to `search`.
num_results = 5
query = "Science and technology"
results = embeddings.search(query, num_results)
results

[{'id': '791',
  'text': 'Scientists from India, the United States and Japan have struck upon a large natural gas deposit in the Bay of Bengal, the first potentially producible discovery of its kind in the Indian Ocean.</strongA research expedition carried out jointly by the three countries discovered the natural gas hydrate -- an ice-like form of the fuel -- off India´s east coast, the United States Geological Survey (USGS) said in a statement Monday.Energy-hungry India is heavily dependent on imports to meet its oil and gas needs and is hungry to secure more of its own supplies."The results from this expedition mark a critical step forward to understanding the energy resource potential of gas hydrates," said USGS Senior Scientist Tim Collett.The discovery in the sand reservoirs of the Krishna-Godavari Basin contains "what we believe to be several of the largest and most concentrated gas hydrate accumulations yet found in the world," Collett said.The amount of natural gas locked in hy