In [1]:
import pandas as pd

In [21]:
!pip install farm-haystack

from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TfidfRetriever,BM25Retriever
import pandas as pd



In [2]:
data = pd.read_csv('/content/pif_companies_filtered.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97 entries, 0 to 96
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Title        97 non-null     object
 1   Description  97 non-null     object
dtypes: object(2)
memory usage: 1.6+ KB


# using TfidfVectorizer from sklearn


In [18]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

data['Description'].fillna('', inplace=True)
data['Description'] = data['Description'].str.lower()

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data['Description'])

def search_companies(query, top_n=5):
  query_vec = vectorizer.transform([query.lower()]) # query to lower case
  similarity_scores = np.dot(query_vec, tfidf_matrix.T).toarray()[0]
  top_indices = np.argsort(similarity_scores)[::-1][:top_n]
  results = data.iloc[top_indices]
  results['similarity_score'] = similarity_scores[top_indices]
  return results


search_results = search_companies("what is tic", top_n=3)
search_results

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Description'].fillna('', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results['similarity_score'] = similarity_scores[top_indices]


Unnamed: 0,Title,Description,similarity_score
41,Tahakom Investments Company (TIC),tic is a holding company that owns several sub...,0.32528
6,Kayanee,kayanee is a public investment fund (pif) comp...,0.179599
74,Saudi Electricity Company (SEC),incorporated in accordance with council of min...,0.097491


# using TfidfRetriever from Haystack


In [None]:
documents = []
for index, row in data.iterrows():
    documents.append({"content": row["Description"], "meta": {"company_name": row["Title"], "id": index}})

document_store = InMemoryDocumentStore(use_bm25=True)

document_store.write_documents(documents)

In [33]:
retriever = TfidfRetriever (document_store=document_store)

In [34]:
def search_companies_haystack(query, top_n=3):
    results = retriever.retrieve(query=query, top_k=top_n)
    return results

search_results = search_companies_haystack("when was riyadh air lunched?", top_n=3)
search_results

[<Document: {'content': 'riyadh air is a world-class airline owned by the public investment fund (pif). launched in march 2023 and hubbed in the capital of the kingdom, the airline will be a digitally-led, full service airline that adopts the best global sustainability and safety practices across its advanced fleet of aircraft. riyadh air will equip its aircrafts with the most advanced, state-of-the-art features with innovative, best-in-class cabin interiors and experiences, including next generation digital in-flight entertainment systems and connectivity solutions. riyadh air will connect guests to over 100 destinations around the world by 2030 through offering an exceptional guest experience with an authentic, warm saudi hospitality at its heart.', 'content_type': 'text', 'score': None, 'meta': {'company_name': 'Riyadh Air', 'id': 12}, 'id_hash_keys': ['content'], 'embedding': None, 'id': '6fa8db4ccee9332a67808a7178293cd8'}>,
 <Document: {'content': 'when it comes to offering, seven

In [35]:
search_results[0].content , search_results[0].score

('riyadh air is a world-class airline owned by the public investment fund (pif). launched in march 2023 and hubbed in the capital of the kingdom, the airline will be a digitally-led, full service airline that adopts the best global sustainability and safety practices across its advanced fleet of aircraft. riyadh air will equip its aircrafts with the most advanced, state-of-the-art features with innovative, best-in-class cabin interiors and experiences, including next generation digital in-flight entertainment systems and connectivity solutions. riyadh air will connect guests to over 100 destinations around the world by 2030 through offering an exceptional guest experience with an authentic, warm saudi hospitality at its heart.',
 None)

# using BM25Retriever from hatstack

In [36]:
retriever = BM25Retriever (document_store=document_store)

In [37]:
def search_companies_haystack(query, top_n=3):
    results = retriever.retrieve(query=query, top_k=top_n)
    return results

search_results = search_companies_haystack("when was riyadh air lunched?", top_n=3)
search_results

[<Document: {'content': 'riyadh air is a world-class airline owned by the public investment fund (pif). launched in march 2023 and hubbed in the capital of the kingdom, the airline will be a digitally-led, full service airline that adopts the best global sustainability and safety practices across its advanced fleet of aircraft. riyadh air will equip its aircrafts with the most advanced, state-of-the-art features with innovative, best-in-class cabin interiors and experiences, including next generation digital in-flight entertainment systems and connectivity solutions. riyadh air will connect guests to over 100 destinations around the world by 2030 through offering an exceptional guest experience with an authentic, warm saudi hospitality at its heart.', 'content_type': 'text', 'score': 0.778343414401467, 'meta': {'company_name': 'Riyadh Air', 'id': 12}, 'id_hash_keys': ['content'], 'embedding': None, 'id': '6fa8db4ccee9332a67808a7178293cd8'}>,
 <Document: {'content': "boutique group, a h