In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the dataset
file_path = '../data/pif_companies_filtered.csv'
df = pd.read_csv(file_path)

# Fail fast if the CSV schema is not what the rest of the notebook expects:
# the preprocessing and search steps below read both of these columns, and a
# bare KeyError raised there is much harder to trace back to the input file.
missing_columns = {'Title', 'Description'} - set(df.columns)
if missing_columns:
    raise KeyError(
        f"{file_path} is missing required column(s): {sorted(missing_columns)}"
    )

# Punctuation stripper used on the text columns before stopword filtering
def clean_text(text):
    """Delete punctuation/special characters from ``text`` and trim its ends.

    Every character that is neither a word character (letter, digit,
    underscore) nor whitespace is removed outright -- it is NOT replaced by a
    space, so "state-owned" becomes "stateowned". Leading and trailing
    whitespace is then stripped; interior whitespace is left as-is.

    Args:
        text: Value to clean. Anything that is not a ``str`` (e.g. NaN from a
            missing CSV cell) is passed through untouched.

    Returns:
        The cleaned string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text
    without_symbols = re.sub(r'[^\w\s]', '', text)
    return without_symbols.strip()

# Minimal hand-written English stopword list (avoids an NLTK dependency).
# The trailing .difference(...) guarantees that the listed domain terms are
# never treated as stopwords, whatever the base list contains.
basic_stopwords = {
    "a", "an", "and", "are", "as", "at", "be", "by", "for", "from",
    "has", "he", "in", "is", "it", "its", "of", "on", "that", "the",
    "to", "was", "were", "will", "with",
}.difference(
    {'company', 'group', 'investment', 'bank', 'development', 'saudi', 'international'}
)


def process_text_no_nltk(text):
    """Drop stopwords from a whitespace-separated string.

    The text is split on runs of whitespace, every token found in
    ``basic_stopwords`` is discarded, and the survivors are re-joined with
    single spaces. The comparison is case-sensitive, so the caller is
    expected to lower-case the text first.

    Args:
        text: Value to filter. Non-string values are returned unchanged.

    Returns:
        The filtered string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text
    return ' '.join(
        token for token in text.split() if token not in basic_stopwords
    )

# Clean and preprocess Title and Description.
# Text is lower-cased first because the stopword filter compares against
# lower-case words.
df['Title'] = df['Title'].str.lower().apply(clean_text).apply(process_text_no_nltk)

# A missing description is NaN after read_csv and would pass through every
# step above unchanged; TfidfVectorizer then refuses it ("np.nan is an invalid
# document"). Replace missing values with an empty string (and coerce any stray
# non-string cell to str) so every row stays in the corpus and the matrix rows
# remain aligned with df's row positions.
df['Description'] = (
    df['Description']
    .fillna('')
    .astype(str)
    .str.lower()
    .apply(clean_text)
    .apply(process_text_no_nltk)
)

# TF-IDF Vectorizer and Matrix: one row per company description, in df order.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['Description'])

# Query Function
def search_companies(query, top_n=5):
    """Return the companies whose descriptions best match a free-text query.

    The query is normalised exactly like the indexed descriptions
    (lower-cased, punctuation removed, stopwords dropped), projected into the
    fitted TF-IDF space and scored against every description with a dot
    product. With TfidfVectorizer's default L2 normalisation this dot product
    is the cosine similarity.

    Args:
        query: Search text. Must be a ``str`` (``.lower()`` is called on it).
        top_n: Maximum number of rows to return. Defaults to 5.

    Returns:
        A copy of the ``top_n`` best-scoring rows of ``df``, ordered from most
        to least similar, with an added ``similarity_score`` column. Rows with
        a score of 0 can appear when fewer than ``top_n`` descriptions share
        any term with the query.
    """
    # Apply the same cleaning used on df['Description']; otherwise a query
    # such as "state-owned" is tokenised as "state" + "owned" while the corpus
    # holds "stateowned", and the two never match.
    normalized_query = process_text_no_nltk(clean_text(query.lower()))
    query_vec = vectorizer.transform([normalized_query])

    # Use the sparse-aware `@` operator: np.dot does not understand SciPy
    # sparse matrices and only works through incidental operator overloading.
    similarity_scores = (query_vec @ tfidf_matrix.T).toarray()[0]

    # argsort is ascending, so reverse it and keep the first top_n positions.
    top_indices = np.argsort(similarity_scores)[::-1][:top_n]

    # Matrix row i corresponds to the i-th row of df, hence positional iloc.
    results = df.iloc[top_indices].copy()
    results['similarity_score'] = similarity_scores[top_indices]
    return results



                              Title  \
3       tasaru mobility investments   
35                             jada   
48  saudi jordanian investment fund   

                                          Description  similarity_score  
3   tasaru mobility investments investment gateway...          0.396096  
35  jada created promote development thriving priv...          0.115275  
48  saudi jordanian investment fund sjif limited p...          0.104289  


In [5]:
# Demo: fetch and print the three best matches for a sample question
search_results = search_companies(query="WHAT IS ARDARA", top_n=3)
print(search_results)

                            Title  \
2                          ardara   
6                         kayanee   
74  saudi electricity company sec   

                                          Description  similarity_score  
2   ardara real estate developer owned public inve...          0.236493  
6   kayanee public investment fund pif company aim...          0.190991  
74  incorporated accordance council ministers mand...          0.107194  
