In [15]:
from elasticsearch import Elasticsearch

# Client for the Elasticsearch node at http://localhost:9200
es = Elasticsearch("http://localhost:9200")

# ping() reports reachability as a boolean, whereas info() raises when the
# node cannot be reached -- so only ask for cluster info after a successful ping.
if es.ping():
    print("Connected to Elasticsearch")

    # Get information about the cluster
    info = es.info()
    print("Elasticsearch cluster info:", info)
else:
    print("Could not connect to Elasticsearch")


Connected to Elasticsearch
Elasticsearch cluster info: {'name': 'DESKTOP-18UQJ9N', 'cluster_name': 'elasticsearch', 'cluster_uuid': '2VIucQwzRL6EX0wkc-pa0g', 'version': {'number': '7.17.14', 'build_flavor': 'default', 'build_type': 'zip', 'build_hash': '774e3bfa4d52e2834e4d9d8d669d77e4e5c1017f', 'build_date': '2023-10-05T22:17:33.780167078Z', 'build_snapshot': False, 'lucene_version': '8.11.1', 'minimum_wire_compatibility_version': '6.8.0', 'minimum_index_compatibility_version': '6.0.0-beta1'}, 'tagline': 'You Know, for Search'}


In [16]:
from elasticsearch import Elasticsearch

# Connect to your Elasticsearch instance
es = Elasticsearch("http://localhost:9200")

# Settings and mappings for the index.
# - `section` / `content` are analysed full-text fields.
# - `tags` / `category` are exact-match keyword fields.
# - `embedding` holds the sentence embedding of `content`; `dims` must equal
#   the length of the vectors written by the indexing cell.
index_settings = {
    "settings": {
        "number_of_shards": 1,  # Adjust based on your needs
        "number_of_replicas": 1  # Adjust based on your needs
    },
    "mappings": {
        "properties": {
            "section": {
                "type": "text"
            },
            "content": {
                "type": "text"
            },
            "tags": {
                "type": "keyword"
            },
            "category": {
                "type": "keyword"
            },
            "embedding": {
                "type": "dense_vector",
                "dims": 384  # Adjust this to the size of your embeddings
            }
        }
    }
}

# Name of the index
index_name = "university-info"

# Create the index only when it is missing: creating an index that already
# exists is rejected by Elasticsearch, which would make this cell fail on re-run.
if es.indices.exists(index=index_name):
    print(f"Index {index_name} already exists; skipping creation.")
else:
    response = es.indices.create(index=index_name, body=index_settings)
    print(response)


{'acknowledged': True, 'shards_acknowledged': True, 'index': 'university-info'}


In [17]:
from elasticsearch import Elasticsearch
import pandas as pd
from sentence_transformers import SentenceTransformer

In [18]:
# Client for the Elasticsearch node at http://localhost:9200. The indexing and
# search helpers below read this module-level `es`.
es = Elasticsearch("http://localhost:9200")

In [19]:
# Sentence-embedding model used for both the indexed documents and the search
# queries. Its output length has to match `dims` (384) in the index mapping.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [20]:
# Rows to index. The indexing loop reads the columns `section`, `content`,
# `tags` and `category` from this frame.
df = pd.read_csv('career_service_chatbot_queries.csv')

In [21]:
def text_to_embedding(text):
    """Encode ``text`` with the module-level ``model`` and return the result of ``.tolist()``.

    The list form is what the indexing cell stores in the ``embedding`` field
    of each document.
    """
    vector = model.encode(text)
    return vector.tolist()

In [22]:
def index_data(doc, index_name="university-info"):
    """Index a single document through the module-level ``es`` client.

    Args:
        doc: Document body passed to Elasticsearch as-is.
        index_name: Target index. Defaults to "university-info", so existing
            single-argument calls behave as before.

    Returns:
        The response returned by ``es.index`` (callers read ``_id`` from it).
    """
    return es.index(index=index_name, body=doc)

In [24]:
# Loop through the dataframe and index each row
for index, row in df.iterrows():
    # An empty `tags` cell is not a string (pandas reads it as NaN), so calling
    # .split() on it would fail; treat any non-string value as "no tags".
    # Splitting on "," and stripping accepts both "a, b" and "a,b".
    raw_tags = row['tags']
    if isinstance(raw_tags, str):
        tags = [tag.strip() for tag in raw_tags.split(',') if tag.strip()]
    else:
        tags = []

    doc = {
        "section": row['section'],
        "content": row['content'],
        "tags": tags,
        "category": row['category'],
        "embedding": text_to_embedding(row['content'])  # Embedding of the `content` text
    }
    response = index_data(doc)
    print(f"Document indexed, ID: {response['_id']}")



Document indexed, ID: YctLe44B5ZWJZ-lDElnZ
Document indexed, ID: YstLe44B5ZWJZ-lDFlkR
Document indexed, ID: Y8tLe44B5ZWJZ-lDGVnw
Document indexed, ID: ZMtLe44B5ZWJZ-lDHFnY
Document indexed, ID: ZctLe44B5ZWJZ-lDH1lf
Document indexed, ID: ZstLe44B5ZWJZ-lDIVnz
Document indexed, ID: Z8tLe44B5ZWJZ-lDJVkl
Document indexed, ID: aMtLe44B5ZWJZ-lDJ1lj
Document indexed, ID: actLe44B5ZWJZ-lDKllQ
Document indexed, ID: astLe44B5ZWJZ-lDLVm1


In [None]:
# Delete the index (destructive: removes the index and all documents in it)

In [14]:
from elasticsearch import Elasticsearch

# Client for the Elasticsearch node at http://localhost:9200
es = Elasticsearch("http://localhost:9200")

# Index to remove. The search cells query this same index name by default.
index_name = "university-info"

# Issue the delete only when the index is present; otherwise just report it.
index_missing = not es.indices.exists(index=index_name)
if index_missing:
    print(f"Index {index_name} does not exist.")
else:
    response = es.indices.delete(index=index_name)
    print(f"Index {index_name} deleted successfully.")


Index university-info deleted successfully.


In [11]:
def get_similar_content(query, index_name="university-info", top_n=3):
    """
    Retrieve the ``top_n`` documents whose stored embedding best matches ``query``.

    The query text is embedded with the module-level ``model``. Every document
    in ``index_name`` is then scored by the module-level ``es`` client with
    ``cosineSimilarity(query_vector, 'embedding') + 1.0``. Adding 1.0 shifts
    cosine similarity from [-1, 1] to [0, 2], so scores are never negative.

    Args:
        query: Text to search for.
        index_name: Index to search. Defaults to "university-info".
        top_n: Maximum number of hits to request.

    Returns:
        A list of ``(section, content, score)`` tuples in the order
        Elasticsearch returned the hits. A hit that lacks a field yields
        "No section" / "No content" for it instead of raising KeyError, matching
        ``get_similar_content_with_fixed_vector``.
    """
    # Convert the query to an embedding (plain list so it can go in the request body)
    query_embedding = model.encode(query).tolist()

    search_body = {
        "query": {
            "script_score": {
                # match_all: score every document in the index
                "query": {"match_all": {}},
                "script": {
                    "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                    "params": {"query_vector": query_embedding},
                },
            }
        },
        "_source": ["section", "content"],  # Only fetch the fields that are returned
        "size": top_n,
    }

    response = es.search(index=index_name, body=search_body)

    results = []
    for hit in response['hits']['hits']:
        source = hit['_source']
        results.append(
            (
                source.get('section', 'No section'),
                source.get('content', 'No content'),
                hit['_score'],
            )
        )
    return results

In [12]:
# Example usage: embed one question and fetch its closest documents, using the
# defaults of get_similar_content (index "university-info", top_n=3).
user_query = "How can I get financial aid?"
similar_contents = get_similar_content(user_query)

In [13]:

# Print one Section / Content / Score block per returned hit
for hit in similar_contents:
    section, content, score = hit
    print(f"Section: {section}\nContent: {content}\nScore: {score}\n")

Section: Financial Aid
Content: We offer various financial aid options including scholarships, grants, and loans to eligible students.
Score: 1.6357753

Section: Student Services
Content: Our student services include counseling, career advice, and academic support.
Score: 1.3327768

Section: Housing Services
Content: We provide assistance with finding on-campus and off-campus housing options.
Score: 1.207896



In [12]:
from elasticsearch import Elasticsearch

# Client for the Elasticsearch node at http://localhost:9200
es = Elasticsearch("http://localhost:9200")

# Fetch and print the mapping of the university-info index. This is a quick way
# to confirm that `embedding` is mapped as dense_vector with the expected dims.
response = es.indices.get_mapping(index="university-info")
print(response)


{'university-info': {'mappings': {'properties': {'category': {'type': 'keyword'}, 'content': {'type': 'text'}, 'embedding': {'type': 'dense_vector', 'dims': 384}, 'section': {'type': 'text'}, 'tags': {'type': 'keyword'}}}}}


In [18]:
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer

# Initialize the Elasticsearch client (node at http://localhost:9200).
# The function below reads this module-level `es`.
es = Elasticsearch("http://localhost:9200")

# Diagnostic variant of the script_score search that uses a constant query vector
def get_similar_content_with_fixed_vector(query, index_name="university-info", top_n=3, vector_dimension=384):
    """
    Run the script_score search with a fixed all-ones query vector.

    This takes the embedding model out of the picture, so a failure here points
    at the index or the script rather than at the query embedding.

    Args:
        query: Not used. Accepted only so the call shape matches
            ``get_similar_content``.
        index_name: Index to search. Defaults to "university-info".
        top_n: Maximum number of hits to request.
        vector_dimension: Length of the fixed query vector. It has to equal the
            ``dims`` of the ``embedding`` field in the index mapping.

    Returns:
        A list of ``(section, content, score)`` tuples. Missing fields are
        reported as "No section" / "No content".

    Raises:
        Whatever ``es.search`` raises. The exception is re-raised unchanged
        after its ``info`` payload (if it has one) has been printed.
    """
    # Fixed query vector composed of 1s
    fixed_query_vector = [1] * vector_dimension

    search_body = {
        "query": {
            "script_score": {
                "query": {"match_all": {}},
                "script": {
                    "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                    "params": {"query_vector": fixed_query_vector},
                },
            }
        },
        "_source": ["section", "content"],  # Only fetch the fields that are returned
        "size": top_n,
    }

    try:
        response = es.search(index=index_name, body=search_body)
    except Exception as err:
        # The exception message alone can be as terse as 'compile error'. The
        # Elasticsearch client attaches the server's full error body to `.info`,
        # which carries the underlying reason, so show it before re-raising.
        details = getattr(err, "info", None)
        if details is not None:
            print("Elasticsearch error details:", details)
        raise

    # Extract and return results
    results = []
    for hit in response['hits']['hits']:
        source = hit['_source']
        results.append(
            (
                source.get('section', 'No section'),
                source.get('content', 'No content'),
                hit['_score'],
            )
        )
    return results

# Example run: the query text is passed along, but the search itself uses the
# fixed vector built inside get_similar_content_with_fixed_vector.
user_query = "How can I get financial aid?"
similar_contents = get_similar_content_with_fixed_vector(user_query)

# Print one Section / Content / Score block per returned hit
for hit in similar_contents:
    section, content, score = hit
    print(f"Section: {section}\nContent: {content}\nScore: {score}\n")


RequestError: RequestError(400, 'search_phase_execution_exception', 'compile error')