In [157]:
import pandas as pd

# Chatbot-style answers about English-language support.
# All entries share one category and one landing page, so only the section
# title, the body text and the tag list are spelled out per entry.
data = [
    {
        "Section": section_title,
        "Content": body_text,
        "Tags": tag_list,
        "Category": "Language Services",
        "URL": "https://www.dundee.ac.uk/subjects/english-international-students",
    }
    for section_title, body_text, tag_list in (
        (
            "Improving Academic English",
            "Want to get more comfortable with academic English? Our classes are specially tailored to boost your language and communication skills for the academic challenges ahead. It's all about giving you the support you need to thrive in your English-language studies.",
            ["English improvement", "academic support", "language skills"],
        ),
        (
            "Accessing Language Support",
            "Dive into the world of English language support with our free EIS (English for International Students) classes. Scan the QR code or visit our website to start your journey to language mastery!",
            ["EIS", "free classes", "English language support"],
        ),
        (
            "Contact for English Support",
            "Questions about our English language classes? We're just an email away. Reach out to eis@dundee.ac.uk and let's chat about how we can help you with your English.",
            ["contact", "EIS", "language help"],
        ),
    )
]

# One row per record; columns follow the key order of the dictionaries.
df = pd.DataFrame(data)

# Destination file (relative to the working directory).
csv_file_path = 'english.csv'

# Write the table without the row index. Each list in "Tags" is stored as
# its Python repr text, e.g. "['EIS', 'free classes', ...]".
df.to_csv(csv_file_path, index=False)

print(f"CSV file saved as {csv_file_path}")


CSV file saved as english.csv


In [2]:
from elasticsearch import Elasticsearch

# Connect to the Elasticsearch server
es = Elasticsearch("http://localhost:9200")

# Only ask for cluster details once ping() has confirmed the node answers.
# Previously info() was called unconditionally, so an unreachable server
# printed "Could not connect to Elasticsearch" and then aborted the cell
# with a traceback from info().
if es.ping():
    print("Connected to Elasticsearch")

    # Get information about the cluster
    info = es.info()
    print("Elasticsearch cluster info:", info)
else:
    print("Could not connect to Elasticsearch")


Connected to Elasticsearch
Elasticsearch cluster info: {'name': 'DESKTOP-18UQJ9N', 'cluster_name': 'elasticsearch', 'cluster_uuid': '2VIucQwzRL6EX0wkc-pa0g', 'version': {'number': '7.17.14', 'build_flavor': 'default', 'build_type': 'zip', 'build_hash': '774e3bfa4d52e2834e4d9d8d669d77e4e5c1017f', 'build_date': '2023-10-05T22:17:33.780167078Z', 'build_snapshot': False, 'lucene_version': '8.11.1', 'minimum_wire_compatibility_version': '6.8.0', 'minimum_index_compatibility_version': '6.0.0-beta1'}, 'tagline': 'You Know, for Search'}




In [36]:
from elasticsearch import Elasticsearch


es = Elasticsearch("http://localhost:9200")


# Index definition: shard/replica counts plus explicit field mappings.
# "text" fields are analysed for full-text search, "keyword" fields are
# stored as exact values, and "embedding" holds the document vector.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 1
    },
    "mappings": {
        "properties": {
            "Section": {
                "type": "text"
            },
            "Content": {
                "type": "text"
            },
            "Tags": {
                "type": "text"
            },
            "Category": {
                "type": "keyword"
            },
            "URL": {
                "type": "keyword"
            },
            "embedding": {
                "type": "dense_vector",
                # Must equal the length of every vector stored in this field.
                "dims": 384
            }
        }
    }
}

# Name of the index
index_name = "university-info"

# Creating an index that already exists is rejected by Elasticsearch, which
# made this cell fail on every re-run. Check first (same pattern as the
# delete cell) so the cell is safe to execute repeatedly.
if es.indices.exists(index=index_name):
    print(f"Index {index_name} already exists; skipping creation.")
else:
    response = es.indices.create(index=index_name, body=index_settings)
    print(response)


{'acknowledged': True, 'shards_acknowledged': True, 'index': 'university-info'}


In [6]:
from elasticsearch import Elasticsearch
import pandas as pd
from sentence_transformers import SentenceTransformer

In [55]:
# Elasticsearch client bound to the local HTTP endpoint on port 9200;
# the indexing and search cells use this notebook-level `es`.
es = Elasticsearch("http://localhost:9200")

In [56]:
# Sentence-embedding model used to encode text into vectors. The index
# mapping in this notebook declares the `embedding` field with dims=384,
# so the vectors this model produces are expected to have that length.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [158]:
# Reload the records from the CSV file. Note that the "Tags" column comes
# back as plain text (the repr of each list), not as Python lists.
df = pd.read_csv(r'english.csv')

In [159]:
# Display the reloaded frame to inspect how each column survived the CSV round trip.
df

Unnamed: 0,Section,Content,Tags,Category,URL
0,Improving Academic English,Want to get more comfortable with academic Eng...,"['English improvement', 'academic support', 'l...",Language Services,https://www.dundee.ac.uk/subjects/english-inte...
1,Accessing Language Support,Dive into the world of English language suppor...,"['EIS', 'free classes', 'English language supp...",Language Services,https://www.dundee.ac.uk/subjects/english-inte...
2,Contact for English Support,Questions about our English language classes? ...,"['contact', 'EIS', 'language help']",Language Services,https://www.dundee.ac.uk/subjects/english-inte...


In [77]:
def text_to_embedding(text):
    """Encode `text` with the notebook-level `model` and return the vector as a plain list.

    The encoder's result is converted with `.tolist()` so it can be placed
    directly inside a document dictionary.
    """
    vector = model.encode(text)
    return vector.tolist()

In [78]:
def index_data(doc, index_name="university-info"):
    """Index one document through the notebook-level `es` client.

    Parameters
    ----------
    doc : dict
        Document body to store.
    index_name : str, optional
        Target index. Defaults to "university-info", the index that was
        previously hard-coded here, so existing `index_data(doc)` calls
        behave exactly as before.

    Returns
    -------
    The response returned by `es.index` (callers read its '_id').
    """
    return es.index(index=index_name, body=doc)

In [160]:
import ast


def _parse_tags(raw_tags):
    """Rebuild a list of tag strings from a value read out of the 'Tags' CSV column.

    The CSV written by this notebook stores each tag list as its Python repr
    (e.g. "['EIS', 'free classes']"). Splitting that text on ', ' leaves
    brackets and quotes glued to the tags, so the literal is parsed instead.
    Plain comma-separated text ("EIS, free classes") is still accepted, and a
    missing cell (read back as a non-string such as NaN) yields an empty list.
    """
    if not isinstance(raw_tags, str):
        return []

    text = raw_tags.strip()
    if text.startswith('[') and text.endswith(']'):
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError):
            parsed = None  # not a valid literal after all; fall back to splitting
        if isinstance(parsed, (list, tuple)):
            return [str(tag) for tag in parsed]

    return [tag.strip() for tag in text.split(',') if tag.strip()]


# Build one Elasticsearch document per CSV row and index it.
for index, row in df.iterrows():
    doc = {
        "Section": row['Section'],
        "Content": row['Content'],
        "Tags": _parse_tags(row['Tags']),  # real list of tags, without repr punctuation
        "Category": row['Category'],
        "URL": row['URL'],  # Adding the URL field
        # The vector is computed from the 'Content' text only
        "embedding": text_to_embedding(row['Content'])
    }
    # index_data stores the document and returns the Elasticsearch response
    response = index_data(doc)
    print(f"Document indexed, ID: {response['_id']}")




Document indexed, ID: YmW2o44BAPln75ubcjjJ
Document indexed, ID: Y2W2o44BAPln75ubdThf
Document indexed, ID: ZGW2o44BAPln75ubeDg2


In [None]:
# Delete the index (run this only to reset it; the search cells below need the index to exist)

In [34]:
from elasticsearch import Elasticsearch

# Open a client against the local Elasticsearch endpoint
es = Elasticsearch("http://localhost:9200")

# Index that should be removed
index_name = "university-info"

# Report the "nothing to delete" case first; otherwise drop the index.
if not es.indices.exists(index=index_name):
    print(f"Index {index_name} does not exist.")
else:
    response = es.indices.delete(index=index_name)
    print(f"Index {index_name} deleted successfully.")


Index university-info deleted successfully.


In [7]:
# Same model name as the one used when the documents were embedded; the
# search function below encodes queries with this notebook-level `model`.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [8]:
def get_similar_content(query, index_name="university-info", top_n=3):
    """
    Return up to `top_n` documents ranked by similarity to `query`.

    The query is encoded with the notebook-level `model`, and every document
    in `index_name` is scored by the cosine similarity between that vector
    and its 'embedding' field, shifted by +1.0.

    Returns a list of (Section, Content, URL, score) tuples in the order
    Elasticsearch returned the hits.
    """
    query_vector = model.encode(query).tolist()

    # Full request body: score all documents with the script, fetch only the
    # three fields that are returned to the caller, and cap the hit count.
    search_body = {
        "query": {
            "script_score": {
                "query": {"match_all": {}},
                "script": {
                    "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                    "params": {"query_vector": query_vector},
                },
            }
        },
        "_source": ["Section", "Content", "URL"],
        "size": top_n,
    }

    hits = es.search(index=index_name, body=search_body)['hits']['hits']

    # Flatten each hit into a tuple
    matches = []
    for hit in hits:
        fields = hit['_source']
        matches.append((fields['Section'], fields['Content'], fields['URL'], hit['_score']))
    return matches


In [9]:
import time

# Example question used to exercise the similarity search.
user_query = "I'm an international student; how do I enroll in English language classes?"

# Take a timestamp immediately before and after the call, so the measured
# interval covers everything get_similar_content does for this query.
start_time = time.time()
similar_contents = get_similar_content(user_query)
end_time = time.time()

# Elapsed time in seconds
elapsed_time = end_time - start_time

print(f"Time to find similar content: {elapsed_time:.4f} seconds")
# Raw list of (Section, Content, URL, score) tuples
print(similar_contents)

Time to find similar content: 0.5200 seconds
[('Improving Academic English', "Want to get more comfortable with academic English? Our classes are specially tailored to boost your language and communication skills for the academic challenges ahead. It's all about giving you the support you need to thrive in your English-language studies.", 'https://www.dundee.ac.uk/subjects/english-international-students', 1.6384487), ('Accessing Language Support', 'Dive into the world of English language support with our free EIS (English for International Students) classes. Scan the QR code or visit our website to start your journey to language mastery!', 'https://www.dundee.ac.uk/subjects/english-international-students', 1.5807527), ('Contact for English Support', "Questions about our English language classes? We're just an email away. Reach out to eis@dundee.ac.uk and let's chat about how we can help you with your English.", 'https://www.dundee.ac.uk/subjects/english-international-students', 1.45523



In [166]:

# Print every result as a labelled, blank-line-separated block.
# Each entry is a (section, content, url, score) tuple.
for result in similar_contents:
    section, content, url, score = result
    print(f"Section: {section}\nContent: {content}\nURL: {url}\nScore: {score}\n")

Section: Improving Academic English
Content: Want to get more comfortable with academic English? Our classes are specially tailored to boost your language and communication skills for the academic challenges ahead. It's all about giving you the support you need to thrive in your English-language studies.
URL: https://www.dundee.ac.uk/subjects/english-international-students
Score: 1.6384487

Section: Accessing Language Support
Content: Dive into the world of English language support with our free EIS (English for International Students) classes. Scan the QR code or visit our website to start your journey to language mastery!
URL: https://www.dundee.ac.uk/subjects/english-international-students
Score: 1.5807527

Section: Contact for English Support
Content: Questions about our English language classes? We're just an email away. Reach out to eis@dundee.ac.uk and let's chat about how we can help you with your English.
URL: https://www.dundee.ac.uk/subjects/english-international-students
Sc

In [12]:
from elasticsearch import Elasticsearch

# Connect to your Elasticsearch instance
es = Elasticsearch("http://localhost:9200")

# Get and print the mappings for the university-info index.
# Useful for checking the exact field names and types the index currently
# holds before writing queries or `_source` filters against it.
response = es.indices.get_mapping(index="university-info")
print(response)


{'university-info': {'mappings': {'properties': {'category': {'type': 'keyword'}, 'content': {'type': 'text'}, 'embedding': {'type': 'dense_vector', 'dims': 384}, 'section': {'type': 'text'}, 'tags': {'type': 'keyword'}}}}}


In [18]:
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer

# Initialize the Elasticsearch client
# (rebinds the notebook-level `es`; same local endpoint as the other cells)
es = Elasticsearch("http://localhost:9200")

# Function for testing the modified script_score query
def get_similar_content_with_fixed_vector(query, index_name="university-info", top_n=3, vector_dimension=384):
    """
    Diagnostic variant of the similarity search that scores against a fixed vector.

    Instead of embedding the query, every document is scored against a vector
    of `vector_dimension` ones, which exercises the script_score query without
    involving the embedding model.

    Parameters
    ----------
    query : str
        Accepted for signature parity with the real search; it is not used.
    index_name : str
        Index to search.
    top_n : int
        Maximum number of hits to return.
    vector_dimension : int
        Length of the fixed query vector.

    Returns
    -------
    list of (section, content, score) tuples. A field missing from a hit is
    replaced by 'No section' / 'No content'.
    """
    # Fixed query vector composed of 1s
    fixed_query_vector = [1] * vector_dimension

    # Construct the Elasticsearch query using the fixed vector
    script_query = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                "params": {
                    "query_vector": fixed_query_vector
                }
            }
        }
    }

    # Perform the search with the modified script.
    # Field names are spelled as in the index mapping and the indexed
    # documents ("Section", "Content"); the lowercase spellings used before
    # matched no stored field, so every hit fell back to the placeholders.
    response = es.search(
        index=index_name,
        body={
            "query": script_query,
            "_source": ["Section", "Content"],
            "size": top_n
        }
    )

    # Extract and return results
    results = [
        (
            hit['_source'].get('Section', 'No section'),
            hit['_source'].get('Content', 'No content'),
            hit['_score'],
        )
        for hit in response['hits']['hits']
    ]
    return results

# Example usage. The query text is passed only to satisfy the signature:
# the function scores against its fixed vector and never reads `query`.
user_query = "How can I get financial aid?"
similar_contents = get_similar_content_with_fixed_vector(user_query)

# Print each (section, content, score) tuple as a labelled block
for result in similar_contents:
    section, content, score = result
    print(f"Section: {section}\nContent: {content}\nScore: {score}\n")


RequestError: RequestError(400, 'search_phase_execution_exception', 'compile error')

In [100]:
# Replace the URL stored on the documents matched by a query

In [106]:
from elasticsearch import Elasticsearch

# This cell relies on `es` and `index_name` as defined by earlier cells.

# Replacement URL written to every matching document (same for all hits,
# so it is defined once instead of inside the loop).
new_url = 'https://www.dundee.ac.uk/student-funding'

# Define the query.
# match_phrase requires the words to occur together and in order. A plain
# `match` on an analysed text field ORs the individual terms, so it also
# selected unrelated sections that merely contain "Support" or "Services"
# and overwrote their URLs.
query = {
    "query": {
        "match_phrase": {
            "Section": "Debt Support Services"
        }
    }
}

# Search for documents
# NOTE(review): no "size" is given, so only the first page of hits is
# returned and updated; confirm that covers all matching documents.
search_results = es.search(index=index_name, body=query)

# Iterate through search results and patch only the URL field
for hit in search_results['hits']['hits']:
    doc_id = hit['_id']
    es.update(index=index_name, id=doc_id, body={"doc": {"URL": new_url}})
    print(f"URL in document {doc_id} updated.")

print("All matching documents updated.")


URL in document UmUlo44BAPln75ubdzgy updated.
URL in document U2Ulo44BAPln75ubezg3 updated.
URL in document VWU1o44BAPln75ubTTjv updated.
URL in document V2U1o44BAPln75ubVTgb updated.
All matching documents updated.
