Preprocess

In [5]:
import re
import pandas as pd

def preprocess(text):
    """
    Clean and preprocess text.
    """
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text

def load_data_from_csv(file_path):
    """Load and preprocess data from a CSV file."""
    df = pd.read_csv(file_path)
    return [preprocess(str(line)) for line in df["Data"]]

if __name__ == "__main__":
    # Example usage
    data = load_data_from_csv("../data/data.csv")
    print(*data[:5], sep='\n')


4 chapter 1 introduction 11 what operating systems do we begin our discussion by looking at the operating systems role in the overallcomputersystemacomputersystemcanbedividedroughlyintofour components the hardware theoperating system theapplication programs and a userfigure11
the hardware the central processing unit  cpu the memory and the inputoutput  io devicesprovides the basic computing resources for the system
the application programs such as word processors spreadsheets compilers and web browsersdefine the ways in which these resources are used to solve users computing problems
the operating system controls the hardwareandcoordinatesitsuseamongthevariousapplicationprogramsfor thevarious users
we can also view a computer system as consisting of hardware software and data


Embed

In [10]:
from sentence_transformers import SentenceTransformer
import numpy as np

def generate_embeddings(sentences, model_name='all-MiniLM-L6-v2'):
    """
    Generate embeddings for a list of sentences.
    """
    model = SentenceTransformer(model_name, device='cpu')
    embeddings = model.encode(sentences)
    return embeddings

if __name__ == "__main__":

    # Load preprocessed data
    data = load_data_from_csv("../data/data.csv")
    embeddings = generate_embeddings(data)

    # Save embeddings to a file
    np.save("../embeddings/embeddings.npy", embeddings)
    print("Embeddings saved.")


Embeddings saved.


FAISS Index

In [11]:
import faiss
import numpy as np

def build_faiss_index(embeddings):
    """
    Build a FAISS index from embeddings and save it.
    """
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)
    faiss.write_index(index, "../embeddings/faiss_index")
    return index

def search_faiss(query_embedding, index, top_k=5):
    """
    Search the FAISS index for the top-k nearest neighbors.
    """
    distances, indices = index.search(query_embedding, top_k)
    return indices[0], distances[0]

if __name__ == "__main__":
    # Load embeddings
    embeddings = np.load("../embeddings/embeddings.npy")

    # Build FAISS index
    index = build_faiss_index(embeddings)
    print("FAISS index built and saved.")


FAISS index built and saved.


A* Search

In [12]:
from sklearn.metrics.pairwise import cosine_similarity

def g_score(query_embedding, node_embedding):
    """
    Calculate the similarity (g-score) between query and node.
    """
    return cosine_similarity([query_embedding], [node_embedding])[0][0]

def h_score(query_embedding, node_embedding):
    """
    Calculate heuristic score (h-score) as inverse similarity.
    """
    return 1 - g_score(query_embedding, node_embedding)

def a_star_search(query_embedding, embeddings, sentences, top_k=3):
    """
    Perform A* search for top-k relevant results.
    """
    import heapq

    open_set = []
    results = []

    for idx, node_embedding in enumerate(embeddings):
        g = g_score(query_embedding, node_embedding)
        h = h_score(query_embedding, node_embedding)
        f = g + h
        heapq.heappush(open_set, (-f, idx))

    for _ in range(top_k):
        if not open_set:
            break
        _, idx = heapq.heappop(open_set)
        results.append(sentences[idx])

    return results


Main Script

In [19]:
import numpy as np
from sentence_transformers import SentenceTransformer

def retreive(query):
    # Load sentences and embeddings
    sentences = load_data_from_csv("../data/data.csv")
    embeddings = np.load("../embeddings/embeddings.npy")

    # Load FAISS index
    index = faiss.read_index("../embeddings/faiss_index")

    # Query for search
    model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
    query_embedding = model.encode([query])

    # FAISS Search
    indices, _ = search_faiss(query_embedding, index, top_k=10)
    candidate_embeddings = embeddings[indices]
    candidate_sentences = [sentences[i] for i in indices]

    # A* Search
    results = a_star_search(query_embedding[0], candidate_embeddings, candidate_sentences, top_k=3)
    
    return results
    # Output results
    

if __name__ == "__main__":
    
    query = "What is the function of an operating system?"
    print("\nQuery:", query)
    print("\nTop Results:")
    for idx, result in enumerate(retreive( query )):
        print(f"{idx + 1}. {result}")
    
    



Query: What is the function of an operating system?

Top Results:
1. an operating system is a control program
2. 113 deﬁning operating systems by now you can probably see that the term operating system covers many roles and functions
3. the operating system provides the means for proper use of these resources in the operation of the computer system


LLM

In [30]:
from transformers import pipeline

def generate_content_with_llm(content):
    """
    Generate a paragraph from retrieved content using Hugging Face GPT-2.
    """
    generator = pipeline("text-generation", model="EleutherAI/gpt-neo-125M")
    prompt = (
        "Given the following sentences, construct a cohesive and well-structured paragraph:\n\n"
        f"{content}\n\n"
        "Construct the paragraph:"
    )
    response = generator(prompt, max_length=200, num_return_sequences=1, temperature=0.7)
    return response[0]["generated_text"]


In [31]:
generate_content_with_llm("The hardware includes the CPU, memory, and I/O devices. The operating system coordinates their use.")

AttributeError: 'FloatProgress' object has no attribute 'style'

In [34]:

def main():
    query = input("Enter a Query : ")
    data = retreive(query)
    
    data = ". ".join(data)
    
    print(data)
    
#     print( generate_content_with_llm(data) )
    

if __name__ == '__main__':
    main()

Enter a Query : Os
an operating system is a control program. finally we describe how operating systems arecreatedand how a computerstartsits operatingsystem. 4 chapter 1 introduction 11 what operating systems do we begin our discussion by looking at the operating systems role in the overallcomputersystemacomputersystemcanbedividedroughlyintofour components the hardware theoperating system theapplication programs and a userfigure11
