In [6]:
import torch
import pandas as pd
import re
import numpy as np
import pickle

from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [11]:
pip install fastText

Collecting fastText
  Using cached fasttext-0.9.2.tar.gz (68 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Using legacy 'setup.py install' for fastText, since package 'wheel' is not installed.
Installing collected packages: fastText
  Running setup.py install for fastText: started
  Running setup.py install for fastText: finished with status 'error'
Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  Running setup.py install for fastText did not run successfully.
  exit code: 1
  
  [20 lines of output]
  running install
  running build
  running build_py
  creating build
  creating build\lib.win-amd64-3.10
  creating build\lib.win-amd64-3.10\fasttext
  copying python\fasttext_module\fasttext\FastText.py -> build\lib.win-amd64-3.10\fasttext
  copying python\fasttext_module\fasttext\__init__.py -> build\lib.win-amd64-3.10\fasttext
  creating build\lib.win-amd64-3.10\fasttext\util
  copying python\fasttext_module\fasttext\util\util.py -> build\lib.win-amd64-3.10\fasttext\util
  copying python\fasttext_module\fasttext\util\__init__.py -> build\lib.win-amd64-3.10\fasttext\util
  creating build\lib.win-amd64-3.10\fasttext\tests
  copying python\fasttext_module\fasttext\tests\test_configurations.py -> build\lib.win-amd64-3.10\fasttext\tests
  copying python\fasttext_module\fasttext\tests\test_script.py -> build\lib.win-amd64-3.10\fasttext\tests


In [2]:
# Report whether PyTorch can see a CUDA device in this kernel.
torch.cuda.is_available()

True

In [3]:
# Load the preprocessed course table from the project's Data directory.
# NOTE(review): `df` is not referenced by any other visible cell — confirm it is still needed.
df = pd.read_csv("Data/preprocessed_courses.csv")

In [12]:
import torch
from transformers import AutoTokenizer
try:
    import fasttext  # the PyPI `fasttext` package installs a lowercase module
except ImportError:
    import fastText as fasttext  # older source builds exposed the module as `fastText`

# Define functions for preprocessing, embedding generation, and semantic search

def preprocess_text(text):
    """Normalise a string and break it into whitespace-delimited tokens.

    Placeholder pipeline — swap in project-specific preprocessing as needed.

    Args:
        text (str): Raw input text.

    Returns:
        list: Lowercased tokens in their original order; empty when the text
        contains no non-whitespace characters.
    """
    # Case-fold, trim both ends, then split on runs of whitespace.
    normalized = text.lower().strip()
    return normalized.split()

def generate_fasttext_embeddings(text_data, model_path):
    """
    Generates one FastText sentence embedding per input text.

    Args:
        text_data (list): A list of strings, each representing a piece of text.
        model_path (str): The path to a pre-trained FastText binary model file
            that `fasttext.load_model` can read.

    Returns:
        list: One sentence vector per entry of `text_data`, in the same order.
        Texts that yield no tokens map to an all-zero vector of the model's
        dimension.
    """

    model = fasttext.load_model(model_path)

    # fastText has no dedicated padding token: looking up "<PAD>" would return
    # a vector composed from its character n-grams, not a neutral value.
    # An all-zero vector is used for empty texts instead.
    empty_vector = np.zeros(model.get_dimension(), dtype=np.float32)

    embeddings = []
    for text in text_data:
        tokens = preprocess_text(text)
        if tokens:
            # get_sentence_vector takes a single string, not a token list.
            # Re-joining the tokens also removes embedded newlines, which
            # fastText rejects.
            embeddings.append(model.get_sentence_vector(" ".join(tokens)))
        else:
            embeddings.append(empty_vector.copy())
    return embeddings

def perform_semantic_search(query, text_data, sbert_model_name, fasttext_embeddings=None,
                            query_fasttext_embedding=None, top_k=5):
    """
    Ranks documents against a query by SBERT cosine similarity, optionally
    fused with FastText cosine similarity.

    SBERT and FastText vectors live in unrelated spaces (and usually have
    different dimensions), so they are never added element-wise. Each model
    produces its own query-to-document cosine similarity and the two scores
    are averaged.

    Args:
        query (str): The search query.
        text_data (list): Documents (strings) to rank.
        sbert_model_name (str): Name or path of the SentenceTransformer model.
        fasttext_embeddings (sequence, optional): One FastText vector per
            document, in the same order as `text_data`. `None` or an empty
            sequence disables FastText scoring.
        query_fasttext_embedding (array-like, optional): FastText vector of
            the query, produced by the same FastText model as
            `fasttext_embeddings`. Without it, FastText cannot contribute and
            the ranking falls back to SBERT only (a warning is issued).
        top_k (int): Maximum number of results to return. Defaults to 5.

    Returns:
        list: `(document_index, similarity)` tuples, best match first, where
        `document_index` is an `int` position in `text_data` and `similarity`
        is a `float`.

    Raises:
        ValueError: If `fasttext_embeddings` is non-empty and its length does
            not match the number of documents.
    """
    import warnings

    documents = list(text_data)
    if not documents or top_k <= 0:
        return []

    device = "cuda" if torch.cuda.is_available() else "cpu"  # Use GPU if available
    model = SentenceTransformer(sbert_model_name, device=device)

    # encode() takes raw text and tokenizes internally; convert_to_tensor=True
    # returns torch tensors on the model's device.
    query_embedding = model.encode(query, convert_to_tensor=True)
    doc_embeddings = model.encode(documents, convert_to_tensor=True)
    similarities = torch.nn.functional.cosine_similarity(
        query_embedding.unsqueeze(0), doc_embeddings, dim=1
    )

    # `is not None` plus a length check, because truth-testing a numpy array raises.
    if fasttext_embeddings is not None and len(fasttext_embeddings) > 0:
        if len(fasttext_embeddings) != len(documents):
            raise ValueError(
                f"Expected {len(documents)} FastText embeddings (one per document), "
                f"got {len(fasttext_embeddings)}"
            )
        if query_fasttext_embedding is None:
            warnings.warn(
                "fasttext_embeddings were given without query_fasttext_embedding; "
                "ranking with SBERT similarity only",
                stacklevel=2,
            )
        else:
            ft_docs = torch.as_tensor(
                np.asarray(fasttext_embeddings),
                dtype=similarities.dtype,
                device=similarities.device,
            )
            ft_query = torch.as_tensor(
                np.asarray(query_fasttext_embedding),
                dtype=similarities.dtype,
                device=similarities.device,
            )
            ft_similarities = torch.nn.functional.cosine_similarity(
                ft_query.unsqueeze(0), ft_docs, dim=1
            )
            # Fuse at the score level: equal-weight mean of the two cosines.
            similarities = (similarities + ft_similarities) / 2

    # Never request more results than there are documents.
    k = min(int(top_k), len(documents))
    top_scores, top_indices = torch.topk(similarities, k)

    # Plain Python ints/floats so callers can index and format directly.
    return [(int(i), float(s)) for i, s in zip(top_indices.tolist(), top_scores.tolist())]

ModuleNotFoundError: No module named 'fastText'

In [None]:
# Example usage
import gzip
import shutil
from pathlib import Path

text_data = ["This is an example document about machine learning.",
             "This document discusses natural language processing techniques.",
             "Here's another document on artificial intelligence."]

# Choose pre-trained models based on your requirements (e.g., language, domain)
# fasttext.load_model reads the raw binary model, not the gzip archive it is
# distributed as, so unpack the archive once next to itself if that has not
# been done yet.
model_archive = Path("Model/cc.en.300.bin.gz")
model_file = model_archive.with_suffix("")  # -> Model/cc.en.300.bin
if not model_file.exists() and model_archive.exists():
    with gzip.open(model_archive, "rb") as packed, open(model_file, "wb") as unpacked:
        shutil.copyfileobj(packed, unpacked)
model_path = str(model_file)
sbert_model_name = "all-mpnet-base-v2"  # Example SBERT model name

# Generate FastText embeddings (optional)
fasttext_embeddings = generate_fasttext_embeddings(text_data, model_path)

query = "What are the latest advancements in natural language processing?"

# Perform semantic search with both SBERT and optionally FastText
results = perform_semantic_search(query, text_data, sbert_model_name, fasttext_embeddings)

# Report the number of hits actually returned (there may be fewer than 5 documents).
print(f"Top {len(results)} most similar documents to the query:")
for index, similarity_score in results:
    # int() keeps the label readable even if the index arrives as a 0-dim tensor.
    print(f"\t- Document {int(index) + 1} (Similarity score: {similarity_score:.4f})")
